## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42220 entries, 0 to 42219
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42219 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.5+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
# append special stopwords in articles to default and common stopwords in Gensim library
new_stopwords = STOPWORDS.union(set(['data', 'use', 'like', 'ability', 'let', 'example',
                'need', 'new', 'user', 'provide', 'one', 'used', 'need', 
                'see', 'make', 'follow', 'going', 'will', 'want', 'well', 'find', 
                'give', 'change', 'look', 'first', 'using',
                'know', 'model', 'science', 'think', 'looking', 'problem', 'column', 
                'vallue', 'understand', 'take', 'problem', 'information', 'scientist', 
                'might', 'add', 'now', 'many', 'might', 'column', 'value', 'create',
                'result', 'case', 'article', 'set', 'feature', 'function', 'learning']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the articles content
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [8]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [9]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## Common method to find top 10 dominant topics in the articles

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Title"] = cleaned_articles_df['title']
    dominant_topics_df["Text"] = cleaned_articles_df['cleaned_content']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [12]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.210*"time" + 0.184*"image" + 0.157*"network" + 0.152*"number" + 0.137*"training" + 0.136*"different" + 0.135*"code"'),
 (1,
  '-0.583*"image" + -0.349*"network" + -0.313*"layer" + -0.169*"neural" + -0.157*"input" + -0.146*"training" + 0.139*"time"'),
 (2,
  '0.297*"image" + -0.242*"variable" + -0.229*"distribution" + -0.198*"probability" + -0.163*"value" + 0.159*"project" + 0.159*"file"'),
 (3,
  '0.739*"word" + -0.235*"image" + 0.202*"vector" + 0.189*"text" + -0.170*"variable" + 0.140*"sentence" + 0.133*"language"'),
 (4,
  '0.298*"network" + -0.286*"code" + -0.246*"file" + -0.230*"image" + -0.225*"word" + -0.223*"python" + 0.158*"neural"'),
 (5,
  '0.490*"image" + -0.289*"network" + -0.235*"code" + -0.221*"layer" + 0.211*"word" + -0.202*"python" + -0.171*"file"'),
 (6,
  '-0.260*"model" + 0.259*"distribution" + -0.251*"feature" + -0.245*"machine" + 0.244*"probability" + 0.235*"state" + -0.215*"dataset"'),
 (7,
  '0.397*"state" + 0.314*"algorithm" + 0.304*"action" + 0.242*"a

<font color = "blue">
    Topic 0: Code for image detection and time series <br/>
    Topic 1: Image Recognition/Classification using neural network <br/>
    Topic 2: Probability Distribution <br/>
    Topic 3: NLP word vector <br/>
    Topic 4: Neural network using Python <br/>
    Topic 5: 
</font>

In [13]:
get_dominant_topics(articles_bow_lsamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
39097,3.0,193.9922,"word, image, vector, text, variable, sentence,...",What a CEO needs to know about Machine Learnin...,during my first project in mckinsey in 2011 i...
28350,3.0,192.6825,"word, image, vector, text, variable, sentence,...",Python Dictionary from Scratch!!!,dictionary in python comprises an unordered c...
25444,0.0,176.3965,"time, image, network, number, training, differ...",[Paper Summary] Deep Tree Learning for Zero-Sh...,typically one creates an algorithm or build n...
13770,0.0,171.3049,"time, image, network, number, training, differ...",Introduction to IBM Federated Learning: A Coll...,ibm research has just released ibm federated ...
16907,0.0,155.6723,"time, image, network, number, training, differ...",Fraud detection — Unsupervised Anomaly Detection,one of the greatest concerns of many business...
25147,0.0,154.7452,"time, image, network, number, training, differ...",Retrieving OpenStreetMap data in Python,if you want to retrieve geospatial data from ...
30101,3.0,153.1593,"word, image, vector, text, variable, sentence,...",Evasion attacks on Machine Learning (or “Adver...,machine learning is exciting however just lik...
35905,0.0,137.9255,"time, image, network, number, training, differ...",(PersonLab) Single-shot fully-convolutional ar...,picking up where we left from we are going to...
15365,7.0,136.631,"state, algorithm, action, agent, layer, reward...",Introducing GeneAl: a Genetic Algorithm Python...,genetic algorithms ga are an optimization and...
30755,0.0,131.3757,"time, image, network, number, training, differ...",Machines that learn by doing,in my midtwenties i learned to play tennis fo...


In [14]:
cleaned_articles_df['cleaned_content'][39097]

' during my first project in mckinsey in 2011 i served the ceo of a bank regarding his small business strategy i wanted to run a linear regression on the banks data but my boss told me dont do it they dont understand statistics we did not use machine learning but 7 years down the road i still believe we developed the right strategy artificial intelligence is the most generalpurpose technology of our time new products and processes are being developed thanks to better vision systems speech recognition technologies or recommendation engines based on machine learning in fact most recent advances in artificial intelligence have been achieved in the area of machine learning long before mckinsey in 2004 i started my career as a mobile software developer at that time i had to write precise instructions for every step of my code developing the voice recognition system of todays phones would have been tedious and errorprone back then it would have required literally hundreds of thousands of det

### Result using LSA model + TF-IDF

In [15]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [16]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.142*"image" + 0.116*"network" + 0.112*"layer" + 0.093*"training" + 0.089*"word" + 0.083*"dataset" + 0.082*"variable"'),
 (1,
  '-0.322*"layer" + -0.288*"image" + -0.236*"network" + -0.166*"neural" + 0.139*"business" + -0.125*"weight" + 0.125*"company"'),
 (2,
  '0.309*"image" + -0.227*"regression" + -0.198*"variable" + 0.173*"layer" + -0.160*"distribution" + -0.150*"tree" + -0.141*"probability"'),
 (3,
  '-0.227*"file" + -0.157*"panda" + 0.145*"network" + 0.143*"business" + -0.139*"dataframe" + -0.137*"python" + -0.123*"command"'),
 (4,
  '-0.533*"word" + -0.249*"sentence" + -0.237*"text" + -0.228*"vector" + 0.222*"image" + -0.179*"document" + -0.151*"sentiment"'),
 (5,
  '-0.390*"image" + 0.269*"agent" + 0.224*"reward" + -0.185*"cluster" + 0.176*"action" + 0.168*"state" + 0.163*"gradient"'),
 (6,
  '0.383*"cluster" + 0.326*"agent" + 0.273*"reward" + 0.206*"action" + 0.205*"image" + -0.179*"layer" + 0.177*"clustering"'),
 (7,
  '0.383*"tree" + 0.242*"node" + -0.208*"distribut

In [17]:
get_dominant_topics(articles_tfidf_lsamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
8910,6.0,0.5269,"cluster, agent, reward, action, image, layer, ...",Top 3 Books to Kickstart Your Machine Learning...,machine learning isnt as hard as it used to b...
17747,6.0,0.5171,"cluster, agent, reward, action, image, layer, ...",How to start learning reinforcement learning (RL),build and reflect theres a ton of resources f...
33731,6.0,0.5155,"cluster, agent, reward, action, image, layer, ...",All you need to know about text preprocessing ...,based on some recent conversations i realized...
7120,6.0,0.5036,"cluster, agent, reward, action, image, layer, ...",Applications of Linear Programming Problem (LPP),true optimization is the revolutionary contri...
24908,6.0,0.495,"cluster, agent, reward, action, image, layer, ...",Decoding the performance secret of world’s mos...,no machine learning script can start without ...
20322,6.0,0.4936,"cluster, agent, reward, action, image, layer, ...",Causal Inference via CausalImpact,wikipedia defines it as the process of drawin...
30534,6.0,0.4904,"cluster, agent, reward, action, image, layer, ...",Sentiment Analysis : Simplified,sentiment analysis is a field of natural lang...
23496,6.0,0.4899,"cluster, agent, reward, action, image, layer, ...",Linear Regression and its assumptions,last week i was helping my friend to prepare ...
36461,6.0,0.4873,"cluster, agent, reward, action, image, layer, ...",“ TIME SERIES ANALYSIS USING NEURAL NETWORK ”,welcome to the fourth episode of fastdotai wh...
3891,6.0,0.4841,"cluster, agent, reward, action, image, layer, ...",How to create an AI that plays tick tac toe wi...,in this article we will create two agents who...


## LDA Model

### Result using LDA model + Bag of words

In [18]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [19]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.008*"image" + 0.008*"word" + 0.007*"code" + 0.006*"time" + 0.005*"network" + 0.004*"layer" + 0.004*"text"'),
 (1,
  '0.005*"image" + 0.005*"time" + 0.005*"number" + 0.004*"customer" + 0.004*"different" + 0.004*"training" + 0.004*"feature"'),
 (2,
  '0.005*"machine" + 0.005*"time" + 0.004*"different" + 0.004*"project" + 0.004*"network" + 0.004*"process" + 0.003*"word"'),
 (3,
  '0.005*"machine" + 0.005*"model" + 0.004*"number" + 0.004*"time" + 0.004*"team" + 0.004*"let" + 0.004*"work"'),
 (4,
  '0.006*"time" + 0.005*"point" + 0.005*"algorithm" + 0.005*"machine" + 0.005*"number" + 0.004*"test" + 0.004*"mean"'),
 (5,
  '0.005*"time" + 0.004*"let" + 0.004*"list" + 0.004*"number" + 0.004*"value" + 0.004*"work" + 0.003*"machine"'),
 (6,
  '0.006*"code" + 0.005*"time" + 0.005*"method" + 0.005*"let" + 0.004*"dataset" + 0.004*"python" + 0.004*"machine"'),
 (7,
  '0.007*"time" + 0.005*"number" + 0.004*"code" + 0.004*"network" + 0.004*"training" + 0.004*"model" + 0.004*"work"'),
 (8,
  

In [20]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

In [21]:
get_dominant_topics(articles_bow_ldamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
15368,4.0,0.9997,"time, point, algorithm, machine, number, test,...",Crystal Clear Reinforcement Learning,reinforcement learning rl is the hottest fiel...
39100,0.0,0.9996,"image, word, code, time, network, layer, text,...",A hands-on intuitive approach to Deep Learning...,working with unstructured text data is hard e...
1327,1.0,0.9996,"image, time, number, customer, different, trai...",Super Resolution: Adobe Photoshop versus Leadi...,how effective is adobes super resolution comp...
24483,2.0,0.9996,"machine, time, different, project, network, pr...",AI Strategy in EU Towards 2020,the eu investment in ethical ai is coordinate...
438,4.0,0.9996,"time, point, algorithm, machine, number, test,...",Tackling the exploration-exploitation trade-off,this post was cowritten with baptiste rocca s...
22183,4.0,0.9995,"time, point, algorithm, machine, number, test,...",Student’s t-test in R and by hand: how to comp...,one of the most important test within the bra...
26257,2.0,0.9995,"machine, time, different, project, network, pr...",National AI Strategies and the Climate Crisis,we could look into the strategies of the five...
5439,2.0,0.9995,"machine, time, different, project, network, pr...",21 Predictions about the Software Development ...,cloud edge container quantum blockchain ai de...
3217,4.0,0.9995,"time, point, algorithm, machine, number, test,...",Markov Decision Processes and Bellman Equations,in the first part of this series on reinforce...
10977,2.0,0.9995,"machine, time, different, project, network, pr...",Why are eye movements so damned interesting?,my first rotation in graduate school was in d...


### Result using LDA model + TF-IDF

In [22]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [23]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"variable" + 0.001*"image" + 0.001*"dataset" + 0.001*"distribution" + 0.001*"plot" + 0.001*"value" + 0.001*"project"'),
 (1,
  '0.002*"image" + 0.002*"network" + 0.001*"layer" + 0.001*"training" + 0.001*"algorithm" + 0.001*"neural" + 0.001*"business"'),
 (2,
  '0.001*"word" + 0.001*"python" + 0.001*"variable" + 0.001*"tweet" + 0.001*"plot" + 0.001*"dataset" + 0.001*"network"'),
 (3,
  '0.001*"image" + 0.001*"file" + 0.001*"python" + 0.001*"spark" + 0.001*"tree" + 0.001*"code" + 0.001*"table"'),
 (4,
  '0.001*"probability" + 0.001*"image" + 0.001*"distribution" + 0.000*"variable" + 0.000*"python" + 0.000*"dataset" + 0.000*"file"'),
 (5,
  '0.001*"image" + 0.001*"business" + 0.001*"model" + 0.001*"training" + 0.001*"company" + 0.001*"dataset" + 0.001*"network"'),
 (6,
  '0.001*"image" + 0.001*"variable" + 0.001*"distribution" + 0.001*"test" + 0.001*"probability" + 0.001*"regression" + 0.001*"network"'),
 (7,
  '0.001*"image" + 0.001*"file" + 0.001*"python" + 0.001*"word" + 

In [24]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)

In [25]:
get_dominant_topics(articles_tfidf_ldamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
7242,1.0,0.9638,"image, network, layer, training, algorithm, ne...",The case against investing in machine learning...,the word on the street is if you dont invest ...
34386,1.0,0.962,"image, network, layer, training, algorithm, ne...","Notes on Artificial Intelligence (AI), Machine...",ai has been the most intriguing topic of 2018...
5311,1.0,0.9619,"image, network, layer, training, algorithm, ne...",12 Steps For Beginner To Pro In Data Science I...,so you are a data science enthusiast and want...
35773,1.0,0.9616,"image, network, layer, training, algorithm, ne...",An Experimental Development Process for Making...,its really hard to build product features and...
21391,1.0,0.9613,"image, network, layer, training, algorithm, ne...",The Real AI Crisis,some thought leaders such as elon musk and th...
32333,1.0,0.9613,"image, network, layer, training, algorithm, ne...","A New Data Scientist’s Reflections on Culture,...",it was another sunny spring 2018 day in burba...
9015,1.0,0.9612,"image, network, layer, training, algorithm, ne...",120+ Data Scientist Interview Questions and An...,2020 wasnt the greatest year so i thought why...
30469,1.0,0.9611,"image, network, layer, training, algorithm, ne...",Next Level Art and the Future of Work and Leisure,the fact that ai and deep learning have had a...
34896,1.0,0.9602,"image, network, layer, training, algorithm, ne...",Artificial Intelligence Demystified,ai is this years buzzword of choice across th...
15844,1.0,0.9601,"image, network, layer, training, algorithm, ne...",Data Science in a Post Crisis World,source johns hokpins covid dashboard a decade...
