## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42220 entries, 0 to 42219
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42219 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.5+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
# append special stopwords in articles to default and common stopwords in Gensim library
new_stopwords = STOPWORDS.union(set(['data', 'science', 'model', 'learning', 'value', 'state', 'action', 'function', 
                                     'think', 'thing', 'column', 'policy', 'state']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the articles content
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [8]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [9]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## Common method to find top 10 dominant topics in the articles

In [14]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Text"] = cleaned_articles_df['cleaned_content']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [12]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.215*"learning" + 0.202*"value" + 0.186*"function" + 0.177*"time" + 0.154*"feature" + 0.153*"like" + 0.148*"image"'),
 (1,
  '-0.518*"image" + -0.342*"network" + -0.304*"layer" + -0.170*"neural" + -0.158*"training" + -0.158*"input" + 0.134*"time"'),
 (2,
  '-0.384*"value" + -0.355*"function" + 0.319*"learning" + -0.231*"variable" + 0.199*"image" + 0.186*"machine" + -0.161*"distribution"'),
 (3,
  '-0.312*"learning" + 0.301*"image" + -0.274*"feature" + 0.272*"code" + 0.266*"function" + 0.250*"file" + 0.194*"python"'),
 (4,
  '-0.768*"word" + -0.231*"vector" + 0.195*"image" + -0.192*"text" + 0.147*"learning" + -0.143*"sentence" + -0.127*"document"'),
 (5,
  '-0.445*"feature" + 0.431*"function" + -0.385*"image" + 0.288*"learning" + 0.205*"network" + -0.154*"dataset" + 0.138*"neural"'),
 (6,
  '-0.391*"feature" + -0.291*"learning" + 0.278*"image" + -0.203*"machine" + 0.199*"distribution" + -0.192*"python" + -0.185*"code"'),
 (7,
  '0.368*"feature" + -0.317*"image" + -0.315*"learni

In [15]:
get_dominant_topics(articles_bow_lsamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
25444,0.0,237.6149,"learning, value, function, time, feature, like...",typically one creates an algorithm or build n...
13770,0.0,229.6903,"learning, value, function, time, feature, like...",ibm research has just released ibm federated ...
25147,0.0,170.1253,"learning, value, function, time, feature, like...",if you want to retrieve geospatial data from ...
35905,0.0,167.5791,"learning, value, function, time, feature, like...",picking up where we left from we are going to...
16907,0.0,159.037,"learning, value, function, time, feature, like...",one of the greatest concerns of many business...
16704,0.0,155.0214,"learning, value, function, time, feature, like...",apache kafka is a streaming platform that all...
2698,9.0,153.7958,"function, think, feature, like, thing, time, n...",gpt3 generative pretrained transformer 3 is a...
30755,0.0,153.3528,"learning, value, function, time, feature, like...",in my midtwenties i learned to play tennis fo...
6115,0.0,147.3596,"learning, value, function, time, feature, like...",1 ai as a research field started in 1956 by b...
5363,9.0,142.0312,"function, think, feature, like, thing, time, n...",human genes names are mistaken as dates limit...


### Result using LSA model + TF-IDF

In [16]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [17]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.137*"image" + 0.112*"network" + 0.108*"layer" + 0.102*"feature" + 0.096*"function" + 0.092*"learning" + 0.090*"training"'),
 (1,
  '-0.309*"layer" + -0.277*"image" + -0.227*"network" + -0.161*"neural" + 0.140*"business" + 0.137*"scientist" + 0.126*"company"'),
 (2,
  '0.311*"image" + -0.214*"regression" + -0.202*"variable" + 0.194*"layer" + 0.157*"network" + -0.151*"distribution" + -0.142*"tree"'),
 (3,
  '-0.234*"file" + -0.213*"column" + -0.163*"panda" + -0.144*"dataframe" + -0.138*"python" + -0.127*"command" + 0.126*"business"'),
 (4,
  '-0.539*"word" + -0.252*"sentence" + -0.245*"text" + -0.228*"vector" + 0.210*"image" + -0.182*"document" + -0.156*"sentiment"'),
 (5,
  '-0.390*"image" + 0.215*"agent" + -0.203*"cluster" + 0.184*"function" + 0.183*"reward" + 0.164*"gradient" + 0.146*"state"'),
 (6,
  '0.334*"tree" + 0.221*"node" + -0.211*"image" + -0.159*"column" + -0.156*"distribution" + 0.143*"cluster" + -0.136*"plot"'),
 (7,
  '-0.535*"cluster" + -0.259*"agent" + -0.239*

In [18]:
get_dominant_topics(articles_tfidf_lsamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
8910,9.0,0.501,"agent, cluster, reward, action, tree, game, po...",machine learning isnt as hard as it used to b...
30534,9.0,0.4805,"agent, cluster, reward, action, tree, game, po...",sentiment analysis is a field of natural lang...
37707,9.0,0.4784,"agent, cluster, reward, action, tree, game, po...",recently ive been playing around with a multi...
7120,9.0,0.4767,"agent, cluster, reward, action, tree, game, po...",true optimization is the revolutionary contri...
33731,9.0,0.4758,"agent, cluster, reward, action, tree, game, po...",based on some recent conversations i realized...
17747,9.0,0.4722,"agent, cluster, reward, action, tree, game, po...",build and reflect theres a ton of resources f...
37556,9.0,0.471,"agent, cluster, reward, action, tree, game, po...",machine learning opportunities can be sparse ...
3891,9.0,0.4694,"agent, cluster, reward, action, tree, game, po...",in this article we will create two agents who...
23496,9.0,0.4687,"agent, cluster, reward, action, tree, game, po...",last week i was helping my friend to prepare ...
34629,9.0,0.4681,"agent, cluster, reward, action, tree, game, po...",this article describes my machine learning pr...


## LDA Model

### Result using LDA model + Bag of words

In [19]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [20]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.007*"time" + 0.007*"like" + 0.005*"learning" + 0.005*"value" + 0.005*"need" + 0.004*"article" + 0.004*"user"'),
 (1,
  '0.007*"learning" + 0.006*"value" + 0.006*"time" + 0.005*"like" + 0.005*"model" + 0.004*"function" + 0.004*"different"'),
 (2,
  '0.006*"learning" + 0.006*"time" + 0.005*"like" + 0.005*"need" + 0.004*"function" + 0.004*"company" + 0.004*"different"'),
 (3,
  '0.009*"image" + 0.008*"learning" + 0.006*"value" + 0.006*"machine" + 0.005*"network" + 0.005*"number" + 0.005*"different"'),
 (4,
  '0.016*"image" + 0.006*"layer" + 0.006*"network" + 0.006*"time" + 0.005*"like" + 0.004*"training" + 0.004*"learning"'),
 (5,
  '0.006*"function" + 0.006*"need" + 0.005*"learning" + 0.005*"value" + 0.005*"example" + 0.005*"like" + 0.004*"time"'),
 (6,
  '0.008*"value" + 0.006*"need" + 0.006*"time" + 0.005*"function" + 0.005*"example" + 0.004*"code" + 0.004*"number"'),
 (7,
  '0.007*"time" + 0.007*"distribution" + 0.005*"value" + 0.005*"need" + 0.004*"like" + 0.004*"feature" +

In [21]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

In [22]:
get_dominant_topics(articles_bow_ldamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
4356,1.0,0.9998,"learning, value, time, like, model, function, ...",i recently built a projection model for the 2...
34037,4.0,0.9997,"image, layer, network, time, like, training, l...",if youve heard of different kinds of convolut...
874,7.0,0.9997,"time, distribution, value, need, like, feature...",it was summer 2019 i looked at a cloudy sky i...
23298,2.0,0.9996,"learning, time, like, need, function, company,...",ninetyfive percent of csuite executives list ...
2002,7.0,0.9996,"time, distribution, value, need, like, feature...",we have little to no control over the actual ...
20094,4.0,0.9996,"image, layer, network, time, like, training, l...",in this blog post we present a formal treatme...
39187,2.0,0.9996,"learning, time, like, need, function, company,...",technology has eroded our privacy protections...
25986,8.0,0.9996,"function, need, python, value, feature, code, ...",the world runs on data and everyone should kn...
1327,4.0,0.9996,"image, layer, network, time, like, training, l...",how effective is adobes super resolution comp...
32737,2.0,0.9995,"learning, time, like, need, function, company,...",ai is in full gold rush mode every day we hea...


### Result using LDA model + TF-IDF

In [23]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [24]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"probability" + 0.001*"variable" + 0.001*"feature" + 0.001*"function" + 0.001*"regression" + 0.001*"plot" + 0.001*"image"'),
 (1,
  '0.001*"business" + 0.001*"learning" + 0.001*"scientist" + 0.001*"network" + 0.001*"company" + 0.001*"image" + 0.001*"machine"'),
 (2,
  '0.001*"variable" + 0.001*"function" + 0.001*"regression" + 0.001*"feature" + 0.001*"value" + 0.001*"dataset" + 0.001*"image"'),
 (3,
  '0.002*"image" + 0.002*"word" + 0.002*"layer" + 0.002*"function" + 0.002*"column" + 0.001*"network" + 0.001*"vector"'),
 (4,
  '0.003*"image" + 0.002*"layer" + 0.001*"network" + 0.001*"function" + 0.001*"convolution" + 0.001*"object" + 0.001*"input"'),
 (5,
  '0.001*"image" + 0.001*"file" + 0.001*"function" + 0.001*"layer" + 0.001*"feature" + 0.001*"word" + 0.001*"network"'),
 (6,
  '0.001*"word" + 0.001*"feature" + 0.001*"scientist" + 0.001*"project" + 0.001*"function" + 0.001*"dataset" + 0.001*"learning"'),
 (7,
  '0.001*"feature" + 0.001*"column" + 0.001*"tree" + 0.001*"f

In [25]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)

In [26]:
get_dominant_topics(articles_tfidf_ldamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
7242,1.0,0.9644,"business, learning, scientist, network, compan...",the word on the street is if you dont invest ...
35521,1.0,0.9625,"business, learning, scientist, network, compan...",neurips is a great conference attracting the ...
34386,1.0,0.9624,"business, learning, scientist, network, compan...",ai has been the most intriguing topic of 2018...
32333,1.0,0.9622,"business, learning, scientist, network, compan...",it was another sunny spring 2018 day in burba...
9015,1.0,0.9621,"business, learning, scientist, network, compan...",2020 wasnt the greatest year so i thought why...
5311,1.0,0.9621,"business, learning, scientist, network, compan...",so you are a data science enthusiast and want...
30469,1.0,0.9618,"business, learning, scientist, network, compan...",the fact that ai and deep learning have had a...
21391,1.0,0.9616,"business, learning, scientist, network, compan...",some thought leaders such as elon musk and th...
35773,1.0,0.9613,"business, learning, scientist, network, compan...",its really hard to build product features and...
7805,1.0,0.9612,"business, learning, scientist, network, compan...",look we both know you have like eight other m...
