## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42220 entries, 0 to 42219
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42219 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.5+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
# append special stopwords in articles to default and common stopwords in Gensim library
new_stopwords = STOPWORDS.union(set(['data', 'use', 'like', 'ability', 'let', 'example',
                'need', 'new', 'user', 'provide', 'one', 'used', 'need', 
                'see', 'make', 'follow', 'going', 'will', 'want', 'well', 'find', 
                'give', 'change', 'look', 'first', 'using',
                'know', 'model', 'science', 'think', 'looking', 'problem', 'column', 
                'vallue', 'understand', 'take', 'problem', 'information', 'scientist', 
                'might', 'add', 'now', 'many', 'might', 'column', 'value', 'create',
                'result', 'case', 'article', 'set']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the articles content
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [8]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [9]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## Common method to find top 10 dominant topics in the articles

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Text"] = cleaned_articles_df['cleaned_content']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [12]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.243*"learning" + 0.203*"function" + 0.193*"time" + 0.173*"feature" + 0.171*"image" + 0.152*"network" + 0.141*"number"'),
 (1,
  '-0.546*"image" + -0.339*"network" + -0.307*"layer" + -0.166*"neural" + 0.155*"time" + -0.154*"input" + -0.141*"training"'),
 (2,
  '-0.433*"function" + 0.385*"learning" + -0.238*"variable" + 0.222*"machine" + -0.176*"distribution" + 0.175*"image" + -0.168*"value"'),
 (3,
  '-0.323*"feature" + 0.302*"image" + 0.291*"code" + 0.268*"function" + 0.256*"file" + -0.226*"word" + 0.220*"python"'),
 (4,
  '0.733*"word" + -0.244*"feature" + 0.213*"vector" + 0.193*"text" + 0.138*"sentence" + -0.135*"image" + -0.132*"learning"'),
 (5,
  '0.420*"image" + -0.404*"function" + 0.368*"feature" + -0.346*"learning" + -0.212*"network" + 0.178*"word" + -0.142*"neural"'),
 (6,
  '-0.422*"feature" + 0.258*"image" + -0.246*"learning" + 0.214*"distribution" + -0.197*"python" + -0.195*"code" + 0.194*"probability"'),
 (7,
  '0.344*"feature" + -0.342*"learning" + -0.309*"image

In [13]:
get_dominant_topics(articles_bow_lsamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
25444,0.0,216.3245,"learning, function, time, feature, image, netw...",typically one creates an algorithm or build n...
13770,0.0,213.2259,"learning, function, time, feature, image, netw...",ibm research has just released ibm federated ...
39097,4.0,186.8111,"word, feature, vector, text, sentence, image, ...",during my first project in mckinsey in 2011 i...
28350,4.0,181.7809,"word, feature, vector, text, sentence, image, ...",dictionary in python comprises an unordered c...
35905,0.0,172.0735,"learning, function, time, feature, image, netw...",picking up where we left from we are going to...
16907,0.0,167.2599,"learning, function, time, feature, image, netw...",one of the greatest concerns of many business...
25147,0.0,156.1927,"learning, function, time, feature, image, netw...",if you want to retrieve geospatial data from ...
30101,4.0,154.1383,"word, feature, vector, text, sentence, image, ...",machine learning is exciting however just lik...
30755,0.0,141.2564,"learning, function, time, feature, image, netw...",in my midtwenties i learned to play tennis fo...
16704,0.0,139.9802,"learning, function, time, feature, image, netw...",apache kafka is a streaming platform that all...


### Result using LSA model + TF-IDF

In [14]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [15]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.141*"image" + 0.115*"network" + 0.111*"layer" + 0.104*"feature" + 0.098*"function" + 0.094*"learning" + 0.092*"training"'),
 (1,
  '-0.314*"layer" + -0.279*"image" + -0.229*"network" + -0.162*"neural" + 0.141*"business" + 0.127*"company" + -0.124*"weight"'),
 (2,
  '0.314*"image" + -0.223*"regression" + -0.200*"variable" + 0.183*"layer" + -0.157*"distribution" + -0.149*"tree" + 0.143*"network"'),
 (3,
  '-0.234*"file" + -0.159*"panda" + -0.141*"python" + -0.140*"dataframe" + 0.140*"network" + 0.136*"business" + -0.127*"command"'),
 (4,
  '-0.537*"word" + -0.251*"sentence" + -0.240*"text" + -0.228*"vector" + 0.213*"image" + -0.179*"document" + -0.153*"sentiment"'),
 (5,
  '-0.393*"image" + 0.230*"agent" + 0.196*"reward" + -0.186*"cluster" + 0.183*"function" + 0.169*"gradient" + 0.153*"action"'),
 (6,
  '0.415*"cluster" + 0.327*"agent" + 0.273*"reward" + 0.206*"action" + 0.189*"clustering" + 0.177*"policy" + 0.176*"state"'),
 (7,
  '0.363*"tree" + 0.229*"node" + -0.214*"image" 

In [16]:
get_dominant_topics(articles_tfidf_lsamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
8910,6.0,0.5298,"cluster, agent, reward, action, clustering, po...",machine learning isnt as hard as it used to b...
17747,6.0,0.5192,"cluster, agent, reward, action, clustering, po...",build and reflect theres a ton of resources f...
33731,6.0,0.5189,"cluster, agent, reward, action, clustering, po...",based on some recent conversations i realized...
7120,6.0,0.5059,"cluster, agent, reward, action, clustering, po...",true optimization is the revolutionary contri...
20322,6.0,0.4988,"cluster, agent, reward, action, clustering, po...",wikipedia defines it as the process of drawin...
24908,6.0,0.4973,"cluster, agent, reward, action, clustering, po...",no machine learning script can start without ...
30534,6.0,0.4919,"cluster, agent, reward, action, clustering, po...",sentiment analysis is a field of natural lang...
23496,6.0,0.4914,"cluster, agent, reward, action, clustering, po...",last week i was helping my friend to prepare ...
37135,6.0,0.489,"cluster, agent, reward, action, clustering, po...",neural networks are a ground breaking technol...
36461,6.0,0.4889,"cluster, agent, reward, action, clustering, po...",welcome to the fourth episode of fastdotai wh...


## LDA Model

### Result using LDA model + Bag of words

In [17]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [18]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.010*"function" + 0.006*"time" + 0.006*"value" + 0.005*"let" + 0.005*"variable" + 0.005*"point" + 0.005*"learning"'),
 (1,
  '0.007*"time" + 0.006*"learning" + 0.006*"variable" + 0.005*"code" + 0.004*"algorithm" + 0.004*"method" + 0.004*"machine"'),
 (2,
  '0.011*"learning" + 0.007*"machine" + 0.005*"model" + 0.005*"feature" + 0.004*"number" + 0.004*"time" + 0.004*"training"'),
 (3,
  '0.006*"time" + 0.006*"function" + 0.005*"model" + 0.005*"code" + 0.004*"python" + 0.004*"machine" + 0.004*"number"'),
 (4,
  '0.007*"learning" + 0.005*"code" + 0.004*"python" + 0.004*"time" + 0.004*"function" + 0.004*"machine" + 0.004*"different"'),
 (5,
  '0.007*"time" + 0.006*"function" + 0.006*"word" + 0.006*"image" + 0.005*"learning" + 0.004*"feature" + 0.004*"number"'),
 (6,
  '0.005*"learning" + 0.005*"time" + 0.005*"function" + 0.004*"different" + 0.004*"file" + 0.004*"python" + 0.004*"number"'),
 (7,
  '0.008*"image" + 0.007*"learning" + 0.007*"time" + 0.007*"function" + 0.006*"feature" 

In [19]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

In [20]:
get_dominant_topics(articles_bow_ldamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
16707,0.0,0.9998,"function, time, value, let, variable, point, l...",if you are stuck in front of the paywall use ...
30758,0.0,0.9998,"function, time, value, let, variable, point, l...",if you are stuck in front of the paywall use ...
25447,0.0,0.9998,"function, time, value, let, variable, point, l...",if you are stuck in front of the paywall use ...
23976,0.0,0.9998,"function, time, value, let, variable, point, l...",from our macroscopic human perspective the un...
24036,6.0,0.9997,"learning, time, function, different, file, pyt...",according to wikipedia data analysis is a pro...
7123,9.0,0.9997,"learning, time, feature, number, code, work, a...",it is no secret that analytics is playing an ...
1253,0.0,0.9996,"function, time, value, let, variable, point, l...",weve all heard that information is beautiful ...
13852,6.0,0.9996,"learning, time, function, different, file, pyt...",data engineering is an attractive field it st...
24821,6.0,0.9996,"learning, time, function, different, file, pyt...",there is little question big data analytics d...
463,0.0,0.9996,"function, time, value, let, variable, point, l...",as karl pearson a british mathematician has o...


### Result using LDA model + TF-IDF

In [21]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [22]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"function" + 0.001*"image" + 0.001*"feature" + 0.001*"variable" + 0.001*"regression" + 0.001*"project" + 0.001*"plot"'),
 (1,
  '0.001*"image" + 0.001*"plot" + 0.001*"dataset" + 0.001*"variable" + 0.001*"cluster" + 0.001*"feature" + 0.001*"file"'),
 (2,
  '0.002*"image" + 0.001*"network" + 0.001*"business" + 0.001*"learning" + 0.001*"feature" + 0.001*"company" + 0.001*"layer"'),
 (3,
  '0.001*"word" + 0.001*"image" + 0.001*"feature" + 0.001*"function" + 0.001*"file" + 0.001*"dataset" + 0.001*"text"'),
 (4,
  '0.001*"function" + 0.001*"file" + 0.001*"image" + 0.001*"python" + 0.001*"feature" + 0.001*"company" + 0.001*"panda"'),
 (5,
  '0.001*"function" + 0.001*"panda" + 0.001*"python" + 0.001*"file" + 0.001*"object" + 0.001*"column" + 0.001*"variable"'),
 (6,
  '0.002*"image" + 0.002*"function" + 0.001*"gradient" + 0.001*"class" + 0.001*"probability" + 0.001*"training" + 0.001*"word"'),
 (7,
  '0.001*"file" + 0.001*"image" + 0.001*"python" + 0.001*"docker" + 0.001*"functio

In [23]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)

In [24]:
get_dominant_topics(articles_tfidf_ldamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
7242,2.0,0.964,"image, network, business, learning, feature, c...",the word on the street is if you dont invest ...
5311,2.0,0.9622,"image, network, business, learning, feature, c...",so you are a data science enthusiast and want...
35521,2.0,0.9621,"image, network, business, learning, feature, c...",neurips is a great conference attracting the ...
34386,2.0,0.9621,"image, network, business, learning, feature, c...",ai has been the most intriguing topic of 2018...
35773,2.0,0.9616,"image, network, business, learning, feature, c...",its really hard to build product features and...
30469,2.0,0.9615,"image, network, business, learning, feature, c...",the fact that ai and deep learning have had a...
32333,2.0,0.9612,"image, network, business, learning, feature, c...",it was another sunny spring 2018 day in burba...
9015,2.0,0.9611,"image, network, business, learning, feature, c...",2020 wasnt the greatest year so i thought why...
21391,2.0,0.9611,"image, network, business, learning, feature, c...",some thought leaders such as elon musk and th...
15844,2.0,0.9602,"image, network, business, learning, feature, c...",source johns hokpins covid dashboard a decade...
