## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42220 entries, 0 to 42219
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42219 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.5+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
# append special stopwords in articles to default and common stopwords in Gensim library
new_stopwords = STOPWORDS.union(set(['data', 'use', 'like', 'ability', 'let', 'example',
                'need', 'new', 'user', 'provide', 'one', 'used', 'need', 
                'see', 'make', 'follow', 'going', 'will', 'want', 'well', 'find', 
                'give', 'change', 'look', 'first', 'using',
                'know', 'model', 'science', 'think', 'looking', 'problem', 'column', 
                'vallue', 'understand', 'take', 'problem', 'information', 'scientist', 
                'might', 'add', 'now', 'many', 'might', 'column', 'value', 'create',
                'result', 'case', 'article', 'set', 'feature', 'function', 'learning', 'machine',
                'action', 'agent', 'reward', 'network', 'state', 'thing']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the articles content
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [8]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [9]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## Common method to find top 10 dominant topics in the articles

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Title"] = cleaned_articles_df['title']
    dominant_topics_df["Text"] = cleaned_articles_df['cleaned_content']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [12]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.216*"time" + 0.186*"image" + 0.157*"number" + 0.140*"code" + 0.140*"different" + 0.139*"training" + 0.128*"model"'),
 (1,
  '-0.690*"image" + -0.302*"layer" + -0.156*"training" + -0.152*"input" + 0.142*"time" + -0.137*"neural" + -0.125*"output"'),
 (2,
  '0.251*"image" + -0.237*"variable" + -0.226*"distribution" + -0.195*"probability" + 0.167*"file" + 0.166*"project" + -0.161*"value"'),
 (3,
  '0.763*"word" + 0.210*"vector" + 0.197*"text" + -0.182*"image" + -0.162*"variable" + 0.144*"sentence" + 0.134*"language"'),
 (4,
  '-0.356*"code" + -0.301*"file" + -0.293*"python" + -0.162*"variable" + 0.132*"people" + 0.124*"company" + -0.122*"let"'),
 (5,
  '-0.480*"image" + 0.392*"layer" + -0.252*"word" + 0.237*"input" + 0.224*"output" + 0.215*"neural" + 0.165*"weight"'),
 (6,
  '0.313*"distribution" + -0.275*"feature" + -0.269*"model" + 0.256*"probability" + -0.254*"dataset" + -0.226*"training" + 0.177*"time"'),
 (7,
  '-0.385*"cluster" + -0.331*"point" + -0.253*"algorithm" + 0.231*

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Code for image detection and time series <br/>
    Topic 1: Image Recognition/Classification using neural network <br/>
    Topic 2: Probability Distribution in image processing <br/>
    Topic 3: Support vector machine(SVM) in NLP and computer vision <br/>
    Topic 4: Neural network using Python in computer vision and NLP <br/>
    Topic 5: Image classification input, output and weights using Neural Network <br/>
    Topic 6: Probability Distribution, feature engineering, train data and modelling in Time series forecasting <br/>
    Topic 7: Evaluating clustering algorithm  <br/>
    Topic 8: Probability distribution, layer and algorithm in time series <br/>
    Topic 9: Support Vector Machine (SVM) for customer segmentation
</font>

In [13]:
get_dominant_topics(articles_bow_lsamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
39097,3.0,200.8781,"word, vector, text, image, variable, sentence,...",What a CEO needs to know about Machine Learnin...,during my first project in mckinsey in 2011 i...
28350,3.0,197.8698,"word, vector, text, image, variable, sentence,...",Python Dictionary from Scratch!!!,dictionary in python comprises an unordered c...
13770,5.0,190.7227,"image, layer, word, input, output, neural, wei...",Introduction to IBM Federated Learning: A Coll...,ibm research has just released ibm federated ...
25444,0.0,180.3736,"time, image, number, code, different, training...",[Paper Summary] Deep Tree Learning for Zero-Sh...,typically one creates an algorithm or build n...
30101,3.0,158.2951,"word, vector, text, image, variable, sentence,...",Evasion attacks on Machine Learning (or “Adver...,machine learning is exciting however just lik...
25147,0.0,158.258,"time, image, number, code, different, training...",Retrieving OpenStreetMap data in Python,if you want to retrieve geospatial data from ...
35905,0.0,136.4842,"time, image, number, code, different, training...",(PersonLab) Single-shot fully-convolutional ar...,picking up where we left from we are going to...
16907,0.0,136.1179,"time, image, number, code, different, training...",Fraud detection — Unsupervised Anomaly Detection,one of the greatest concerns of many business...
30755,0.0,133.1498,"time, image, number, code, different, training...",Machines that learn by doing,in my midtwenties i learned to play tennis fo...
8874,5.0,132.9112,"image, layer, word, input, output, neural, wei...",How connected is the world? Analysis through a...,probably yes even more when the covid19 virus...


<font color = "blue">
    The most dominant topic with 6 records in the top ten articles is Topic 0: Code for image detection and time series. It seems that most recent articles are mainly written about them.
    
    However, even after many rounds of training and fine-tuning the model, the topic extracted may not exactly accurate about what is written in each document but rather it provides the big picture of what the whole collection of documents are about.
</font>

### Result using LSA model + TF-IDF

In [14]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [15]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.142*"image" + 0.111*"layer" + 0.093*"training" + 0.090*"word" + 0.084*"dataset" + 0.084*"variable" + 0.083*"algorithm"'),
 (1,
  '-0.310*"layer" + -0.284*"image" + -0.152*"neural" + 0.148*"business" + 0.133*"company" + -0.125*"weight" + -0.123*"loss"'),
 (2,
  '0.352*"image" + -0.222*"regression" + -0.191*"variable" + 0.188*"layer" + -0.157*"distribution" + -0.148*"tree" + -0.139*"probability"'),
 (3,
  '-0.221*"file" + -0.164*"panda" + 0.156*"layer" + 0.148*"business" + -0.144*"dataframe" + -0.142*"python" + 0.124*"company"'),
 (4,
  '-0.532*"word" + -0.248*"sentence" + -0.238*"text" + -0.229*"vector" + 0.223*"image" + -0.178*"document" + -0.150*"embeddings"'),
 (5,
  '-0.428*"image" + -0.319*"cluster" + 0.205*"gradient" + 0.185*"layer" + 0.161*"neuron" + -0.157*"clustering" + -0.135*"customer"'),
 (6,
  '-0.380*"tree" + -0.252*"node" + 0.204*"distribution" + -0.179*"cluster" + 0.153*"image" + 0.149*"plot" + -0.140*"decision"'),
 (7,
  '-0.537*"cluster" + -0.238*"clustering"

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Image and word embeddings training<br/>
    Topic 1: Image classification uisng neural network<br/>
    Topic 2: Computer Vision using logistic regression, decision tree<br/>
    Topic 3: Manipulating files using pandas<br/>
    Topic 4: Word vector for sentiment analysis<br/>
    Topic 5: Reinforcement learning<br/>
    Topic 6: Cluster analysis and clustering algorithms<br/>
    Topic 7: Probability distribution from decision tree<br/>
    Topic 8: Customer segmentation<br/>
    Topic 9: Using docker to deploy machine learning models<br/>
</font>

In [16]:
get_dominant_topics(articles_tfidf_lsamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
24115,0.0,0.4454,"image, layer, training, word, dataset, variabl...",Multivariate Outlier Detection in High-Dimensi...,in the realm of laser spectroscopy outliers a...
22911,0.0,0.4336,"image, layer, training, word, dataset, variabl...",Neural Networks: From Zero to Hero,throughout this article will be covered the f...
34284,0.0,0.4238,"image, layer, training, word, dataset, variabl...",Building an Experimentation Framework for Comp...,weka offers a comprehensive suite of librarie...
32133,0.0,0.4219,"image, layer, training, word, dataset, variabl...","Web Scraping For Beginners Beautifulsoup,Scrap...",i was learning about web scraping recently an...
34383,0.0,0.4185,"image, layer, training, word, dataset, variabl...",Analyzing and Predicting Starbucks’ Location S...,in boston it feels like you cant walk more th...
9013,0.0,0.4159,"image, layer, training, word, dataset, variabl...",The home cloud revolution: how to host your pe...,google tracks you yes i know it is not a shoc...
5310,0.0,0.4125,"image, layer, training, word, dataset, variabl...",Time Series Forecasting using TensorFlow and D...,in my previous tds article i described about ...
25147,0.0,0.4116,"image, layer, training, word, dataset, variabl...",Retrieving OpenStreetMap data in Python,if you want to retrieve geospatial data from ...
34978,0.0,0.4047,"image, layer, training, word, dataset, variabl...",Decision Making as a Random Walk,do i make every decision i make correctly pro...
4812,0.0,0.3928,"image, layer, training, word, dataset, variabl...",Best Cities to Work as a Data Scientist,the job landscape for data scientists is prom...


<font color = "blue">The most dominant topic is Topic 0: Image and word embeddings training
</font>

## LDA Model

### Result using LDA model + Bag of words

In [17]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [18]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.006*"time" + 0.006*"point" + 0.005*"code" + 0.004*"work" + 0.004*"project" + 0.004*"different" + 0.004*"let"'),
 (1,
  '0.006*"time" + 0.005*"file" + 0.004*"different" + 0.004*"image" + 0.004*"work" + 0.004*"let" + 0.004*"python"'),
 (2,
  '0.008*"time" + 0.005*"image" + 0.004*"layer" + 0.003*"work" + 0.003*"training" + 0.003*"model" + 0.003*"different"'),
 (3,
  '0.007*"time" + 0.006*"code" + 0.005*"let" + 0.005*"image" + 0.004*"python" + 0.004*"method" + 0.004*"model"'),
 (4,
  '0.006*"image" + 0.005*"code" + 0.005*"time" + 0.005*"number" + 0.004*"let" + 0.004*"model" + 0.004*"different"'),
 (5,
  '0.007*"code" + 0.005*"time" + 0.005*"dataset" + 0.004*"number" + 0.004*"python" + 0.004*"work" + 0.004*"different"'),
 (6,
  '0.007*"word" + 0.007*"time" + 0.006*"number" + 0.005*"image" + 0.004*"training" + 0.004*"different" + 0.004*"value"'),
 (7,
  '0.006*"time" + 0.005*"code" + 0.005*"let" + 0.004*"value" + 0.004*"algorithm" + 0.004*"training" + 0.004*"number"'),
 (8,
  '0.00

<font color = "blue">
 Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Using neural network in computer vision and time series <br/>
    Topic 1: NLP and time series<br/>
    Topic 2: Computer vision and time series<br/>
    Topic 3: Data Science team<br/>
    Topic 4: Time series code<br/>
    Topic 5: Steps and code for time series algorithm<br/>
    Topic 6: Training computer vision and time series<br/>
    Topic 7: Computer vision dataset<br/>
    Topic 8: Working code for computer vision projects<br/>
    Topic 9: Technical indicators in a time series<br/>
</font>

In [19]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

In [20]:
get_dominant_topics(articles_bow_ldamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
18237,8.0,0.9997,"variable, image, time, number, model, differen...",Algorithmic Bias and the Confusion Matrix Dash...,as algorithms increasingly make decisions abo...
505,8.0,0.9996,"variable, image, time, number, model, differen...","A Statistical Analysis of Social, Urban and Na...",this article aims to explore various social u...
39100,6.0,0.9996,"word, time, number, image, training, different...",A hands-on intuitive approach to Deep Learning...,working with unstructured text data is hard e...
4356,0.0,0.9996,"time, point, code, work, project, different, l...",Official 2021 NHL Season Previews,i recently built a projection model for the 2...
7273,1.0,0.9995,"time, file, different, image, work, let, pytho...","A Comparative Study on Cloudera, Amazon Web Se...",abstract this paper is to compare the cloud ...
6447,6.0,0.9995,"word, time, number, image, training, different...",Topic Modeling with Latent Dirichlet Allocation,topic modeling is a form of unsupervised mach...
2002,8.0,0.9995,"variable, image, time, number, model, differen...",Portfolio Diversification With Emerging Market...,we have little to no control over the actual ...
13121,8.0,0.9995,"variable, image, time, number, model, differen...",How to Effectively Predict Imbalanced Classes ...,by a curious twist of fate the amount of time...
1838,2.0,0.9995,"time, image, layer, work, training, model, dif...",The Global Artificial Intelligence Race and St...,artificial intelligence ai has the potential ...
7120,8.0,0.9995,"variable, image, time, number, model, differen...",Applications of Linear Programming Problem (LPP),true optimization is the revolutionary contri...


<font color = "blue">The most dominant topic is Topic 3: Data Science team
</font>

### Result using LDA model + TF-IDF

In [21]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [22]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"image" + 0.001*"file" + 0.001*"plot" + 0.001*"python" + 0.001*"array" + 0.001*"variable" + 0.001*"table"'),
 (1,
  '0.001*"python" + 0.001*"plot" + 0.001*"file" + 0.001*"project" + 0.001*"image" + 0.001*"environment" + 0.001*"variable"'),
 (2,
  '0.001*"image" + 0.001*"file" + 0.001*"command" + 0.001*"python" + 0.001*"code" + 0.001*"cluster" + 0.001*"database"'),
 (3,
  '0.001*"image" + 0.001*"word" + 0.001*"dataset" + 0.001*"training" + 0.001*"text" + 0.001*"class" + 0.001*"probability"'),
 (4,
  '0.003*"image" + 0.003*"layer" + 0.002*"training" + 0.002*"class" + 0.002*"word" + 0.002*"neural" + 0.001*"gradient"'),
 (5,
  '0.001*"panda" + 0.001*"dataframe" + 0.001*"dataset" + 0.001*"spark" + 0.001*"image" + 0.001*"column" + 0.001*"python"'),
 (6,
  '0.001*"word" + 0.001*"player" + 0.001*"image" + 0.001*"layer" + 0.001*"text" + 0.001*"file" + 0.001*"sentence"'),
 (7,
  '0.001*"file" + 0.001*"docker" + 0.001*"image" + 0.001*"variable" + 0.001*"container" + 0.001*"object" +

<font color = "blue">
 Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Training images for computer vision projects<br/>
    Topic 1: Computer vision steps<br/>
    Topic 2: Confusion matrix, heatmaps and correlation matrix plots<br/>
    Topic 3: Probability results from training machine learning results<br/>
    Topic 4: Image classification using logistic regression<br/>
    Topic 5: Using python for image clustering<br/>
    Topic 6: Training in computer vision and analysis<br/>
    Topic 7: File management in python<br/>
    Topic 8: Distribution using image processing<br/>
    Topic 9: Image clustering and word embedding algorithm<br/>
</font>

In [23]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)

In [24]:
get_dominant_topics(articles_tfidf_ldamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
7242,9.0,0.9638,"business, company, team, customer, project, pe...",The case against investing in machine learning...,the word on the street is if you dont invest ...
34386,9.0,0.9622,"business, company, team, customer, project, pe...","Notes on Artificial Intelligence (AI), Machine...",ai has been the most intriguing topic of 2018...
5311,9.0,0.9618,"business, company, team, customer, project, pe...",12 Steps For Beginner To Pro In Data Science I...,so you are a data science enthusiast and want...
32333,9.0,0.9615,"business, company, team, customer, project, pe...","A New Data Scientist’s Reflections on Culture,...",it was another sunny spring 2018 day in burba...
35773,9.0,0.9613,"business, company, team, customer, project, pe...",An Experimental Development Process for Making...,its really hard to build product features and...
21391,9.0,0.9612,"business, company, team, customer, project, pe...",The Real AI Crisis,some thought leaders such as elon musk and th...
34896,9.0,0.9609,"business, company, team, customer, project, pe...",Artificial Intelligence Demystified,ai is this years buzzword of choice across th...
30469,9.0,0.9608,"business, company, team, customer, project, pe...",Next Level Art and the Future of Work and Leisure,the fact that ai and deep learning have had a...
15844,9.0,0.96,"business, company, team, customer, project, pe...",Data Science in a Post Crisis World,source johns hokpins covid dashboard a decade...
21184,9.0,0.9596,"business, company, team, customer, project, pe...",OVER 100 Data Scientist Interview Questions an...,i know this is long really long but dont be i...


<font color = "blue">The most dominant topic is Topic 2: Confusion matrix, heatmaps and correlation matrix plots. Knowing how to evaluate machine learning modes are popular topics to write and they are also important process as part of evaluation in data science.
</font>