## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42220 entries, 0 to 42219
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42219 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.5+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
# append special stopwords in articles to default and common stopwords in Gensim library
new_stopwords = STOPWORDS.union(set(['data', 'use', 'like', 'ability', 'let', 'example',
                'need', 'new', 'user', 'provide', 'one', 'used', 'need', 
                'see', 'make', 'follow', 'going', 'will', 'want', 'well', 'find', 
                'give', 'change', 'look', 'first', 'using',
                'know', 'model', 'science', 'think', 'looking', 'problem', 'column', 
                'vallue', 'understand', 'take', 'problem', 'information', 'scientist', 
                'might', 'add', 'now', 'many', 'might', 'column', 'value', 'create',
                'result', 'case', 'article', 'set', 'feature', 'function', 'learning', 'machine',
                'action', 'agent', 'reward']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the articles content
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [8]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [9]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## Common method to find top 10 dominant topics in the articles

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Title"] = cleaned_articles_df['title']
    dominant_topics_df["Text"] = cleaned_articles_df['cleaned_content']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [12]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.211*"time" + 0.187*"image" + 0.159*"network" + 0.155*"number" + 0.139*"training" + 0.138*"different" + 0.136*"code"'),
 (1,
  '-0.584*"image" + -0.349*"network" + -0.311*"layer" + -0.170*"neural" + -0.155*"input" + -0.145*"training" + 0.141*"time"'),
 (2,
  '0.304*"image" + -0.234*"variable" + -0.224*"distribution" + -0.195*"probability" + 0.170*"file" + 0.163*"code" + 0.162*"project"'),
 (3,
  '0.739*"word" + -0.229*"image" + 0.200*"vector" + 0.190*"text" + -0.174*"variable" + 0.140*"sentence" + 0.134*"language"'),
 (4,
  '0.300*"network" + -0.281*"code" + -0.242*"file" + -0.224*"image" + -0.223*"word" + -0.221*"python" + 0.157*"neural"'),
 (5,
  '0.503*"image" + -0.293*"network" + -0.232*"code" + -0.226*"layer" + 0.216*"word" + -0.202*"python" + -0.169*"file"'),
 (6,
  '0.318*"distribution" + 0.277*"probability" + -0.271*"feature" + -0.267*"model" + -0.240*"dataset" + -0.225*"training" + 0.190*"image"'),
 (7,
  '-0.374*"cluster" + -0.319*"point" + -0.250*"algorithm" + 0.234

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Code for image detection and time series <br/>
    Topic 1: Image Recognition/Classification using neural network <br/>
    Topic 2: Probability Distribution <br/>
    Topic 3: NLP word vector <br/>
    Topic 4: Neural network using Python <br/>
    Topic 5: Computer Vision using Neural Network <br/>
    Topic 6: Probability Distributions <br/>
    Topic 7: State Machine <br/>
    Topic 8: Distribution Clustering <br/>
    Topic 9: Support Vector Machine (SVM)
</font>

In [13]:
get_dominant_topics(articles_bow_lsamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
39097,3.0,193.7726,"word, image, vector, text, variable, sentence,...",What a CEO needs to know about Machine Learnin...,during my first project in mckinsey in 2011 i...
28350,3.0,192.3483,"word, image, vector, text, variable, sentence,...",Python Dictionary from Scratch!!!,dictionary in python comprises an unordered c...
25444,0.0,177.7688,"time, image, network, number, training, differ...",[Paper Summary] Deep Tree Learning for Zero-Sh...,typically one creates an algorithm or build n...
13770,0.0,173.4651,"time, image, network, number, training, differ...",Introduction to IBM Federated Learning: A Coll...,ibm research has just released ibm federated ...
16907,0.0,157.4623,"time, image, network, number, training, differ...",Fraud detection — Unsupervised Anomaly Detection,one of the greatest concerns of many business...
25147,0.0,155.5096,"time, image, network, number, training, differ...",Retrieving OpenStreetMap data in Python,if you want to retrieve geospatial data from ...
30101,3.0,153.0521,"word, image, vector, text, variable, sentence,...",Evasion attacks on Machine Learning (or “Adver...,machine learning is exciting however just lik...
35905,0.0,138.5032,"time, image, network, number, training, differ...",(PersonLab) Single-shot fully-convolutional ar...,picking up where we left from we are going to...
30755,0.0,132.4417,"time, image, network, number, training, differ...",Machines that learn by doing,in my midtwenties i learned to play tennis fo...
16704,0.0,129.7537,"time, image, network, number, training, differ...",Using Kafka as a Temporary Data Store and Data...,apache kafka is a streaming platform that all...


<font color = "blue">
    The most dominant topic with 6 records in the top ten articles is Topic 0: Code for image detection and time series. It seems that most recent articles are mainly written about them.
</font>

### Result using LSA model + TF-IDF

In [14]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [15]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.143*"image" + 0.117*"network" + 0.113*"layer" + 0.093*"training" + 0.089*"word" + 0.084*"dataset" + 0.083*"variable"'),
 (1,
  '-0.321*"layer" + -0.286*"image" + -0.237*"network" + -0.167*"neural" + 0.140*"business" + -0.125*"weight" + 0.125*"company"'),
 (2,
  '0.310*"image" + -0.227*"regression" + -0.198*"variable" + 0.173*"layer" + -0.159*"distribution" + -0.152*"tree" + -0.141*"probability"'),
 (3,
  '-0.229*"file" + -0.157*"panda" + 0.151*"business" + 0.141*"network" + -0.141*"python" + -0.138*"dataframe" + 0.128*"company"'),
 (4,
  '-0.531*"word" + -0.249*"sentence" + -0.238*"text" + -0.228*"vector" + 0.223*"image" + -0.178*"document" + -0.151*"sentiment"'),
 (5,
  '-0.468*"image" + -0.258*"cluster" + 0.193*"gradient" + 0.168*"layer" + 0.159*"neuron" + 0.144*"network" + -0.131*"clustering"'),
 (6,
  '-0.375*"tree" + -0.255*"node" + 0.203*"distribution" + -0.194*"cluster" + 0.160*"image" + 0.147*"plot" + -0.137*"decision"'),
 (7,
  '-0.562*"cluster" + -0.249*"clustering"

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Training word embeddings<br/>
    Topic 1: Image classification uisng neural network<br/>
    Topic 2: Computer Vision using logistic regression, decision tree<br/>
    Topic 3: Manipulating files using pandas<br/>
    Topic 4: Word vector for sentiment analysis<br/>
    Topic 5: Reinforcement learning<br/>
    Topic 6: Cluster analysis and clustering algorithms<br/>
    Topic 7: Probability distribution from decision tree<br/>
    Topic 8: Customer segmentation<br/>
    Topic 9: Using docker to deploy machine learning models<br/>
</font>

In [16]:
get_dominant_topics(articles_tfidf_lsamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
24115,0.0,0.4493,"image, network, layer, training, word, dataset...",Multivariate Outlier Detection in High-Dimensi...,in the realm of laser spectroscopy outliers a...
22911,0.0,0.4472,"image, network, layer, training, word, dataset...",Neural Networks: From Zero to Hero,throughout this article will be covered the f...
34284,0.0,0.4297,"image, network, layer, training, word, dataset...",Building an Experimentation Framework for Comp...,weka offers a comprehensive suite of librarie...
34383,0.0,0.4284,"image, network, layer, training, word, dataset...",Analyzing and Predicting Starbucks’ Location S...,in boston it feels like you cant walk more th...
32133,0.0,0.4205,"image, network, layer, training, word, dataset...","Web Scraping For Beginners Beautifulsoup,Scrap...",i was learning about web scraping recently an...
9013,0.0,0.4174,"image, network, layer, training, word, dataset...",The home cloud revolution: how to host your pe...,google tracks you yes i know it is not a shoc...
34978,0.0,0.4107,"image, network, layer, training, word, dataset...",Decision Making as a Random Walk,do i make every decision i make correctly pro...
5310,0.0,0.4102,"image, network, layer, training, word, dataset...",Time Series Forecasting using TensorFlow and D...,in my previous tds article i described about ...
25147,0.0,0.4086,"image, network, layer, training, word, dataset...",Retrieving OpenStreetMap data in Python,if you want to retrieve geospatial data from ...
19046,0.0,0.403,"image, network, layer, training, word, dataset...",7 Tools to Create A Rockstar Data Science Port...,according to the latest stats on coursera ove...


## LDA Model

### Result using LDA model + Bag of words

In [17]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [18]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.006*"image" + 0.005*"time" + 0.005*"model" + 0.004*"file" + 0.004*"let" + 0.004*"number" + 0.003*"different"'),
 (1,
  '0.007*"time" + 0.004*"value" + 0.004*"variable" + 0.004*"algorithm" + 0.004*"let" + 0.004*"point" + 0.003*"different"'),
 (2,
  '0.008*"time" + 0.006*"code" + 0.006*"word" + 0.006*"dataset" + 0.005*"number" + 0.004*"value" + 0.004*"different"'),
 (3,
  '0.011*"image" + 0.006*"network" + 0.006*"layer" + 0.005*"training" + 0.004*"different" + 0.004*"algorithm" + 0.004*"model"'),
 (4,
  '0.006*"time" + 0.005*"let" + 0.004*"algorithm" + 0.004*"number" + 0.003*"different" + 0.003*"code" + 0.003*"model"'),
 (5,
  '0.006*"code" + 0.006*"time" + 0.005*"let" + 0.005*"word" + 0.005*"image" + 0.004*"number" + 0.004*"step"'),
 (6,
  '0.006*"time" + 0.006*"number" + 0.005*"different" + 0.004*"model" + 0.004*"code" + 0.004*"let" + 0.003*"work"'),
 (7,
  '0.006*"time" + 0.005*"feature" + 0.004*"code" + 0.004*"training" + 0.004*"method" + 0.004*"model" + 0.003*"different"')

<font color = "blue">
 Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Using neural network in computer vision and time series <br/>
    Topic 1: NLP and time series<br/>
    Topic 2: Computer vision and time series<br/>
    Topic 3: Data Science team<br/>
    Topic 4: Time series code<br/>
    Topic 5: Steps and code for time series algorithm<br/>
    Topic 6: Training computer vision and time series<br/>
    Topic 7: Computer vision dataset<br/>
    Topic 8: Working code for computer vision projects<br/>
    Topic 9: Technical indicators in a time series<br/>
</font>

In [19]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

In [20]:
get_dominant_topics(articles_bow_ldamodel, articles_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
24071,5.0,0.9997,"code, time, let, word, image, number, step, va...",Understanding Singular Value Decomposition and...,in linear algebra the singular value decompos...
20094,3.0,0.9997,"image, network, layer, training, different, al...",Analysis and Applications of Multi-Scale CNN F...,in this blog post we present a formal treatme...
34037,3.0,0.9997,"image, network, layer, training, different, al...",A Comprehensive Introduction to Different Type...,if youve heard of different kinds of convolut...
33718,3.0,0.9996,"image, network, layer, training, different, al...","Deep learning based super resolution, without ...",this article describes the techniques and tra...
1327,3.0,0.9996,"image, network, layer, training, different, al...",Super Resolution: Adobe Photoshop versus Leadi...,how effective is adobes super resolution comp...
13002,3.0,0.9995,"image, network, layer, training, different, al...",10 Papers You Should Read to Understand Image ...,computer vision is a subject to convert image...
219,1.0,0.9995,"time, value, variable, algorithm, let, point, ...",The Geometry of Exponential Growth,2500 years ago greek mathematicians had a pro...
6217,3.0,0.9995,"image, network, layer, training, different, al...",Exploring Convolutional Neural Network Archite...,in part 1 building an image database weve scr...
759,3.0,0.9995,"image, network, layer, training, different, al...",Top,ai and computer science that enables automate...
3671,8.0,0.9995,"time, image, work, test, code, number, network...",Hypothesis test by hand,remember that descriptive statistics is the b...


### Result using LDA model + TF-IDF

In [21]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [22]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"python" + 0.001*"file" + 0.001*"image" + 0.001*"environment" + 0.001*"code" + 0.001*"business" + 0.001*"project"'),
 (1,
  '0.001*"image" + 0.001*"variable" + 0.001*"matrix" + 0.001*"layer" + 0.001*"network" + 0.001*"probability" + 0.001*"object"'),
 (2,
  '0.001*"word" + 0.001*"image" + 0.001*"network" + 0.001*"algorithm" + 0.001*"business" + 0.001*"training" + 0.001*"company"'),
 (3,
  '0.001*"word" + 0.001*"image" + 0.001*"class" + 0.001*"probability" + 0.001*"variable" + 0.001*"file" + 0.001*"dataset"'),
 (4,
  '0.001*"image" + 0.001*"regression" + 0.001*"network" + 0.001*"variable" + 0.001*"distribution" + 0.001*"value" + 0.001*"dataset"'),
 (5,
  '0.001*"file" + 0.001*"spark" + 0.001*"cluster" + 0.001*"image" + 0.001*"variable" + 0.001*"command" + 0.001*"python"'),
 (6,
  '0.002*"image" + 0.001*"layer" + 0.001*"network" + 0.001*"training" + 0.001*"file" + 0.001*"plot" + 0.001*"dataset"'),
 (7,
  '0.001*"file" + 0.001*"layer" + 0.001*"variable" + 0.001*"image" + 0.0

<font color = "blue">
 Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>
    Topic 0: Training images for computer vision projects<br/>
    Topic 1: Computer vision steps<br/>
    Topic 2: <br/>
    Topic 3: <br/>
    Topic 4: <br/>
    Topic 5: <br/>
    Topic 6: <br/>
    Topic 7: <br/>
    Topic 8: <br/>
    Topic 9: <br/>
</font>

In [23]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)

In [24]:
get_dominant_topics(articles_tfidf_ldamodel, articles_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Title,Text
7242,2.0,0.9643,"word, image, network, algorithm, business, tra...",The case against investing in machine learning...,the word on the street is if you dont invest ...
5311,2.0,0.9623,"word, image, network, algorithm, business, tra...",12 Steps For Beginner To Pro In Data Science I...,so you are a data science enthusiast and want...
34386,2.0,0.9622,"word, image, network, algorithm, business, tra...","Notes on Artificial Intelligence (AI), Machine...",ai has been the most intriguing topic of 2018...
35521,2.0,0.9622,"word, image, network, algorithm, business, tra...",NIPS/NeurIPS 2018: Best* of the First Two Post...,neurips is a great conference attracting the ...
30469,2.0,0.9616,"word, image, network, algorithm, business, tra...",Next Level Art and the Future of Work and Leisure,the fact that ai and deep learning have had a...
32333,2.0,0.9615,"word, image, network, algorithm, business, tra...","A New Data Scientist’s Reflections on Culture,...",it was another sunny spring 2018 day in burba...
35773,2.0,0.9614,"word, image, network, algorithm, business, tra...",An Experimental Development Process for Making...,its really hard to build product features and...
9015,2.0,0.9614,"word, image, network, algorithm, business, tra...",120+ Data Scientist Interview Questions and An...,2020 wasnt the greatest year so i thought why...
21391,2.0,0.9613,"word, image, network, algorithm, business, tra...",The Real AI Crisis,some thought leaders such as elon musk and th...
34896,2.0,0.9608,"word, image, network, algorithm, business, tra...",Artificial Intelligence Demystified,ai is this years buzzword of choice across th...
