## Import necessary libraries

In [26]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

## Read cleaned data files

### Read cleaned articles file and save in dataframe

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

### Check columns info in articles dataframe

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42322 entries, 0 to 42321
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42322 non-null  object
 1   title            42322 non-null  object
 2   author           42322 non-null  object
 3   url              42322 non-null  object
 4   claps            42322 non-null  int64 
 5   responses        42322 non-null  int64 
 6   reading_time     42322 non-null  int64 
 7   paid             42322 non-null  int64 
 8   content          42322 non-null  object
 9   cleaned_content  42232 non-null  object
 10  cleaned_author   42322 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.6+ MB


## Data Preprocessing

### Define stopwords that appear in both articles and jobs listings

In [4]:
new_common_stopwords = ['job', 'skill', 'experience', 'team', 'data', 'science', 'use', 'like', 'business', 'work', 'ability', 'let', 'example'
                'need', 'new', 'user', 'opportunity', 'candidate', 'provide', 'company', 'think', 'thing', 'function', 'learning']

In [5]:
# append the defined common stopwords with those provided by default in gensim stopwords
new_common_stopwords = STOPWORDS.union(set(new_common_stopwords))

In [6]:
# append special stopwords in articles to default and common stopwords
articles_stopwords = new_common_stopwords.union(set(['model', 'learning']))

### Method to preprocess data for articles

In [7]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in articles_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [8]:
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [9]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [10]:
# Convert into the bag-of-words (BoW) format
articles_bow_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [11]:
# Convert into TF-IDF format
articles_tfidf = models.TfidfModel(articles_bow_corpus)
articles_tfidf_corpus = articles_tfidf[articles_bow_corpus]

## LSA Model

### Result using LSA model + Bag of words

In [12]:
articles_bow_lsamodel = LsiModel(articles_bow_corpus, num_topics=10, id2word = articles_dictionary)

In [13]:
articles_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.219*"value" + 0.190*"time" + 0.169*"feature" + 0.161*"image" + 0.153*"need" + 0.149*"example" + 0.141*"number"'),
 (1,
  '-0.587*"image" + -0.356*"network" + -0.314*"layer" + -0.173*"neural" + -0.157*"input" + -0.155*"training" + -0.125*"output"'),
 (2,
  '-0.390*"value" + -0.247*"feature" + -0.226*"variable" + -0.184*"distribution" + 0.180*"image" + 0.156*"file" + 0.155*"code"'),
 (3,
  '0.704*"word" + -0.332*"image" + 0.212*"vector" + 0.163*"text" + 0.133*"sentence" + -0.118*"file" + -0.115*"code"'),
 (4,
  '-0.326*"word" + -0.243*"code" + 0.229*"network" + -0.218*"file" + -0.187*"python" + -0.165*"column" + -0.158*"image"'),
 (5,
  '0.678*"feature" + -0.178*"value" + 0.177*"model" + -0.175*"network" + 0.160*"machine" + -0.151*"distribution" + -0.149*"state"'),
 (6,
  '0.538*"image" + -0.279*"network" + -0.231*"feature" + -0.221*"layer" + 0.216*"distribution" + 0.193*"probability" + 0.186*"word"'),
 (7,
  '0.356*"value" + 0.252*"state" + -0.241*"distribution" + -0.230*"test

### Result using LSA model + TF-IDF

In [21]:
articles_tfidf_lsamodel = LsiModel(articles_tfidf_corpus, num_topics=10, id2word = articles_dictionary)

In [22]:
articles_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.141*"image" + 0.114*"network" + 0.111*"layer" + 0.105*"feature" + 0.092*"training" + 0.088*"word" + 0.083*"dataset"'),
 (1,
  '-0.333*"layer" + -0.298*"image" + -0.246*"network" + -0.173*"neural" + 0.129*"scientist" + -0.128*"weight" + -0.126*"loss"'),
 (2,
  '0.299*"image" + -0.224*"regression" + -0.196*"variable" + 0.161*"layer" + -0.154*"distribution" + -0.149*"tree" + -0.137*"probability"'),
 (3,
  '-0.237*"column" + -0.218*"file" + -0.171*"panda" + -0.153*"dataframe" + -0.125*"python" + 0.120*"network" + 0.119*"scientist"'),
 (4,
  '-0.538*"word" + -0.253*"sentence" + -0.245*"text" + -0.229*"vector" + 0.220*"image" + -0.182*"document" + -0.153*"sentiment"'),
 (5,
  '0.392*"image" + -0.226*"agent" + 0.207*"cluster" + -0.191*"reward" + -0.164*"gradient" + -0.151*"state" + -0.150*"action"'),
 (6,
  '0.346*"tree" + 0.232*"node" + -0.223*"image" + -0.168*"distribution" + -0.144*"column" + -0.140*"plot" + 0.133*"regression"'),
 (7,
  '-0.532*"cluster" + -0.271*"agent" + -0.237

## LDA Model

### Result using LDA model + Bag of words

In [27]:
articles_bow_ldamodel = gensim.models.LdaMulticore(articles_bow_corpus, num_topics=10, id2word=articles_dictionary)

In [28]:
articles_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.007*"image" + 0.006*"time" + 0.006*"code" + 0.005*"example" + 0.005*"article" + 0.005*"need" + 0.005*"training"'),
 (1,
  '0.005*"need" + 0.005*"time" + 0.005*"code" + 0.004*"example" + 0.004*"feature" + 0.003*"value" + 0.003*"people"'),
 (2,
  '0.006*"value" + 0.006*"code" + 0.005*"time" + 0.005*"example" + 0.005*"python" + 0.005*"let" + 0.004*"need"'),
 (3,
  '0.007*"value" + 0.006*"time" + 0.005*"number" + 0.005*"need" + 0.004*"let" + 0.004*"look" + 0.003*"want"'),
 (4,
  '0.007*"need" + 0.006*"machine" + 0.006*"time" + 0.005*"value" + 0.005*"word" + 0.004*"example" + 0.004*"model"'),
 (5,
  '0.010*"image" + 0.007*"network" + 0.007*"time" + 0.006*"example" + 0.005*"layer" + 0.005*"need" + 0.004*"different"'),
 (6,
  '0.006*"time" + 0.006*"value" + 0.005*"code" + 0.004*"need" + 0.004*"image" + 0.004*"number" + 0.004*"python"'),
 (7,
  '0.010*"feature" + 0.007*"value" + 0.006*"number" + 0.005*"time" + 0.005*"let" + 0.005*"variable" + 0.004*"point"'),
 (8,
  '0.009*"value" + 

In [29]:
pyLDAvis.gensim_models.prepare(articles_bow_ldamodel, articles_bow_corpus, articles_dictionary)

### Result using LDA model + TF-IDF

In [30]:
articles_tfidf_ldamodel = gensim.models.LdaMulticore(articles_tfidf_corpus, num_topics=10, id2word=articles_dictionary)

In [31]:
articles_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.002*"image" + 0.001*"layer" + 0.001*"distribution" + 0.001*"network" + 0.001*"dataset" + 0.001*"feature" + 0.001*"plot"'),
 (1,
  '0.001*"column" + 0.001*"feature" + 0.001*"variable" + 0.001*"python" + 0.001*"file" + 0.001*"plot" + 0.001*"value"'),
 (2,
  '0.001*"image" + 0.001*"column" + 0.001*"class" + 0.001*"python" + 0.001*"object" + 0.001*"dataset" + 0.001*"value"'),
 (3,
  '0.001*"file" + 0.001*"column" + 0.001*"database" + 0.001*"query" + 0.001*"python" + 0.001*"table" + 0.001*"spark"'),
 (4,
  '0.001*"file" + 0.001*"column" + 0.001*"image" + 0.001*"word" + 0.001*"text" + 0.001*"python" + 0.001*"code"'),
 (5,
  '0.001*"file" + 0.001*"column" + 0.001*"image" + 0.001*"python" + 0.001*"code" + 0.001*"variable" + 0.001*"value"'),
 (6,
  '0.002*"image" + 0.002*"network" + 0.002*"word" + 0.001*"feature" + 0.001*"layer" + 0.001*"algorithm" + 0.001*"training"'),
 (7,
  '0.001*"image" + 0.001*"network" + 0.001*"layer" + 0.001*"feature" + 0.001*"value" + 0.001*"model" + 0.001*"d

In [32]:
pyLDAvis.gensim_models.prepare(articles_tfidf_ldamodel, articles_tfidf_corpus, articles_dictionary)