## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models import LsiModel

## Read cleaned data files

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42322 entries, 0 to 42321
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42322 non-null  object
 1   title            42322 non-null  object
 2   author           42322 non-null  object
 3   url              42322 non-null  object
 4   claps            42322 non-null  int64 
 5   responses        42322 non-null  int64 
 6   reading_time     42322 non-null  int64 
 7   paid             42322 non-null  int64 
 8   content          42322 non-null  object
 9   cleaned_content  42232 non-null  object
 10  cleaned_author   42322 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.6+ MB


In [4]:
cleaned_jobs_listings_df = pd.read_csv("cleaned_jobs_listings.csv")

In [5]:
cleaned_jobs_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3909 entries, 0 to 3908
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3909 non-null   object 
 1   salary_estimate          3909 non-null   object 
 2   job_description          3909 non-null   object 
 3   rating                   3909 non-null   float64
 4   company_name             3909 non-null   object 
 5   location                 3909 non-null   object 
 6   headquarters             3909 non-null   object 
 7   size                     3909 non-null   object 
 8   founded                  3909 non-null   int64  
 9   type_of_ownership        3909 non-null   object 
 10  industry                 3909 non-null   object 
 11  sector                   3909 non-null   object 
 12  revenue                  3909 non-null   object 
 13  competitors              3909 non-null   object 
 14  easy_apply              

## Data Preprocessing

In [10]:
new_stopwords = ['job', 'skill', 'experience', 'team', 'data', 'use', 'like', 'business', 'work', 'ability', 'let', 'example'
                'need', 'new', 'user', 'opportunity', 'candidate', 'provide', 'company']

In [11]:
new_stopwords = STOPWORDS.union(set(new_stopwords))

In [12]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [13]:
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [14]:
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [15]:
articles_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [16]:
preprocessed_jobs_listings = cleaned_jobs_listings_df['cleaned_job_description'].fillna('').astype(str).map(preprocess)

In [17]:
jobs_listings_dictionary = corpora.Dictionary(preprocessed_jobs_listings)

In [18]:
jobs_listings_corpus = [jobs_listings_dictionary.doc2bow(text) for text in preprocessed_jobs_listings]

## LSA Model

In [19]:
articles_lsamodel = LsiModel(articles_corpus, num_topics=10, id2word = articles_dictionary)

In [20]:
articles_lsamodel.print_topics(num_words=5)

[(0,
  '0.470*"model" + 0.200*"learning" + 0.182*"value" + 0.162*"function" + 0.157*"time"'),
 (1,
  '-0.732*"model" + 0.180*"function" + 0.132*"time" + 0.124*"need" + 0.123*"value"'),
 (2,
  '-0.556*"image" + -0.374*"network" + -0.313*"layer" + 0.251*"model" + -0.187*"function"'),
 (3,
  '-0.368*"value" + -0.353*"function" + 0.349*"learning" + -0.214*"variable" + 0.211*"machine"'),
 (4,
  '0.390*"image" + -0.346*"learning" + 0.258*"file" + 0.257*"code" + -0.223*"network"'),
 (5,
  '0.768*"word" + 0.220*"vector" + -0.201*"learning" + 0.200*"text" + 0.143*"sentence"'),
 (6,
  '-0.491*"function" + 0.394*"feature" + 0.356*"image" + -0.190*"model" + -0.182*"learning"'),
 (7,
  '-0.660*"feature" + 0.250*"image" + 0.216*"distribution" + 0.203*"probability" + -0.188*"learning"'),
 (8,
  '-0.387*"learning" + -0.329*"image" + 0.316*"network" + 0.260*"layer" + 0.217*"time"'),
 (9,
  '-0.331*"state" + 0.261*"function" + -0.241*"action" + 0.217*"think" + 0.179*"science"')]

In [21]:
jobs_listings_lsamodel = LsiModel(jobs_listings_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [22]:
jobs_listings_lsamodel.print_topics(num_words=5)

[(0,
  '0.193*"year" + 0.176*"science" + 0.167*"development" + 0.153*"analysis" + 0.140*"skill"'),
 (1,
  '0.261*"learning" + 0.211*"machine" + 0.211*"analytics" + -0.202*"information" + 0.180*"model"'),
 (2,
  '-0.260*"quantum" + 0.213*"equivalent" + -0.187*"world" + 0.169*"education" + -0.160*"employee"'),
 (3,
  '0.289*"equivalent" + 0.255*"science" + 0.214*"learning" + 0.197*"education" + 0.195*"machine"'),
 (4,
  '-0.361*"research" + 0.277*"year" + -0.245*"learning" + -0.188*"machine" + 0.171*"technology"'),
 (5,
  '-0.251*"information" + 0.247*"development" + -0.171*"application" + -0.171*"employee" + -0.160*"learning"'),
 (6,
  '-0.327*"research" + 0.280*"laboratory" + -0.219*"development" + 0.205*"medical" + -0.197*"project"'),
 (7,
  '0.266*"analytics" + -0.250*"development" + -0.242*"learning" + 0.233*"analysis" + 0.177*"quantum"'),
 (8,
  '0.485*"product" + -0.226*"laboratory" + -0.164*"learning" + 0.152*"people" + -0.147*"service"'),
 (9,
  '0.439*"research" + -0.329*"produ

## LDA Model

In [23]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [24]:
articles_ldamodel = gensim.models.LdaMulticore(articles_corpus, num_topics=10, id2word=articles_dictionary)

  and should_run_async(code)


In [25]:
articles_ldamodel.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.016*"model" + 0.007*"function" + 0.007*"value" + 0.006*"code" + 0.005*"time"'),
 (1,
  '0.018*"model" + 0.011*"feature" + 0.007*"function" + 0.007*"value" + 0.006*"time"'),
 (2,
  '0.013*"model" + 0.011*"learning" + 0.005*"value" + 0.005*"time" + 0.005*"need"'),
 (3,
  '0.011*"model" + 0.007*"value" + 0.006*"time" + 0.006*"code" + 0.006*"need"'),
 (4,
  '0.008*"learning" + 0.007*"network" + 0.006*"model" + 0.005*"time" + 0.005*"state"'),
 (5,
  '0.008*"model" + 0.008*"learning" + 0.006*"machine" + 0.006*"time" + 0.005*"python"'),
 (6,
  '0.012*"model" + 0.006*"time" + 0.005*"learning" + 0.005*"need" + 0.005*"project"'),
 (7,
  '0.011*"model" + 0.006*"value" + 0.006*"need" + 0.005*"time" + 0.005*"let"'),
 (8,
  '0.014*"model" + 0.006*"image" + 0.005*"value" + 0.005*"learning" + 0.005*"function"'),
 (9,
  '0.010*"model" + 0.007*"image" + 0.006*"function" + 0.006*"value" + 0.006*"time"')]

In [26]:
pyLDAvis.gensim_models.prepare(articles_ldamodel, articles_corpus, articles_dictionary)

  and should_run_async(code)


In [27]:
jobs_listings_ldamodel = gensim.models.LdaMulticore(jobs_listings_corpus, num_topics=10, id2word=jobs_listings_dictionary)

  and should_run_async(code)


In [28]:
jobs_listings_ldamodel.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.007*"year" + 0.006*"science" + 0.006*"development" + 0.005*"solution" + 0.005*"analytics"'),
 (1,
  '0.006*"science" + 0.006*"year" + 0.006*"skill" + 0.005*"analysis" + 0.005*"development"'),
 (2,
  '0.008*"year" + 0.006*"science" + 0.006*"project" + 0.005*"technology" + 0.005*"product"'),
 (3,
  '0.008*"science" + 0.005*"technology" + 0.005*"learning" + 0.005*"research" + 0.005*"development"'),
 (4,
  '0.010*"year" + 0.006*"research" + 0.006*"skill" + 0.006*"analysis" + 0.005*"project"'),
 (5,
  '0.008*"development" + 0.006*"year" + 0.006*"analysis" + 0.006*"information" + 0.005*"research"'),
 (6,
  '0.007*"analysis" + 0.006*"science" + 0.006*"year" + 0.006*"skill" + 0.006*"technology"'),
 (7,
  '0.007*"year" + 0.005*"working" + 0.005*"science" + 0.005*"project" + 0.005*"process"'),
 (8,
  '0.006*"year" + 0.006*"science" + 0.005*"learning" + 0.005*"analysis" + 0.005*"analytics"'),
 (9,
  '0.006*"development" + 0.006*"technology" + 0.006*"learning" + 0.006*"year" + 0.006*"pro

In [29]:
pyLDAvis.gensim_models.prepare(jobs_listings_ldamodel, jobs_listings_corpus, jobs_listings_dictionary)

  and should_run_async(code)
