## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import LsiModel

## Read cleaned data files

In [2]:
cleaned_articles_df = pd.read_csv("cleaned_articles.csv")

In [3]:
cleaned_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42322 entries, 0 to 42321
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42322 non-null  object
 1   title            42322 non-null  object
 2   author           42322 non-null  object
 3   url              42322 non-null  object
 4   claps            42322 non-null  int64 
 5   responses        42322 non-null  int64 
 6   reading_time     42322 non-null  int64 
 7   paid             42322 non-null  int64 
 8   content          42322 non-null  object
 9   cleaned_content  42232 non-null  object
 10  cleaned_author   42322 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.6+ MB


In [4]:
cleaned_jobs_listings_df = pd.read_csv("cleaned_jobs_listings.csv")

In [5]:
cleaned_jobs_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3909 entries, 0 to 3908
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3909 non-null   object 
 1   salary_estimate          3909 non-null   object 
 2   job_description          3909 non-null   object 
 3   rating                   3909 non-null   float64
 4   company_name             3909 non-null   object 
 5   location                 3909 non-null   object 
 6   headquarters             3909 non-null   object 
 7   size                     3909 non-null   object 
 8   founded                  3909 non-null   int64  
 9   type_of_ownership        3909 non-null   object 
 10  industry                 3909 non-null   object 
 11  sector                   3909 non-null   object 
 12  revenue                  3909 non-null   object 
 13  competitors              3909 non-null   object 
 14  easy_apply              

## Data Preprocessing

In [6]:
new_stopwords = ['job', 'skill', 'experience', 'team', 'data', 'use', 'like', 'business', 'work', 'ability', 'let', 'example'
                'need', 'new', 'user', 'opportunity', 'candidate', 'provide', 'company']

In [7]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in new_stopwords:
            result.append(lemmatize(token))
    return result

In [8]:
preprocessed_articles = cleaned_articles_df['cleaned_content'].fillna('').astype(str).map(preprocess)

In [9]:
articles_dictionary = corpora.Dictionary(preprocessed_articles)

In [10]:
articles_corpus = [articles_dictionary.doc2bow(text) for text in preprocessed_articles]

In [11]:
preprocessed_jobs_listings = cleaned_jobs_listings_df['cleaned_job_description'].fillna('').astype(str).map(preprocess)

In [12]:
jobs_listings_dictionary = corpora.Dictionary(preprocessed_jobs_listings)

In [13]:
jobs_listings_corpus = [jobs_listings_dictionary.doc2bow(text) for text in preprocessed_jobs_listings]

## LSA Model

In [14]:
articles_lsamodel = LsiModel(articles_corpus, num_topics=10, id2word = articles_dictionary)

In [15]:
articles_lsamodel.print_topics(num_words=5)

[(0,
  '0.456*"model" + 0.195*"learning" + 0.178*"value" + 0.159*"function" + 0.155*"time"'),
 (1,
  '-0.723*"model" + 0.143*"function" + 0.131*"time" + 0.124*"need" + -0.121*"training"'),
 (2,
  '-0.496*"image" + -0.342*"network" + -0.303*"layer" + 0.279*"model" + -0.277*"function"'),
 (3,
  '-0.359*"value" + 0.340*"learning" + -0.290*"function" + 0.273*"image" + -0.230*"variable"'),
 (4,
  '0.331*"image" + -0.300*"learning" + 0.267*"file" + 0.261*"code" + -0.208*"network"'),
 (5,
  '0.769*"word" + 0.223*"vector" + 0.198*"text" + -0.162*"image" + -0.157*"learning"'),
 (6,
  '-0.479*"function" + 0.387*"image" + 0.375*"feature" + -0.224*"learning" + -0.170*"model"'),
 (7,
  '-0.628*"feature" + -0.266*"learning" + 0.234*"image" + 0.198*"distribution" + 0.182*"probability"'),
 (8,
  '-0.395*"learning" + -0.343*"image" + 0.288*"network" + 0.272*"layer" + 0.228*"feature"'),
 (9,
  '0.494*"ai" + 0.338*"function" + -0.221*"network" + -0.214*"time" + 0.166*"think"')]

In [16]:
jobs_listings_lsamodel = LsiModel(jobs_listings_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [17]:
jobs_listings_lsamodel.print_topics(num_words=5)

[(0,
  '0.192*"year" + 0.174*"science" + 0.165*"development" + 0.152*"analysis" + 0.139*"skill"'),
 (1,
  '-0.245*"learning" + -0.203*"analytics" + -0.198*"machine" + 0.194*"information" + -0.173*"model"'),
 (2,
  '0.321*"equivalent" + 0.227*"education" + 0.209*"g" + 0.197*"science" + 0.176*"level"'),
 (3,
  '0.243*"quantum" + 0.199*"learning" + 0.196*"machine" + -0.172*"analysis" + 0.170*"world"'),
 (4,
  '0.342*"research" + -0.258*"year" + 0.250*"learning" + 0.187*"machine" + 0.183*"science"'),
 (5,
  '-0.244*"development" + 0.237*"information" + 0.163*"employee" + 0.163*"application" + -0.159*"research"'),
 (6,
  '-0.308*"research" + 0.253*"laboratory" + -0.188*"project" + 0.188*"medical" + -0.184*"development"'),
 (7,
  '0.265*"analytics" + -0.246*"learning" + 0.226*"analysis" + -0.211*"development" + -0.180*"laboratory"'),
 (8,
  '0.475*"product" + -0.207*"laboratory" + 0.194*"development" + -0.187*"service" + 0.152*"engineering"'),
 (9,
  '-0.388*"research" + 0.316*"product" + -0

## LDA Model

In [18]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [19]:
articles_ldamodel = gensim.models.LdaMulticore(articles_corpus, num_topics=10, id2word=articles_dictionary)

  and should_run_async(code)


In [20]:
articles_ldamodel.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.010*"image" + 0.006*"model" + 0.004*"time" + 0.004*"ai" + 0.004*"value"'),
 (1,
  '0.008*"model" + 0.005*"time" + 0.005*"need" + 0.004*"way" + 0.004*"value"'),
 (2,
  '0.018*"model" + 0.007*"word" + 0.007*"value" + 0.006*"network" + 0.006*"time"'),
 (3,
  '0.014*"model" + 0.007*"learning" + 0.005*"time" + 0.004*"need" + 0.004*"result"'),
 (4,
  '0.014*"model" + 0.006*"learning" + 0.006*"function" + 0.005*"time" + 0.005*"number"'),
 (5,
  '0.009*"model" + 0.006*"learning" + 0.005*"function" + 0.005*"time" + 0.005*"code"'),
 (6,
  '0.013*"model" + 0.008*"learning" + 0.005*"time" + 0.005*"value" + 0.005*"machine"'),
 (7,
  '0.009*"model" + 0.008*"value" + 0.006*"need" + 0.006*"code" + 0.006*"function"'),
 (8,
  '0.008*"model" + 0.006*"value" + 0.006*"time" + 0.005*"feature" + 0.005*"code"'),
 (9,
  '0.008*"learning" + 0.008*"value" + 0.007*"function" + 0.006*"model" + 0.006*"time"')]

In [21]:
pyLDAvis.gensim_models.prepare(articles_ldamodel, articles_corpus, articles_dictionary)

  and should_run_async(code)


In [22]:
jobs_listings_ldamodel = gensim.models.LdaMulticore(jobs_listings_corpus, num_topics=10, id2word=jobs_listings_dictionary)

  and should_run_async(code)


In [23]:
jobs_listings_ldamodel.print_topics(num_words=5)

  and should_run_async(code)


[(0,
  '0.006*"year" + 0.005*"analysis" + 0.005*"science" + 0.005*"skill" + 0.005*"development"'),
 (1,
  '0.007*"year" + 0.006*"analysis" + 0.005*"development" + 0.005*"technology" + 0.004*"knowledge"'),
 (2,
  '0.007*"technology" + 0.006*"year" + 0.005*"client" + 0.005*"skill" + 0.005*"solution"'),
 (3,
  '0.007*"product" + 0.006*"development" + 0.006*"science" + 0.006*"year" + 0.005*"technical"'),
 (4,
  '0.008*"science" + 0.007*"year" + 0.007*"analysis" + 0.006*"skill" + 0.005*"development"'),
 (5,
  '0.006*"statistical" + 0.005*"learning" + 0.005*"research" + 0.005*"product" + 0.005*"science"'),
 (6,
  '0.006*"year" + 0.006*"analysis" + 0.005*"solution" + 0.005*"analytics" + 0.005*"skill"'),
 (7,
  '0.006*"service" + 0.005*"knowledge" + 0.005*"year" + 0.005*"technology" + 0.005*"development"'),
 (8,
  '0.009*"year" + 0.007*"development" + 0.006*"science" + 0.006*"process" + 0.005*"project"'),
 (9,
  '0.006*"science" + 0.006*"year" + 0.005*"product" + 0.005*"model" + 0.005*"analyti

In [24]:
pyLDAvis.gensim_models.prepare(jobs_listings_ldamodel, jobs_listings_corpus, jobs_listings_dictionary)

  and should_run_async(code)
