## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned jobs listings file and save in dataframe

In [2]:
cleaned_jobs_listings_df = pd.read_csv("cleaned_jobs_listings.csv")

### Check columns info in jobs listings dataframe

In [3]:
cleaned_jobs_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3909 entries, 0 to 3908
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3909 non-null   object 
 1   salary_estimate          3909 non-null   object 
 2   job_description          3909 non-null   object 
 3   rating                   3909 non-null   float64
 4   company_name             3909 non-null   object 
 5   location                 3909 non-null   object 
 6   headquarters             3909 non-null   object 
 7   size                     3909 non-null   object 
 8   founded                  3909 non-null   int64  
 9   type_of_ownership        3909 non-null   object 
 10  industry                 3909 non-null   object 
 11  sector                   3909 non-null   object 
 12  revenue                  3909 non-null   object 
 13  competitors              3909 non-null   object 
 14  easy_apply              

## Data Preprocessing

### Define stopwords to append to default stopwords

In [4]:
# append special stopwords in articles to default and common stopwords
new_stopwords = STOPWORDS.union(set(['job', 'skill', 'experience', 'team', 'data', 'use', 'like', 'business', 'work', 'ability', 'let', 'example'
                'need', 'new', 'user', 'opportunity', 'candidate', 'provide', 'company', 'one', 'used', 'need', 
                'see', 'make', 'follow', 'going', 'will', 'want', 'well', 'find', 'give', 'change', 'look', 'first', 'using',
                'know', 'model', 'science', 'think', 'year', 'years', 'looking']))

### Method to preprocess data for articles

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
preprocessed_jobs_listings = cleaned_jobs_listings_df['cleaned_job_description'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
jobs_listings_dictionary = corpora.Dictionary(preprocessed_jobs_listings)

In [8]:
# Convert into the bag-of-words (BoW) format
jobs_listings_bow_corpus = [jobs_listings_dictionary.doc2bow(text) for text in preprocessed_jobs_listings]

In [9]:
# Convert into TF-IDF format
jobs_listings_tfidf = models.TfidfModel(jobs_listings_bow_corpus)
jobs_listings_tfidf_corpus = jobs_listings_tfidf[jobs_listings_bow_corpus]

## LSA Model

### Result using LSA model + Bag of words

In [10]:
jobs_listings_bow_lsamodel = LsiModel(jobs_listings_bow_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [11]:
jobs_listings_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.175*"development" + 0.160*"analysis" + 0.147*"skill" + 0.145*"technology" + 0.138*"information" + 0.137*"project" + 0.137*"solution"'),
 (1,
  '-0.248*"learning" + -0.221*"analytics" + -0.200*"machine" + 0.196*"information" + -0.162*"product" + 0.161*"laboratory" + 0.149*"research"'),
 (2,
  '-0.286*"quantum" + -0.202*"world" + -0.163*"employee" + 0.156*"analysis" + 0.152*"analytics" + 0.144*"project" + 0.139*"equivalent"'),
 (3,
  '-0.349*"equivalent" + -0.231*"education" + -0.221*"learning" + -0.190*"machine" + -0.189*"band" + -0.166*"level" + -0.148*"completed"'),
 (4,
  '0.400*"research" + 0.313*"learning" + 0.247*"machine" + 0.208*"scientist" + -0.166*"information" + 0.158*"laboratory" + -0.152*"management"'),
 (5,
  '-0.262*"development" + 0.218*"information" + 0.178*"learning" + 0.176*"employee" + -0.152*"quantum" + -0.148*"project" + 0.147*"university"'),
 (6,
  '-0.353*"research" + 0.268*"laboratory" + -0.220*"development" + 0.198*"medical" + -0.187*"project" + 0.187

### Result using LSA model + TF-IDF

In [12]:
jobs_listings_tfidf_lsamodel = LsiModel(jobs_listings_tfidf_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [13]:
jobs_listings_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.095*"learning" + 0.094*"analytics" + 0.092*"machine" + 0.083*"product" + 0.081*"client" + 0.080*"customer" + 0.079*"model"'),
 (1,
  '-0.614*"quantum" + -0.191*"qiskit" + -0.178*"progressive" + -0.111*"world" + -0.101*"numerics" + -0.101*"quantumenabled" + -0.098*"reinventing"'),
 (2,
  '0.284*"laboratory" + 0.187*"clinical" + 0.151*"cell" + -0.132*"learning" + -0.131*"machine" + 0.112*"assay" + 0.112*"scientific"'),
 (3,
  '-0.180*"locate" + -0.180*"analyze" + -0.176*"printout" + -0.171*"databasesdata" + -0.158*"qualityacquire" + -0.158*"setsfilter" + -0.157*"systemsidentify"'),
 (4,
  '0.158*"cell" + -0.148*"university" + -0.133*"security" + 0.132*"clinical" + -0.119*"clery" + 0.112*"learning" + 0.112*"machine"'),
 (5,
  '-0.727*"tutor" + -0.488*"tutoring" + -0.176*"student" + -0.131*"choose" + -0.123*"session" + -0.085*"schedule" + -0.081*"tutoringonline"'),
 (6,
  '0.143*"pipeline" + -0.132*"university" + 0.127*"cloud" + 0.127*"tutor" + 0.126*"laboratory" + 0.125*"cell" +

## LDA Model

### Result using LDA model + Bag of words

In [14]:
jobs_listings_bow_ldamodel = gensim.models.LdaMulticore(jobs_listings_bow_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [15]:
jobs_listings_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.008*"product" + 0.007*"analysis" + 0.006*"including" + 0.006*"project" + 0.005*"development" + 0.005*"service" + 0.005*"customer"'),
 (1,
  '0.006*"product" + 0.006*"development" + 0.006*"technology" + 0.005*"solution" + 0.005*"analytics" + 0.005*"support" + 0.005*"learning"'),
 (2,
  '0.006*"analysis" + 0.006*"learning" + 0.006*"analytics" + 0.005*"development" + 0.005*"skill" + 0.005*"technology" + 0.005*"design"'),
 (3,
  '0.007*"process" + 0.005*"analysis" + 0.005*"product" + 0.005*"service" + 0.005*"development" + 0.004*"skill" + 0.004*"research"'),
 (4,
  '0.005*"management" + 0.005*"technology" + 0.005*"knowledge" + 0.005*"skill" + 0.004*"development" + 0.004*"information" + 0.004*"service"'),
 (5,
  '0.007*"development" + 0.007*"analytics" + 0.006*"skill" + 0.006*"analysis" + 0.005*"product" + 0.005*"management" + 0.005*"scientist"'),
 (6,
  '0.006*"skill" + 0.005*"development" + 0.005*"analysis" + 0.005*"working" + 0.004*"project" + 0.004*"information" + 0.004*"resea

In [16]:
pyLDAvis.gensim_models.prepare(jobs_listings_bow_ldamodel, jobs_listings_bow_corpus, jobs_listings_dictionary)

### Result using LDA model + TF-IDF

In [17]:
jobs_listings_tfidf_ldamodel = gensim.models.LdaMulticore(jobs_listings_tfidf_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [18]:
jobs_listings_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"product" + 0.001*"learning" + 0.001*"analytics" + 0.001*"client" + 0.001*"machine" + 0.001*"solution" + 0.001*"clinical"'),
 (1,
  '0.001*"product" + 0.001*"learning" + 0.001*"machine" + 0.001*"analytics" + 0.001*"research" + 0.001*"clinical" + 0.001*"engineer"'),
 (2,
  '0.001*"learning" + 0.001*"machine" + 0.001*"statistical" + 0.001*"analytics" + 0.001*"product" + 0.001*"clinical" + 0.001*"database"'),
 (3,
  '0.001*"client" + 0.001*"learning" + 0.001*"analytics" + 0.001*"machine" + 0.001*"research" + 0.001*"model" + 0.001*"management"'),
 (4,
  '0.001*"analytics" + 0.001*"analyst" + 0.001*"product" + 0.001*"bull" + 0.001*"statistical" + 0.001*"client" + 0.001*"database"'),
 (5,
  '0.001*"learning" + 0.001*"machine" + 0.001*"analytics" + 0.001*"model" + 0.001*"engineer" + 0.001*"product" + 0.001*"process"'),
 (6,
  '0.001*"learning" + 0.001*"machine" + 0.001*"product" + 0.001*"customer" + 0.001*"analytics" + 0.001*"client" + 0.001*"research"'),
 (7,
  '0.001*"machine"

In [19]:
dominant_topics_df = pd.DataFrame()
for i, row in enumerate(jobs_listings_tfidf_ldamodel[jobs_listings_tfidf_corpus]):
   row = sorted(row, key=lambda x: (x[1]), reverse=True)
   for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0: # => dominant topic
            wp = jobs_listings_tfidf_ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            dominant_topics_df = dominant_topics_df.append(
                pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True
            )
        else:
          break
dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

In [20]:
dominant_topics_df["Text"] = cleaned_jobs_listings_df['cleaned_job_description']
dominant_topics_df

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,6.0,0.9251,"learning, machine, product, customer, analytic...",about at were on a mission to make booking tr...
1,6.0,0.9192,"learning, machine, product, customer, analytic...",at noom we use scientifically proven methods t...
2,6.0,0.9219,"learning, machine, product, customer, analytic...",decodemhttpswwwdecodemcomdata science manager ...
3,8.0,0.9115,"trading, research, model, statistical, analyti...",seeks a dynamic and driven midlevel data anal...
4,6.0,0.9185,"learning, machine, product, customer, analytic...",director data science 200537descriptionedelma...
...,...,...,...,...
3904,0.0,0.9097,"product, learning, analytics, client, machine,...",about us is a digital transformation consultin...
3905,2.0,0.8969,"learning, machine, statistical, analytics, pro...",job descriptioninterpret data analyze results ...
3906,7.0,0.9344,"machine, learning, analytics, product, researc...",job descriptionthe security analytics data eng...
3907,7.0,0.6572,"machine, learning, analytics, product, researc...",the security analytics data engineer will inte...


In [25]:
dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
dominant_topics_df.head(50)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
3569,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
2154,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
3233,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
113,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
2525,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
1970,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
361,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
746,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
1079,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...
1628,4.0,0.9564,"analytics, analyst, product, bull, statistical...",the us department of the treasury has a distin...


In [21]:
pyLDAvis.gensim_models.prepare(jobs_listings_tfidf_ldamodel, jobs_listings_bow_corpus, jobs_listings_dictionary)