## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned jobs listings file and save in dataframe

In [2]:
cleaned_jobs_listings_df = pd.read_csv("cleaned_jobs_listings.csv")

### Check columns info in jobs listings dataframe

In [3]:
cleaned_jobs_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3690 entries, 0 to 3689
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3690 non-null   object 
 1   salary_estimate          3690 non-null   object 
 2   job_description          3690 non-null   object 
 3   rating                   3690 non-null   float64
 4   company_name             3690 non-null   object 
 5   location                 3690 non-null   object 
 6   headquarters             3690 non-null   object 
 7   size                     3690 non-null   object 
 8   founded                  3690 non-null   int64  
 9   type_of_ownership        3690 non-null   object 
 10  industry                 3690 non-null   object 
 11  sector                   3690 non-null   object 
 12  revenue                  3690 non-null   object 
 13  competitors              3690 non-null   object 
 14  easy_apply              

## Data Preprocessing

### Define stopwords to append to default stopwords

In [4]:
# append special stopwords in jobs listings to default and common stopwords
new_stopwords = STOPWORDS.union(set(['job', 'skill', 'experience', 'team', 'data', 'use', 'like', 'business', 
                                'work', 'ability', 'let', 'need', 'new', 'user', 'opportunity', 
                                'candidate', 'provide', 'company', 'one', 'used', 'need', 'see', 'make', 
                                'follow', 'going', 'will', 'want', 'well', 'find', 'give', 'change', 'look', 
                                'first', 'using','know', 'science', 'think', 'year', 'years', 'looking',
                                'including', 'working', 'scientist', 'employee', 'skills', 'scientist',
                                'knowledge', 'position', 'technology', 'computer', 'employment', 'status', 
                                'project', 'information', 'analysis', 'learning', 'gender', 'disability']))

### Method to preprocess data for jobs listings

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the jobs listings
preprocessed_jobs_listings = cleaned_jobs_listings_df['cleaned_job_description'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
jobs_listings_dictionary = corpora.Dictionary(preprocessed_jobs_listings)

In [8]:
# Convert into the bag-of-words (BoW) format
jobs_listings_bow_corpus = [jobs_listings_dictionary.doc2bow(text) for text in preprocessed_jobs_listings]

In [9]:
# Convert into TF-IDF format
jobs_listings_tfidf = models.TfidfModel(jobs_listings_bow_corpus)
jobs_listings_tfidf_corpus = jobs_listings_tfidf[jobs_listings_bow_corpus]

## Common method to find top 10 dominant topics in the jobs description

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Text"] = cleaned_jobs_listings_df['cleaned_job_description']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
jobs_listings_bow_lsamodel = LsiModel(jobs_listings_bow_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [12]:
jobs_listings_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.196*"development" + 0.163*"product" + 0.156*"analytics" + 0.151*"solution" + 0.139*"technical" + 0.137*"research" + 0.137*"support"'),
 (1,
  '0.271*"analytics" + -0.249*"research" + -0.220*"laboratory" + 0.167*"model" + 0.165*"product" + -0.165*"required" + 0.160*"machine"'),
 (2,
  '0.300*"product" + 0.263*"research" + -0.260*"application" + 0.173*"clinical" + -0.163*"system" + 0.155*"laboratory" + -0.154*"requirement"'),
 (3,
  '0.429*"research" + 0.285*"machine" + -0.227*"process" + -0.213*"management" + 0.168*"model" + -0.142*"development" + -0.133*"support"'),
 (4,
  '-0.463*"development" + 0.238*"analytics" + -0.206*"product" + 0.197*"laboratory" + 0.185*"medical" + -0.181*"cell" + -0.178*"process"'),
 (5,
  '0.451*"product" + -0.447*"research" + 0.165*"laboratory" + 0.162*"people" + -0.139*"statistical" + -0.139*"analytics" + -0.119*"model"'),
 (6,
  '0.260*"laboratory" + -0.256*"analytics" + -0.241*"product" + 0.232*"machine" + 0.171*"service" + 0.167*"solution" + -0

<font color = "blue">
    Topic 0: product and development analytics and technical skills <br/>
    Topic 1: Lab research using machine learning model <br/>
    Topic 2: Research in machine learning <br/>
    Topic 3: Build clinical solution for lab <br/>
    Topic 4: Product development and analysis for cells<br/>
    Topic 5: Product research <br/>
    Topic 6: Research lab medical services <br/>
    Topic 7: Using statistical machine learning model <br/>
    Topic 8: Research and development work in the cell process <br/>
    Topic 9: Build machine learning model for patient care
</font>

In [13]:
get_dominant_topics(jobs_listings_bow_lsamodel, jobs_listings_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
1889,0.0,30.5536,"development, product, analytics, solution, tec...",job descriptionjob title data analystsalary 18...
891,0.0,29.1084,"development, product, analytics, solution, tec...",at west monroe our people are our businesswe p...
1485,0.0,28.499,"development, product, analytics, solution, tec...",descriptionthe lead of cybersecurity data prot...
788,0.0,28.3594,"development, product, analytics, solution, tec...",address111 w monroe 115 s lasallejob family g...
591,0.0,27.1435,"development, product, analytics, solution, tec...",job descriptionoperation analytics manager tr...
97,0.0,25.8386,"development, product, analytics, solution, tec...",search by keywordsearch by locationclearsend m...
770,0.0,24.8855,"development, product, analytics, solution, tec...",address111 w monroe 115 s lasallejob family g...
838,0.0,24.8775,"development, product, analytics, solution, tec...",please make sure to read the job posting in it...
471,0.0,23.6962,"development, product, analytics, solution, tec...",position descriptionposition title senior data...
3290,0.0,23.6345,"development, product, analytics, solution, tec...",we are currently seeking an avp big data and m...


<font color = "blue">
    The most dominant topic is related to having product and development technical skills and analytic skills
</font>

### Result using LSA model + TF-IDF

In [14]:
jobs_listings_tfidf_lsamodel = LsiModel(jobs_listings_tfidf_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [15]:
jobs_listings_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.099*"analytics" + 0.095*"machine" + 0.090*"product" + 0.088*"model" + 0.081*"customer" + 0.080*"solution" + 0.080*"research"'),
 (1,
  '0.282*"laboratory" + 0.187*"clinical" + 0.171*"cell" + 0.125*"assay" + 0.120*"scientific" + 0.118*"biology" + -0.110*"machine"'),
 (2,
  '-0.189*"analyze" + -0.183*"locate" + -0.176*"printout" + -0.171*"databasesdata" + -0.157*"qualityacquire" + -0.157*"setsfilter" + -0.156*"disseminate"'),
 (3,
  '-0.490*"tutor" + -0.329*"tutoring" + -0.149*"university" + 0.135*"cell" + -0.128*"student" + -0.113*"clery" + -0.103*"security"'),
 (4,
  '-0.547*"tutor" + -0.367*"tutoring" + -0.127*"student" + 0.125*"university" + 0.104*"security" + 0.103*"clery" + -0.100*"choose"'),
 (5,
  '0.159*"pipeline" + 0.149*"cloud" + 0.135*"spark" + 0.128*"laboratory" + 0.125*"bull" + 0.115*"azure" + 0.111*"cell"'),
 (6,
  '-0.262*"cell" + -0.150*"biology" + -0.137*"assay" + -0.131*"machine" + 0.129*"laboratory" + 0.121*"hospital" + 0.115*"care"'),
 (7,
  '0.266*"laborat

In [16]:
get_dominant_topics(jobs_listings_tfidf_lsamodel, jobs_listings_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
1202,7.0,0.4891,"laboratory, machine, cell, hospital, patient, ...",work shift daywork week variesjob summarythe m...
1210,7.0,0.4886,"laboratory, machine, cell, hospital, patient, ...",work shift eveningwork week variesjob summaryt...
1201,7.0,0.4883,"laboratory, machine, cell, hospital, patient, ...",work shift nightwork week variesjob summarythe...
1197,7.0,0.488,"laboratory, machine, cell, hospital, patient, ...",work shift thirdwork week variesjob summary082...
1118,7.0,0.4878,"laboratory, machine, cell, hospital, patient, ...",work shift daywork week variesjob summarythe m...
1183,7.0,0.4841,"laboratory, machine, cell, hospital, patient, ...",work shift daywork week m fjob summary022019j...
1222,7.0,0.4712,"laboratory, machine, cell, hospital, patient, ...",work shift nightwork week variesjob summaryjob...
3210,4.0,0.4648,"tutor, tutoring, student, university, security...",job posting titleinformation systems engineeri...
1141,7.0,0.4636,"laboratory, machine, cell, hospital, patient, ...",work shift daywork week variesjob summarythe m...
3193,4.0,0.4615,"tutor, tutoring, student, university, security...",location pickle research campusjob posting tit...


## LDA Model

### Result using LDA model + Bag of words

In [17]:
jobs_listings_bow_ldamodel = gensim.models.LdaMulticore(jobs_listings_bow_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [18]:
jobs_listings_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.006*"development" + 0.005*"product" + 0.005*"research" + 0.004*"technical" + 0.004*"analytics" + 0.004*"degree" + 0.004*"related"'),
 (1,
  '0.005*"solution" + 0.005*"development" + 0.005*"analytics" + 0.005*"model" + 0.004*"support" + 0.004*"management" + 0.004*"product"'),
 (2,
  '0.007*"development" + 0.006*"product" + 0.005*"tool" + 0.005*"design" + 0.005*"solution" + 0.004*"management" + 0.004*"customer"'),
 (3,
  '0.006*"product" + 0.006*"technical" + 0.006*"development" + 0.005*"support" + 0.005*"process" + 0.004*"solution" + 0.004*"tool"'),
 (4,
  '0.007*"research" + 0.005*"development" + 0.005*"machine" + 0.005*"analytics" + 0.005*"product" + 0.005*"model" + 0.005*"technical"'),
 (5,
  '0.006*"development" + 0.005*"analytics" + 0.005*"product" + 0.005*"research" + 0.004*"management" + 0.004*"solution" + 0.004*"support"'),
 (6,
  '0.007*"development" + 0.005*"product" + 0.004*"machine" + 0.004*"service" + 0.004*"engineering" + 0.004*"support" + 0.004*"customer"'),
 (7

In [19]:
pyLDAvis.gensim_models.prepare(jobs_listings_bow_ldamodel, jobs_listings_bow_corpus, jobs_listings_dictionary)

In [20]:
get_dominant_topics(jobs_listings_bow_ldamodel, jobs_listings_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
3321,7.0,0.9986,"product, development, engineering, research, s...",job posting titlesystem test engineering scien...
3206,7.0,0.9985,"product, development, engineering, research, s...",location pickle research campusjob posting tit...
3276,7.0,0.9985,"product, development, engineering, research, s...",location pickle research campusjob posting tit...
3340,7.0,0.9985,"product, development, engineering, research, s...",job posting titleelectronics engineering scien...
479,5.0,0.9985,"development, analytics, product, research, man...",positionprogram informationdepartment of healt...
3262,7.0,0.9983,"product, development, engineering, research, s...",location pickle research campusjob posting tit...
3352,7.0,0.9983,"product, development, engineering, research, s...",job posting titlesoftware design engineering s...
3366,7.0,0.9983,"product, development, engineering, research, s...",job posting titlesignal processing and automat...
438,4.0,0.9982,"research, development, machine, analytics, pro...",requisition id 46940all locations el segundo c...
3210,7.0,0.9982,"product, development, engineering, research, s...",job posting titleinformation systems engineeri...


### Result using LDA model + TF-IDF

In [21]:
jobs_listings_tfidf_ldamodel = gensim.models.LdaMulticore(jobs_listings_tfidf_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [22]:
jobs_listings_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"analytics" + 0.001*"product" + 0.001*"client" + 0.001*"machine" + 0.001*"database" + 0.001*"technical" + 0.001*"model"'),
 (1,
  '0.001*"analytics" + 0.001*"machine" + 0.001*"solution" + 0.001*"model" + 0.001*"statistical" + 0.001*"product" + 0.001*"research"'),
 (2,
  '0.001*"customer" + 0.001*"machine" + 0.001*"analytics" + 0.001*"model" + 0.001*"reporting" + 0.001*"product" + 0.001*"report"'),
 (3,
  '0.001*"research" + 0.001*"machine" + 0.001*"tutor" + 0.001*"product" + 0.001*"customer" + 0.001*"model" + 0.001*"analytics"'),
 (4,
  '0.001*"laboratory" + 0.001*"statistical" + 0.001*"clinical" + 0.001*"client" + 0.001*"database" + 0.001*"product" + 0.001*"machine"'),
 (5,
  '0.001*"analytics" + 0.001*"machine" + 0.001*"model" + 0.001*"solution" + 0.001*"client" + 0.001*"research" + 0.001*"product"'),
 (6,
  '0.001*"model" + 0.001*"machine" + 0.001*"product" + 0.001*"solution" + 0.001*"analytics" + 0.001*"engineer" + 0.001*"research"'),
 (7,
  '0.001*"analytics" + 0.001

In [23]:
pyLDAvis.gensim_models.prepare(jobs_listings_tfidf_ldamodel, jobs_listings_tfidf_corpus, jobs_listings_dictionary)

In [24]:
get_dominant_topics(jobs_listings_tfidf_ldamodel, jobs_listings_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
3359,0.0,0.9509,"analytics, product, client, machine, database,...",we are hiringdata engineer fulltime day aust...
3261,0.0,0.9498,"analytics, product, client, machine, database,...",why join technologies leverages technology to...
3325,0.0,0.9498,"analytics, product, client, machine, database,...",we are hiringdata engineer fulltime day aust...
874,8.0,0.9494,"machine, analytics, research, product, model, ...",please make sure to read the job posting in it...
3063,8.0,0.9482,"machine, analytics, research, product, model, ...",about usat we have our eyes set on an ambitio...
3276,6.0,0.9473,"model, machine, product, solution, analytics, ...",location pickle research campusjob posting tit...
3287,6.0,0.9472,"model, machine, product, solution, analytics, ...",location pickle research campusjob posting tit...
2803,0.0,0.9471,"analytics, product, client, machine, database,...",who we are is a global technology company with...
1050,5.0,0.9469,"analytics, machine, model, solution, client, r...",who is is a leading transformative it authori...
1141,9.0,0.9463,"laboratory, product, machine, research, analyt...",work shift daywork week variesjob summarythe m...
