## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


## Read cleaned data files

### Read cleaned jobs listings file and save in dataframe

In [2]:
cleaned_jobs_listings_df = pd.read_csv("cleaned_jobs_listings.csv")

### Check columns info in jobs listings dataframe

In [3]:
cleaned_jobs_listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3690 entries, 0 to 3689
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3690 non-null   object 
 1   salary_estimate          3690 non-null   object 
 2   job_description          3690 non-null   object 
 3   rating                   3690 non-null   float64
 4   company_name             3690 non-null   object 
 5   location                 3690 non-null   object 
 6   headquarters             3690 non-null   object 
 7   size                     3690 non-null   object 
 8   founded                  3690 non-null   int64  
 9   type_of_ownership        3690 non-null   object 
 10  industry                 3690 non-null   object 
 11  sector                   3690 non-null   object 
 12  revenue                  3690 non-null   object 
 13  competitors              3690 non-null   object 
 14  easy_apply              

## Data Preprocessing

### Define stopwords to append to default stopwords

In [4]:
# append special stopwords in jobs listings to default and common stopwords
new_stopwords = STOPWORDS.union(set(['job', 'skill', 'experience', 'team', 'data', 'use', 'like', 'business', 
                                'work', 'ability', 'let', 'need', 'new', 'user', 'opportunity', 
                                'candidate', 'provide', 'company', 'one', 'used', 'need', 'see', 'make', 
                                'follow', 'going', 'will', 'want', 'well', 'find', 'give', 'change', 'look', 
                                'first', 'using','know', 'science', 'think', 'year', 'years', 'looking',
                                'including', 'working', 'scientist', 'employee', 'skills', 'scientist',
                                'knowledge', 'position', 'technology', 'computer', 'employment', 'status', 
                                'project', 'information', 'analysis', 'learning', 'gender', 'disability',
                                'cell', 'tutoring', 'clery', 'choose', 'process', 'bull']))

### Method to preprocess data for jobs listings

In [5]:
# lemmatize = words change to noun
# stemmed = words reduced to root form
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='n')

# Convert a document into a list of tokens.
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in new_stopwords and len(token) > 3:
            result.append(lemmatize(token))
    return result

In [6]:
#preprocess the jobs listings
preprocessed_jobs_listings = cleaned_jobs_listings_df['cleaned_job_description'].fillna('').astype(str).map(preprocess)

In [7]:
# implements the concept of a Dictionary – a mapping between words and their integer ids.
jobs_listings_dictionary = corpora.Dictionary(preprocessed_jobs_listings)

In [8]:
# Convert into the bag-of-words (BoW) format
jobs_listings_bow_corpus = [jobs_listings_dictionary.doc2bow(text) for text in preprocessed_jobs_listings]

In [9]:
# Convert into TF-IDF format
jobs_listings_tfidf = models.TfidfModel(jobs_listings_bow_corpus)
jobs_listings_tfidf_corpus = jobs_listings_tfidf[jobs_listings_bow_corpus]

## Common method to find top 10 dominant topics in the jobs description

In [10]:
def get_dominant_topics(model, corpus):
    dominant_topics_df = pd.DataFrame()
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topics_df = dominant_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    dominant_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    dominant_topics_df["Text"] = cleaned_jobs_listings_df['cleaned_job_description']
    dominant_topics_df = dominant_topics_df.sort_values(by=['Perc_Contribution'], ascending=False)
    return dominant_topics_df.head(10)

## LSA Model

### Result using LSA model + Bag of words

In [11]:
jobs_listings_bow_lsamodel = LsiModel(jobs_listings_bow_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [12]:
jobs_listings_bow_lsamodel.print_topics(num_words=7)

[(0,
  '0.196*"development" + 0.164*"product" + 0.158*"analytics" + 0.152*"solution" + 0.140*"technical" + 0.138*"research" + 0.137*"support"'),
 (1,
  '0.270*"analytics" + -0.253*"research" + -0.222*"laboratory" + 0.169*"product" + -0.167*"required" + 0.165*"model" + 0.158*"machine"'),
 (2,
  '-0.295*"research" + -0.294*"product" + 0.255*"application" + -0.174*"clinical" + 0.168*"system" + -0.162*"laboratory" + 0.160*"requirement"'),
 (3,
  '-0.414*"research" + -0.268*"machine" + 0.220*"management" + 0.179*"laboratory" + 0.142*"analytics" + 0.139*"quality" + 0.137*"reporting"'),
 (4,
  '-0.501*"development" + 0.248*"analytics" + -0.227*"product" + -0.197*"design" + 0.180*"model" + -0.164*"technical" + 0.139*"medical"'),
 (5,
  '-0.448*"research" + 0.447*"product" + 0.166*"laboratory" + 0.160*"people" + -0.146*"analytics" + -0.143*"statistical" + -0.121*"model"'),
 (6,
  '0.277*"product" + -0.270*"laboratory" + -0.262*"machine" + 0.231*"analytics" + -0.171*"model" + -0.170*"service" + 

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, 
    below are the top ten topics being deduced: <br/><br/>
    Topic 0: Product, development, analytics, research and technical skills to support stakeholders and other teams <br/>
    Topic 1: Lab research and analytics using machine learning model <br/>
    Topic 2: Research in machine learning for clinical application <br/>
    Topic 3: Build clinical solution using machine learning model for lab system <br/>
    Topic 4: Product development and analysis for cells <br/>
    Topic 5: Product research using statistical analysis and model <br/>
    Topic 6: Research lab medical service and solution and submit report <br/>
    Topic 7: Research statistical machine learning model <br/>
    Topic 8: Research and development work using machine learning model in the cell process <br/>
    Topic 9: Build machine learning model for patient care product
</font>

In [13]:
get_dominant_topics(jobs_listings_bow_lsamodel, jobs_listings_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
1889,0.0,30.3139,"development, product, analytics, solution, tec...",job descriptionjob title data analystsalary 18...
891,0.0,29.3325,"development, product, analytics, solution, tec...",at west monroe our people are our businesswe p...
788,0.0,28.1404,"development, product, analytics, solution, tec...",address111 w monroe 115 s lasallejob family g...
1485,0.0,27.8261,"development, product, analytics, solution, tec...",descriptionthe lead of cybersecurity data prot...
591,0.0,26.7302,"development, product, analytics, solution, tec...",job descriptionoperation analytics manager tr...
97,0.0,26.0457,"development, product, analytics, solution, tec...",search by keywordsearch by locationclearsend m...
838,0.0,24.7193,"development, product, analytics, solution, tec...",please make sure to read the job posting in it...
770,0.0,24.6402,"development, product, analytics, solution, tec...",address111 w monroe 115 s lasallejob family g...
471,0.0,23.3516,"development, product, analytics, solution, tec...",position descriptionposition title senior data...
3290,0.0,23.3496,"development, product, analytics, solution, tec...",we are currently seeking an avp big data and m...


<font color = "blue">
    The most dominant topic is Product, development, analytics, research and technical skills to support stakeholders and other teams. All the top 10 job listings belong to Topic 0.
    <br/><br/>
    I think using the LSA model + Bag of Words is does not produce good result. Except for Topic 0 and Topic 7, the rest of the topics look similar to each other even after many rounds of training. But the result is not entirely useless. Because most of the topics are related to cell and medical field, upon closer look I found that there are quite a number of jobs for bioinformatics data scientist which yield high minimum and maximum salary.    
<br/><br/>Base on previous EDA, this salary range is in the third quartile range and in high outlier range and the Biotech and Pharmaceutical industry is among the top 3 industry who hires the most Data Science candidates. So, this a very special Data Scientist role that can represents a huge opportunity to aspiring candidates.
</font>
<br/><br/>

### Result using LSA model + TF-IDF

In [14]:
jobs_listings_tfidf_lsamodel = LsiModel(jobs_listings_tfidf_corpus, num_topics=10, id2word = jobs_listings_dictionary)

In [15]:
jobs_listings_tfidf_lsamodel.print_topics(num_words=7)

[(0,
  '0.099*"analytics" + 0.095*"machine" + 0.091*"product" + 0.088*"model" + 0.081*"customer" + 0.080*"solution" + 0.080*"research"'),
 (1,
  '-0.287*"laboratory" + -0.190*"clinical" + -0.119*"scientific" + -0.117*"assay" + 0.111*"machine" + -0.110*"patient" + -0.109*"biology"'),
 (2,
  '-0.189*"analyze" + -0.183*"locate" + -0.176*"printout" + -0.171*"databasesdata" + -0.157*"qualityacquire" + -0.157*"setsfilter" + -0.156*"systemsidentify"'),
 (3,
  '0.616*"tutor" + 0.153*"student" + 0.136*"university" + 0.105*"session" + 0.097*"security" + -0.091*"clinical" + 0.091*"investigation"'),
 (4,
  '0.594*"tutor" + 0.142*"student" + -0.122*"university" + -0.110*"security" + 0.099*"session" + -0.096*"investigation" + -0.084*"application"'),
 (5,
  '-0.174*"laboratory" + -0.149*"pipeline" + -0.136*"cloud" + 0.134*"university" + -0.124*"spark" + -0.101*"azure" + -0.101*"kafka"'),
 (6,
  '-0.163*"machine" + -0.154*"biology" + -0.138*"assay" + 0.123*"reporting" + -0.123*"molecular" + 0.119*"ana

<font color = "blue">
    Base on the topic keywords extracted above plus manual checks conducted on the job listings description, 
    below are the top ten topics being deduced: <br/><br/>
    Topic 0: Research and analytics using ML model for customer <br/>
    Topic 1: Build ML model for clinical cell lab <br/>
    Topic 2: Analyze and manipulate data from database and reprort printouts<br/>
    Topic 3: Tutor in Data Science for students in university<br/>
    Topic 4: Data Science tutors can schedule own sessions for students<br/>
    Topic 5: Build pipeline using spark, azure and Kafka<br/>
    Topic 6: Research and analyze molecular biology<br/>
    Topic 7: Build machine learning model for patient care product<br/>
    Topic 8: Build predictive and statistical ML model for people<br/>
    Topic 9: Explore and provide insight using quantum approaches for insurance industry
</font>

In [16]:
get_dominant_topics(jobs_listings_tfidf_lsamodel, jobs_listings_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
3364,3.0,0.7122,"tutor, student, university, session, security,...",austin data science tutor jobs has students in...
1244,3.0,0.7073,"tutor, student, university, session, security,...",west university place data science tutor jobs ...
1981,3.0,0.7068,"tutor, student, university, session, security,...",san antonio data science tutor jobs has studen...
1241,3.0,0.7056,"tutor, student, university, session, security,...",houston data science tutor jobs has students i...
1480,3.0,0.7052,"tutor, student, university, session, security,...",phoenix data science tutor jobs has students i...
3684,3.0,0.7035,"tutor, student, university, session, security,...",columbus data science tutor jobs has students ...
1249,3.0,0.7029,"tutor, student, university, session, security,...",spring data science tutor jobs has students in...
3519,3.0,0.7023,"tutor, student, university, session, security,...",irving data science tutor jobs has students in...
3438,3.0,0.7007,"tutor, student, university, session, security,...",jacksonville data science tutor jobs has stude...
1510,3.0,0.7005,"tutor, student, university, session, security,...",tempe data science tutor jobs has students in ...


<font color = "blue">
    The most dominant topic is Tutor in Data Science for students in university. All the top 10 job listings belong to Topic 3.
    <br/><br/>
    I think using the LSA model + TF-IDF produce improvement compared to using LSA model + Bag of Words. But it seems the same online platform posted the same job description but for different regions and even university in the USA. I think it will be hard for NLP to detect anamoly situation or drop this kind of situation from being included as a result. I could have manually remove all these similar job description manually or increase the dataset entries (the current dataset only has 3690 entries of job descriptions) to prevent this situation or use better NLP or topic modelling algorithm instead of LSA. This online platform is not the only ones spamming the jobs portal. Other recruiters may also use similar method.
<br/><br/>
    Even though the same online platform is advertising, it also means that there is big opportunity for freelance jobs as a Data Science tutor especially for students in the university who is looking for help in to study for their Data Science modules.
</font>
<br/><br/>

## LDA Model

### Result using LDA model + Bag of words

In [17]:
jobs_listings_bow_ldamodel = gensim.models.LdaMulticore(jobs_listings_bow_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [18]:
jobs_listings_bow_ldamodel.print_topics(num_words=7)

[(0,
  '0.008*"development" + 0.006*"analytics" + 0.006*"product" + 0.005*"technical" + 0.005*"research" + 0.005*"solution" + 0.005*"tool"'),
 (1,
  '0.006*"product" + 0.006*"development" + 0.005*"solution" + 0.005*"system" + 0.005*"application" + 0.004*"database" + 0.004*"management"'),
 (2,
  '0.006*"development" + 0.006*"product" + 0.005*"solution" + 0.004*"research" + 0.004*"technical" + 0.004*"support" + 0.004*"service"'),
 (3,
  '0.006*"model" + 0.005*"product" + 0.005*"management" + 0.005*"solution" + 0.004*"development" + 0.004*"machine" + 0.004*"service"'),
 (4,
  '0.006*"analytics" + 0.005*"product" + 0.005*"service" + 0.005*"tool" + 0.005*"development" + 0.004*"management" + 0.004*"support"'),
 (5,
  '0.006*"development" + 0.006*"analytics" + 0.005*"management" + 0.004*"solution" + 0.004*"support" + 0.004*"technical" + 0.004*"research"'),
 (6,
  '0.005*"development" + 0.004*"engineering" + 0.004*"service" + 0.004*"research" + 0.004*"related" + 0.004*"analytics" + 0.004*"solu

<font color = "blue">
Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>

Topic 0: Development, analytics, technical and research skills to build product solution<br/>
Topic 1: Product and development solution for existing system and application, including database management<br/>
Topic 2: Research and development and provide technical support<br/>
Topic 3: Provide ML model as solution to management <br/>
Topic 4: Can use a variety of tools for product/service development to support management<br/>
Topic 5: Technical skills in development to support management<br/>
Topic 6: Require engineering skill for research and development in service solution<br/>
Topic 7: Research different ML models<br/>
Topic 8: Technical analysis for ML models<br/>
Topic 9: Research related ML model<br/>
</font>

In [19]:
pyLDAvis.gensim_models.prepare(jobs_listings_bow_ldamodel, jobs_listings_bow_corpus, jobs_listings_dictionary)

In [20]:
get_dominant_topics(jobs_listings_bow_ldamodel, jobs_listings_bow_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
2949,7.0,0.999,"technical, model, development, machine, resear...",energy industry transformationthe trillion dol...
669,6.0,0.9984,"development, engineering, service, research, r...",the palace acquire program offers you a perman...
879,0.0,0.9983,"development, analytics, product, technical, re...",job descriptionabout the jobthe decision scien...
3621,2.0,0.9983,"development, product, solution, research, tech...",help us transform patients livesat we put our...
2017,0.0,0.9983,"development, analytics, product, technical, re...",is a relentlessly clientfocused group who bui...
848,0.0,0.9982,"development, analytics, product, technical, re...",job descriptionabout the jobthe decision scien...
3288,5.0,0.9982,"development, analytics, management, solution, ...",we offer a competitive compensation and benefi...
2615,5.0,0.9981,"development, analytics, management, solution, ...",levelrowlevelmanagementjob locationdallas tx ...
971,4.0,0.9981,"analytics, product, service, tool, development...",job descriptionthe medical laboratory scientis...
181,0.0,0.998,"development, analytics, product, technical, re...",small teams big dataat we look for individual...


<font color = "blue">
    The most dominant topic is Topic 0: Development, analytics, technical and research skills to build product solution. All the top 10 job listings belong to Topic 0.
    <br/><br/>
    Similar to LSA model + Bag of Words, I think using the LDA model + Bag of Words does not produce good result. Most of the topics look very similar to each other even after many rounds of training. 
</font>
<br/><br/>

### Result using LDA model + TF-IDF

In [21]:
jobs_listings_tfidf_ldamodel = gensim.models.LdaMulticore(jobs_listings_tfidf_corpus, num_topics=10, id2word=jobs_listings_dictionary)

In [22]:
jobs_listings_tfidf_ldamodel.print_topics(num_words=7)

[(0,
  '0.001*"machine" + 0.001*"analytics" + 0.001*"client" + 0.001*"solution" + 0.001*"product" + 0.001*"model" + 0.001*"customer"'),
 (1,
  '0.001*"laboratory" + 0.001*"clinical" + 0.001*"analytics" + 0.001*"machine" + 0.001*"development" + 0.001*"product" + 0.001*"solution"'),
 (2,
  '0.001*"research" + 0.001*"model" + 0.001*"machine" + 0.001*"product" + 0.001*"analytics" + 0.001*"client" + 0.001*"statistical"'),
 (3,
  '0.001*"machine" + 0.001*"tutor" + 0.001*"client" + 0.001*"research" + 0.001*"software" + 0.001*"product" + 0.001*"statistical"'),
 (4,
  '0.001*"model" + 0.001*"product" + 0.001*"analytics" + 0.001*"machine" + 0.001*"software" + 0.001*"statistical" + 0.001*"client"'),
 (5,
  '0.001*"analytics" + 0.001*"research" + 0.001*"model" + 0.001*"product" + 0.001*"customer" + 0.001*"machine" + 0.001*"pipeline"'),
 (6,
  '0.001*"research" + 0.001*"product" + 0.001*"analytics" + 0.001*"machine" + 0.001*"model" + 0.001*"clinical" + 0.001*"modeling"'),
 (7,
  '0.001*"machine" + 

<font color = "blue">
Base on the topic keywords extracted above plus manual checks conducted on the job listings description, below are the top ten topics being deduced: <br/><br/>

Topic 0: Provide analytics and ML model for customers <br/>
Topic 1: Develop product solution using analytics and ML for clinical laboratory <br/>
Topic 2: Research statistical analytics <br/>
Topic 3: Statistical method and software engineering skills<br/>
Topic 4: Develop statistical and machine learning solutions<br/>
Topic 5: ETL to build ML pipeline <br/>
Topic 6: ML model for clinical products<br/>
Topic 7: Database management, statistical model to develop solution<br/>
Topic 8: Product development for client<br/>
Topic 9: Produce insight for products using ML model <br/><br/>
While the topics generated with LDA model + TF-IDF is an improvement compared when generated using Bag of Words, some of the topics are a bit similar.
</font>

In [23]:
pyLDAvis.gensim_models.prepare(jobs_listings_tfidf_ldamodel, jobs_listings_tfidf_corpus, jobs_listings_dictionary)

In [24]:
get_dominant_topics(jobs_listings_tfidf_ldamodel, jobs_listings_tfidf_corpus)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
3359,4.0,0.9506,"model, product, analytics, machine, software, ...",we are hiringdata engineer fulltime day aust...
3261,4.0,0.9503,"model, product, analytics, machine, software, ...",why join technologies leverages technology to...
3325,4.0,0.9496,"model, product, analytics, machine, software, ...",we are hiringdata engineer fulltime day aust...
1141,5.0,0.9491,"analytics, research, model, product, customer,...",work shift daywork week variesjob summarythe m...
1234,5.0,0.9485,"analytics, research, model, product, customer,...",work shift daywork week m f w occasional week...
314,0.0,0.9482,"machine, analytics, client, solution, product,...",explore the possibilities across our global ho...
97,0.0,0.9479,"machine, analytics, client, solution, product,...",search by keywordsearch by locationclearsend m...
1233,5.0,0.9478,"analytics, research, model, product, customer,...",work shift daywork week m f w occasional week...
1571,2.0,0.947,"research, model, machine, product, analytics, ...",site name usa pennsylvania upper providence ...
1208,2.0,0.9468,"research, model, machine, product, analytics, ...",about usharris health is a nationally recogniz...


<font color = "blue">
    There are two dominant topics: <br/><br/>
    Topic 4: Develop statistical and machine learning solutions <br/>
    Topic 5: ETL to build ML pipeline <br/><br/>
    Most of data scientist kind of roles that include data engineer and data analyst will involve either Topic 4 and Topic 5 related task. So this LDA model + TF-IDF encapsulates the main responsibility of data science related role.
</font>