In [1]:
# OPTIONAL: load IPython's "autoreload" extension so that edited modules can be
# re-imported without restarting the kernel.
%load_ext autoreload

# OPTIONAL: mode 2 always reloads modules, so that changes made to code in src
# are picked up the next time a cell runs.
%autoreload 2

# Load Libraries

In [159]:
from sklearn.decomposition import PCA 
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt 

from src.features import build_features

import logging 
from pathlib import Path

import scipy
import pickle
import datetime
import numpy as np

# Configure the root logger so only ERROR-level (and more severe) records are shown.
logging.basicConfig(level=logging.ERROR)

import warnings
# Suppress DeprecationWarning messages raised from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import Sparse2Corpus
from gensim.corpora import Dictionary
from gensim import corpora
import gensim
from gensim.utils import simple_preprocess
from gensim import models

import pandas as pd

import re

from nltk.corpus import stopwords
from src.data.make_dataset import lematize

from sklearn.decomposition import TruncatedSVD

import pyLDAvis
import pyLDAvis.gensim 

from src.data.DBConnection import DBConnection
# Project database handle; later cells run SQL through its `cur` attribute.
db = DBConnection()

In [4]:
# Set the root logger's threshold to ERROR (same level as the basicConfig call above).
logging.getLogger().setLevel(logging.ERROR)

  and should_run_async(code)


# Build Features

In [5]:
# Read the `details` column of every row in the `positions` table.
sql = "SELECT details FROM positions;"
query = db.cur.execute(sql)
# One entry per table row; process_documents() reads the text as row[0].
documents = query.fetchall()

  and should_run_async(code)


In [6]:
def process_documents(documents):
    """Clean raw job-description rows and drop French-language documents.

    Parameters
    ----------
    documents : iterable of sequence
        Rows whose first element is the description text (for example the
        tuples returned by ``fetchall()`` for a one-column query). Rows whose
        first element is ``None`` are skipped.

    Returns
    -------
    list of str
        One cleaned string per retained document, in input order. Newlines are
        replaced by spaces, and URLs, e-mail addresses, phone numbers, the
        ``\\uf0b7`` bullet glyph and ``@handles`` are removed.
    """
    # Removal patterns, applied in this order to every document.
    removal_patterns = (
        r"((https{0,1}\:/\/\w{3}\S+)|(w{3}\S+))",                  # http(s):// links and www... hosts
        r"(\S+@\S+)",                                              # e-mail addresses
        r"([\+\s01-]*\(*\d{3}\)*[-\s]\d{3}[-\s]\d{4})",            # phone numbers
        r"(\uf0b7)",                                               # private-use bullet glyph
        r"(\@\S+)",                                                # handles
    )
    # French markers must match as whole words: a plain substring test on
    # "une" also hits English text such as "June" or "unemployment", and
    # "connaissance" is a substring of "reconnaissance". Matching stays
    # case-sensitive.
    french_marker = re.compile(r"\b(?:une|connaissances?)\b")

    cleaned = []
    for row in documents:
        text = row[0]
        if text is None:
            # NULL description in the database: nothing to clean or model.
            continue
        text = text.replace("\n", " ")
        for pattern in removal_patterns:
            text = re.sub(pattern, "", text)
        if french_marker.search(text):
            continue
        cleaned.append(text)
    return cleaned

def docs_to_words(documents):
    """Lazily tokenise documents, yielding one list of tokens per document.

    Each document is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped; punctuation is discarded by the tokeniser itself).
    """
    for raw_doc in documents:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens
        
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each one is converted with ``str()`` and
        re-tokenised by ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One token list per input document, without any token found in the
        module-level ``stop_words``.
    """
    # ``stop_words`` is a list; build a set once so each membership test is
    # O(1) instead of a linear scan per token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for every document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for every
    document in ``texts``, in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

  and should_run_async(code)


In [270]:
# Start from NLTK's English stop-word list and append words that are
# extremely common among these documents.
stop_words = [*stopwords.words('english'), "yelp", "agoda"]

# Clean every document: regex removal of e-mails, URLs, phone numbers and
# handles, plus dropping the French-language documents.
clean_docs = process_documents(documents)

# Tokenise each cleaned document with simple_preprocess (strips punctuation).
data_words = [tokens for tokens in docs_to_words(clean_docs)]

In [None]:
%%time
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=35) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[processed_corpus], threshold=20)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Remove Stop Words
data_words_nostops = remove_stopwords(processed_corpus)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form trigrams:
data_words_trigrams = make_trigrams(data_words_bigrams)

# Lematize docs:
data_lemmatized = [lematize(words).split(" ") for words in data_words_trigrams]

# Create Dictionary
id2word = Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Bag of words:
print('Total Vocabulary Size Pre Filter:', len(id2word))
id2word.filter_extremes(no_below=5, no_above=0.7)
print('Total Vocabulary Size Post Filter:', len(id2word))

corpus = [id2word.doc2bow(text) for text in texts]

# TF-IDF matrix:
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

In [None]:
%%time
# determine optimal number of topics using coherence score:
coherence_vals = []
model_list =[]
start = 2
stop = 10
step = 1

for num_topics in range(start, stop, step):
    # make LDa model and calc coherence:
    model = LdaModel(corpus=tfidf_corpus, id2word=id2word, num_topics=num_topics, passes=5, iterations=100)
    model_list.append(model)
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v', processes=4)
    coherence_vals.append(coherence_model.get_coherence())
    
x = range(start, stop, step)
plt.plot(x, coherence_vals)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.ylim([0, 1]);

In [None]:
# Select the model with the highest coherence score (the first one on ties).
best_score = max(coherence_vals)
best_model = model_list[coherence_vals.index(best_score)]
print(f"Best model coherence score is {best_score:0.3f}")

In [None]:
# Enable inline pyLDAvis rendering in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation for the selected model using the TF-IDF corpus and
# dictionary it was trained with. sort_topics=False presumably keeps the
# model's own topic order rather than letting pyLDAvis re-sort — verify.
vis = pyLDAvis.gensim.prepare(best_model, tfidf_corpus, id2word, sort_topics=False)
# Last expression of the cell, so the visualisation is displayed.
vis

In [None]:
# Display the top 10 weighted words of every topic, one column per topic.
topics = best_model.print_topics(num_words=10)
topics_df = pd.DataFrame(topics)
topics_df = (
    topics_df[1]
    .str.replace(r"\s\+", "", regex=True)   # drop the " +" separators
    .str.replace('"', "", regex=False)      # drop the quotes; literal match, independent of the pandas regex default
    .str.split(" ", expand=True)
    .T
)
topics_df.columns = [f"topic_{t+1}" for t in topics_df.columns]

# Dominant topic of every document. get_topics() is the topic-by-term matrix,
# so an argmax over it would assign topics to vocabulary terms, not documents.
doc_topics = []
for bow in tfidf_corpus:
    topic_probs = best_model.get_document_topics(bow, minimum_probability=0.0)
    if topic_probs:
        doc_topics.append(max(topic_probs, key=lambda pair: pair[1])[0])
doc_topics = np.asarray(doc_topics, dtype=int)

# minlength keeps one bar per topic even when the last topics have no documents.
topic_counts = np.bincount(doc_topics, minlength=best_model.num_topics)
plt.bar(height=topic_counts, x=list(range(1, best_model.num_topics+1)))
plt.xlabel("Topic")
plt.ylabel("Number of documents");

topics_df

# Lessons Learned

- Using TF-IDF versus just TF increased coherence score and seemed to increase interpretability in topics
- Using bi- and tri-grams improved the coherence score, as did tuning their threshold parameters
- Increasing LDA passes above 4 increases coherence and interpretability, but passes = 10 seems to overfit the data and the topics end up being specific to companies (e.g., topic words are bell, microsoft, rogers, etc.).
- Coherence score cannot be relied on completely, sometimes the topics are not great, or the best model may not include a topic I know is there (i.e., Civil Engineering, which I purposely put in, around 11% of total dataset).

# Observations

The model's coherence score and subjective results improved by using TF-IDF, decreasing the bi- and tri-gram model parameters, and increasing the passes and iterations of the LDA model. The LDA model seems to be able to tease out the major topics, which seem plausible:<br>
1. Accountant
2. Food/Restaurant Work
3. Management Consulting (Deloitte?)
4. Health Related Research (this may be a slightly unstable category, probably related to Data Scientist)
5. Geotechnical/Civil Engineer
6. Data Scientist
7. Civil Engineer
<br>
It appears that this model was reasonably successful: there are some clear topics, but at the same time some of the topics appear to overlap.

<br><br>
Further Research<br>
Look into whether the job title matches the topics produced using the LDA model.