In [1]:
# from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#df = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/jobpostings_anonymized_test.csv")
df = pd.read_csv('/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/sentences/sentences_emscad.csv')

  and should_run_async(code)


In [4]:
df = df['sentence'].values.tolist()

# Remove Emails
df = [re.sub('\S*@\S*\s?', '', sent) for sent in df]

# Remove new line characters
df = [re.sub('\s+', ' ', sent) for sent in df]

# Remove distracting single quotes
df = [re.sub("\'", "", sent) for sent in df]

  and should_run_async(code)
  df = [re.sub('\S*@\S*\s?', '', sent) for sent in df]
  df = [re.sub('\s+', ' ', sent) for sent in df]


In [5]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

  and should_run_async(code)


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df))

print(data_words[:1])

  and should_run_async(code)


[['food', 'fast', 'growing', 'james', 'beard', 'award', 'winning', 'online', 'food', 'community', 'and', 'crowd', 'sourced', 'and', 'curated', 'recipe', 'hub', 'is', 'currently', 'interviewing', 'full', 'and', 'part', 'time', 'unpaid', 'interns', 'to', 'work', 'in', 'small', 'team', 'of', 'editors', 'executives', 'and', 'developers', 'in', 'its', 'new', 'york', 'city', 'headquarters']]


In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

  and should_run_async(code)


['food', 'fast', 'growing', 'james_beard_award_winning', 'online', 'food', 'community', 'and', 'crowd_sourced', 'and', 'curated_recipe_hub', 'is', 'currently', 'interviewing', 'full', 'and', 'part', 'time', 'unpaid', 'interns', 'to', 'work', 'in', 'small', 'team', 'of', 'editors', 'executives', 'and', 'developers', 'in', 'its', 'new', 'york_city', 'headquarters']


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

  and should_run_async(code)


In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

  and should_run_async(code)


[['food', 'fast', 'grow', 'award_winne', 'online', 'food', 'community', 'crowd_source', 'hub', 'currently', 'interview', 'full', 'part', 'time', 'unpaid', 'intern', 'work', 'small', 'team', 'editor', 'executive', 'developer', 'headquarters']]


In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

  and should_run_async(code)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]


In [11]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=10, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

print(lda_model.print_topics())
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0, '0.056*"project" + 0.053*"company" + 0.042*"look" + 0.037*"candidate" + 0.036*"system" + 0.034*"manage" + 0.030*"high" + 0.027*"requirement" + 0.020*"quality" + 0.018*"key"'), (1, '0.101*"ability" + 0.054*"software" + 0.045*"user" + 0.029*"issue" + 0.028*"perform" + 0.027*"platform" + 0.024*"problem" + 0.023*"prove" + 0.023*"task" + 0.022*"day"'), (2, '0.081*"require" + 0.069*"support" + 0.048*"position" + 0.046*"maintain" + 0.045*"base" + 0.033*"degree" + 0.033*"datum" + 0.027*"training" + 0.026*"prefer" + 0.024*"apply"'), (3, '0.063*"help" + 0.033*"individual" + 0.031*"employee" + 0.030*"office" + 0.029*"mobile" + 0.028*"analysis" + 0.023*"designer" + 0.023*"prepare" + 0.022*"startup" + 0.022*"hour"'), (4, '0.138*"team" + 0.048*"time" + 0.033*"grow" + 0.028*"report" + 0.027*"join" + 0.025*"full" + 0.022*"learn" + 0.018*"start" + 0.017*"database" + 0.017*"relate"'), (5, '0.116*"experience" + 0.101*"work" + 0.060*"customer" + 0.056*"service" + 0.033*"provide" + 0.033*"year" + 0.02

In [12]:
lda_model.show_topics(num_topics=10, num_words=10, log=False, formatted=True)

  and should_run_async(code)


[(0,
  '0.056*"project" + 0.053*"company" + 0.042*"look" + 0.037*"candidate" + 0.036*"system" + 0.034*"manage" + 0.030*"high" + 0.027*"requirement" + 0.020*"quality" + 0.018*"key"'),
 (1,
  '0.101*"ability" + 0.054*"software" + 0.045*"user" + 0.029*"issue" + 0.028*"perform" + 0.027*"platform" + 0.024*"problem" + 0.023*"prove" + 0.023*"task" + 0.022*"day"'),
 (2,
  '0.081*"require" + 0.069*"support" + 0.048*"position" + 0.046*"maintain" + 0.045*"base" + 0.033*"degree" + 0.033*"datum" + 0.027*"training" + 0.026*"prefer" + 0.024*"apply"'),
 (3,
  '0.063*"help" + 0.033*"individual" + 0.031*"employee" + 0.030*"office" + 0.029*"mobile" + 0.028*"analysis" + 0.023*"designer" + 0.023*"prepare" + 0.022*"startup" + 0.022*"hour"'),
 (4,
  '0.138*"team" + 0.048*"time" + 0.033*"grow" + 0.028*"report" + 0.027*"join" + 0.025*"full" + 0.022*"learn" + 0.018*"start" + 0.017*"database" + 0.017*"relate"'),
 (5,
  '0.116*"experience" + 0.101*"work" + 0.060*"customer" + 0.056*"service" + 0.033*"provide" + 0.

In [13]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda_model.print_topics())

[   (   0,
        '0.056*"project" + 0.053*"company" + 0.042*"look" + 0.037*"candidate" '
        '+ 0.036*"system" + 0.034*"manage" + 0.030*"high" + '
        '0.027*"requirement" + 0.020*"quality" + 0.018*"key"'),
    (   1,
        '0.101*"ability" + 0.054*"software" + 0.045*"user" + 0.029*"issue" + '
        '0.028*"perform" + 0.027*"platform" + 0.024*"problem" + 0.023*"prove" '
        '+ 0.023*"task" + 0.022*"day"'),
    (   2,
        '0.081*"require" + 0.069*"support" + 0.048*"position" + '
        '0.046*"maintain" + 0.045*"base" + 0.033*"degree" + 0.033*"datum" + '
        '0.027*"training" + 0.026*"prefer" + 0.024*"apply"'),
    (   3,
        '0.063*"help" + 0.033*"individual" + 0.031*"employee" + 0.030*"office" '
        '+ 0.029*"mobile" + 0.028*"analysis" + 0.023*"designer" + '
        '0.023*"prepare" + 0.022*"startup" + 0.022*"hour"'),
    (   4,
        '0.138*"team" + 0.048*"time" + 0.033*"grow" + 0.028*"report" + '
        '0.027*"join" + 0.025*"full" + 0.022*"lear

  and should_run_async(code)


In [14]:
# %%time
# kvalues = []
# perplexity = []
# coherence = []
# # Build LDA model
# kvalues.append(k)
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=k, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)

# # Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
# perplexity.append(lda_model.log_perplexity(corpus))

# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)
# coherence.append(coherence_lda)

# print(k, ' is finished.')

  and should_run_async(code)


In [15]:
#20 topics has the highest coherence score. Maybe try even more topics as the score only increased?

  and should_run_async(code)


In [16]:
# Print the Keyword in the 10 topics
#print(lda_model.print_topics())
#doc_lda = lda_model[corpus]

  and should_run_async(code)


In [17]:
# # Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)


In [18]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)


In [19]:
hm = np.array([[y for (x,y) in lda_model[corpus[i]]] for i in range(len(corpus))])


  and should_run_async(code)


ValueError: too many values to unpack (expected 2)

In [109]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors

# Get topic weights
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)



  and should_run_async(code)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 546 samples in 0.000s...
[t-SNE] Computed neighbors for 546 samples in 0.015s...
[t-SNE] Computed conditional probabilities for sample 546 / 546
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 49.158073
[t-SNE] KL divergence after 1000 iterations: -0.375371


In [135]:
test = pd.DataFrame(zip(tsne_lda[:,0], tsne_lda[:,1],mycolors[topic_num], topic_num), columns=['x', 'y', 'colors', 'topic'])
test.topic[test.topic==1] = 101
test.topic[test.topic==2] = 102
test.topic[test.topic==3] = 103
test.topic[test.topic==4] = 104
test.topic[test.topic==7] = 107
test.topic[test.topic==0] = 1
test.topic[test.topic==5] = 2
test.topic[test.topic==6] = 3
test.topic[test.topic==8] = 4
test.topic[test.topic==9] = 5
source = ColumnDataSource(test)

# Plot the Topic Clusters using Bokeh
output_notebook()
n_topics = 5
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=900, plot_height=700)


#plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num], )
plot.scatter('x', 'y', color='colors', legend_group='topic', source=source)

show(plot)

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.topic[test.topic==1] = 101
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.topic[test.topic==2] = 102
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.topic[test.topic==3] = 103
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.topic[test.topic==4] = 104
A value is 

In [None]:
0 = 1
5 = 2
6 = 3 
8 = 4
9 = 5

In [134]:
test.topic[test.topic==5] = 101
test

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.topic[test.topic==5] = 101


Unnamed: 0,x,y,colors,topic
0,32.517124,2.876311,#8c564b,101
1,-41.846146,26.065390,#17becf,9
2,-42.685997,24.100712,#17becf,9
3,20.179747,67.825050,#e377c2,6
4,-59.179291,100.325211,#bcbd22,8
...,...,...,...,...
541,-1.473694,27.603361,#17becf,9
542,-50.876007,-15.140911,#17becf,9
543,-0.536225,23.093500,#17becf,9
544,-50.053154,12.895881,#17becf,9


In [115]:
test[test.topic==1] = 101
test[test.topic==2] = 102
test[test.topic==3] = 103
test[test.topic==4] = 104
test[test.topic==7] = 107
test[test.topic==0] = 1
test[test.topic==5] = 2
test[test.topic==6] = 3
test[test.topic==8] = 4
test[test.topic==9] = 5

  and should_run_async(code)


In [118]:
from bokeh.models import ColumnDataSource
data = {'x_values': test['x'],
        'y_values': test['y'],
       'colors' : test['colors'],
       'legend' : test['topic']}

source = ColumnDataSource(data=data)

  and should_run_async(code)


In [61]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=7, angle=.99, init='pca')
# 13-D -> 2-D
tsne_lda = tsne_model.fit_transform(arr) # doc_topic is document-topic matrix from LDA or GuidedLDA 

  and should_run_async(code)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 546 samples in 0.001s...
[t-SNE] Computed neighbors for 546 samples in 0.017s...
[t-SNE] Computed conditional probabilities for sample 546 / 546
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.024807
[t-SNE] KL divergence after 1000 iterations: -0.258912
