In [1]:
import pandas as pd
import spacy
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk
import re
import gensim
from gensim import corpora
import pickle
from collections import OrderedDict
import pyLDAvis.gensim
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import os

import helper_functions as hf
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import time

In [2]:
# Shared NLP resources, created once because loading them is slow:
#   nlp        - spaCy pipeline loaded from the small English model
#   lemmatizer - NLTK WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')

In [3]:
def _longest_first_pattern(terms):
    """Compile a regex that matches any of ``terms`` literally.

    Python tries alternatives left to right and accepts the first one that
    matches, so a short term listed before a longer one that starts with it
    (e.g. 'photo' before 'photos') would shadow the longer term.  Ordering the
    alternatives longest-first guarantees the longest literal wins.

    An empty ``terms`` yields a pattern that never matches, instead of the
    empty pattern (which would match at every position).
    """
    ordered_terms = sorted(set(terms), key=lambda term: (-len(term), term))
    if not ordered_terms:
        return re.compile(r'(?!)')
    return re.compile('|'.join(map(re.escape, ordered_terms)))


# Build paths with os.path.join so the notebook is not tied to the Windows
# path separator.
data_dir = os.path.join('..', 'Data')

### Load updated stop words list
stop_words_df = pd.read_csv(os.path.join(data_dir, 'stop_words.csv'))
stop_words = set(stop_words_df['stop_words'])

### Load station names list
station_names = pd.read_csv(os.path.join(data_dir, 'station_names.csv'))
# dropna(): a missing station name would reach re.escape as a float and raise.
station = _longest_first_pattern(station_names['Station'].dropna().str.lower())

photo_names = ['svg','png','jpeg','jpg', 'photo','pictures','picture','photos']
photo = _longest_first_pattern(photo_names)

### Load mallet package
mallet_home = os.path.join('..', 'Models', 'mallet-2.0.8') # update this path
os.environ['MALLET_HOME'] = mallet_home
mallet_path = os.path.join(mallet_home, 'bin', 'mallet') # update this path

In [4]:
# Raw Reddit export; the path is assembled with os.path.join so it resolves on
# any operating system, not only with Windows separators.
reddit_df = pd.read_csv(os.path.join('..', 'Data', 'reddit_data_raw.csv'))

In [5]:
# Down-sample the data to keep tokenisation and the hyper-parameter search
# tractable.  A fixed random_state makes the subset (and therefore every result
# derived from it) reproducible across kernel restarts.
# NOTE: this cell overwrites reddit_df, so re-running it without re-running the
# load cell samples the already-sampled frame again.
working_fraction=0.4
random_seed=42
print("reddit data :", reddit_df.shape[0])
reddit_df=reddit_df.sample(frac=working_fraction, random_state=random_seed)
print("reddit data :", reddit_df.shape[0])

reddit data : 35085
reddit data : 14034


In [6]:
# Degree of parallelism handed to the tokenizer helper (also reported in the
# timing message of the next cell).
jobs = 8

# Collect every setting for the project's tokenizer helper in one place, then
# pass them as keyword arguments.
tokenizer_settings = dict(
    df=reddit_df,
    target_column='body',
    stop_words=stop_words,
    station=station,
    photo=photo,
    nlp=nlp,
    jobs=jobs,
)
mp_instance = hf.mp_tokenize(**tokenizer_settings)

In [7]:
# Time the tokenisation run.  perf_counter() is a monotonic, high-resolution
# clock meant for measuring durations; time.time() can jump if the system clock
# is adjusted while the cell is running.
start_time = time.perf_counter()
processed_list=mp_instance.excecute()  # method name is spelled this way in helper_functions
elapsed_sec = time.perf_counter() - start_time
print("{0} tokenized in {1} sec with {2} threads".format(reddit_df.shape[0], elapsed_sec, jobs))

14034 tokenized in 126.67205238342285 sec with 8 threads


In [8]:
# Keep only the element at position 1 of every processed record
# (presumably the token list for that document - confirm against helper_functions).
text_data = []
for record in processed_list:
    text_data.append(record[1])

In [9]:
# Map every token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist the corpus; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [10]:
# Hyper-parameter search over the topic model.
# model_name is now actually passed to the optimiser (it used to be shadowed by
# a hard-coded "ldamallet" literal), so changing it here takes effect.
model_name="ldamallet" #"ldamodel"
max_evals=100  # number of search evaluations; the recorded run took ~2h20 for 100
gensim_optimizer=hf.gensim_optimizer(model_name=model_name, 
                                     model_path=mallet_path,
                                     dictionary=dictionary, 
                                     corpus=corpus, 
                                     texts=text_data, 
                                     max_evals=max_evals)
# The method name is spelled "exceute" in helper_functions.
trials, best_model, best_score= gensim_optimizer.exceute()

100%|██████████| 100/100 [2:21:51<00:00, 79.72s/it, best loss: -0.5211239729933473]  


In [11]:
# One loss per evaluated trial, in evaluation order.
losses = [trial['result']['loss'] for trial in trials.trials]

# trials.vals is expected to hold one list of sampled values per
# hyper-parameter, aligned with trials.trials - TODO confirm.
params = pd.DataFrame(trials.vals)
params['loss'] = losses

# Lowest loss first.  Reassigning instead of sorting in place keeps the step
# explicit and chain-friendly.
params = params.sort_values('loss')
params.head(10)

Unnamed: 0,alpha,num_topics,topic_threshold,loss
21,76.0,12.0,0.356735,-0.521124
70,80.0,11.0,0.619201,-0.515409
12,16.0,9.0,0.727374,-0.511046
34,55.0,17.0,0.457233,-0.508285
99,123.0,10.0,0.830517,-0.503549
73,117.0,5.0,0.731032,-0.500623
57,23.0,13.0,0.768,-0.500333
11,28.0,5.0,0.282597,-0.498431
61,58.0,5.0,0.673723,-0.496219
66,74.0,17.0,0.376585,-0.494042


In [12]:
# Export the sorted search results; index=False leaves the row index
# (the trial position) out of the file.
params.to_csv(path_or_buf="hyper_parameters.csv", index=False)

In [13]:
# Show the five highest-weighted words of each topic in the best model,
# one topic per output line.
topics = best_model.print_topics(num_words=5)
for topic_summary in topics:
    print(topic_summary)

(0, '0.077*"people" + 0.028*"happen" + 0.023*"start" + 0.020*"feel" + 0.016*"fuck"')
(1, '0.068*"work" + 0.024*"job" + 0.022*"give" + 0.018*"call" + 0.016*"life"')
(2, '0.065*"car" + 0.054*"stop" + 0.045*"drive" + 0.030*"streetcar" + 0.027*"driver"')
(3, '0.055*"presto" + 0.043*"fare" + 0.037*"card" + 0.033*"system" + 0.023*"pass"')
(4, '0.082*"people" + 0.078*"pay" + 0.046*"cost" + 0.043*"money" + 0.028*"increase"')
(5, '0.036*"government" + 0.027*"ontario" + 0.023*"province" + 0.022*"cut" + 0.020*"fund"')
(6, '0.053*"live" + 0.042*"build" + 0.038*"area" + 0.033*"place" + 0.030*"downtown"')
(7, '0.064*"urllink" + 0.044*"plan" + 0.034*"ford" + 0.014*"vote" + 0.012*"free"')
(8, '0.208*"transit" + 0.114*"city" + 0.098*"toronto" + 0.050*"public" + 0.033*"system"')
(9, '0.182*"ttc" + 0.086*"bus" + 0.041*"hour" + 0.035*"service" + 0.030*"run"')
(10, '0.032*"point" + 0.027*"find" + 0.017*"issue" + 0.014*"case" + 0.013*"number"')
(11, '0.253*"ttcstation" + 0.134*"subway" + 0.061*"train" + 0.0

In [None]:
# Load a previously saved LDA model and render the interactive pyLDAvis view.
# The path is built with os.path.join: the old literal '..\Models\model5.gensim'
# was a non-raw string containing the invalid escape sequence '\M' and only
# worked with Windows separators.
# NOTE(review): this visualises the saved model5, not `best_model` from the
# search above, while `corpus` and `dictionary` come from this run's sample -
# confirm they match the vocabulary model5 was trained on.
lda = gensim.models.ldamodel.LdaModel.load(os.path.join('..', 'Models', 'model5.gensim'))
# sort_topics=False keeps the model's own topic order in the display.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
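# Optional: persist the model and the visualisation (uncomment to run).
# Raw strings keep the backslashes literal; without the r prefix '\5' is read as an octal escape.
# NOTE: `ldamodel` is not defined in the cells above - point this at the model you want to save.
#ldamodel.save(r'..\Models\model5.gensim')
#pyLDAvis.save_html(lda_display, r'..\Visualisations\5 topics.html')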