In [1]:
import pandas as pd
import spacy
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk
import re
import gensim
from gensim import corpora
import pickle
from collections import OrderedDict
import pyLDAvis.gensim
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

import helper_functions as hf
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import time

# spaCy pipeline and WordNet lemmatizer instances.
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()

### Load updated stop words list (the 'stop_words' column, collapsed into a set)
stop_words = set(pd.read_csv(r'..\Data\stop_words.csv')['stop_words'])

### Load station names list and compile one alternation regex over the lower-cased names
station_names = pd.read_csv(r'..\Data\station_names.csv')
station = re.compile(
    '|'.join(re.escape(name) for name in station_names['Station'].str.lower())
)

# Image-related tokens, compiled into a single alternation regex.
photo_names = ['svg','png','jpeg','jpg', 'photo','pictures','picture','photos']
photo = re.compile('|'.join(re.escape(name) for name in photo_names))

In [2]:
# Read the raw Reddit data CSV and show which columns it provides.
reddit_df = pd.read_csv(filepath_or_buffer=r'..\Data\reddit_data_raw.csv')
reddit_df.columns

Index(['title', 'score', 'id', 'url', 'comms_num', 'created', 'body'], dtype='object')

In [3]:
# Number of rows loaded from the raw CSV.
print("reddit data :", len(reddit_df))

reddit data : 35085


In [4]:
# Fraction of the rows to keep for the rest of the notebook.
working_fraction=0.4
# A fixed seed makes the random subset (and everything derived from it)
# reproducible across kernel restarts; without it every run sees different rows.
# NOTE: this rebinds reddit_df, so re-running this cell alone shrinks the data
# again -- re-run the loading cell first.
reddit_df=reddit_df.sample(frac=working_fraction, random_state=400)

In [5]:
# Number of rows left after sub-sampling.
print("reddit data :", len(reddit_df))

reddit data : 14034


In [6]:
# Value forwarded to the helper as its `jobs` argument and reported in the
# timing message of the tokenisation cell.
jobs = 8

# Configure the tokeniser helper over the 'body' column; the stop-word set,
# the two compiled regexes and the spaCy pipeline are handed over as-is.
mp_instance = hf.mp_tokenize(
    df=reddit_df,
    target_column='body',
    stop_words=stop_words,
    station=station,
    photo=photo,
    nlp=nlp,
    jobs=jobs,
)

In [None]:
# Time the tokenisation run with wall-clock timestamps.
start_time = time.time()
processed_list = mp_instance.excecute()  # method name spelled as the helper is called here
end_time = time.time() - start_time  # elapsed seconds, despite the name

print(
    "{0} tokenized in {1} sec with {2} threads".format(
        len(reddit_df), end_time, jobs
    )
)

In [None]:
# Keep only the second element of every processed item
# (presumably the token list per document -- confirm against helper_functions).
text_data = [entry[1] for entry in processed_list]

In [None]:
# Map tokens to integer ids, then encode every document as a bag of words.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist the corpus; the context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [None]:
def compute_coherence_values(parameters,dictionary, corpus, texts):
    """Train one LDA model and return its 'c_v' coherence score.

    Parameters
    ----------
    parameters : dict
        Keyword arguments forwarded to ``LdaModel``. Must contain
        ``'num_topics'`` and ``'passes'``; both are cast to ``int`` before
        use (the search space in this notebook samples them with
        ``hp.quniform``). The caller's dict is left untouched.
    dictionary
        Passed to the model as ``id2word`` and to ``CoherenceModel``.
    corpus
        Passed to ``LdaModel`` as the training corpus.
    texts
        Passed to ``CoherenceModel`` as ``texts``.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` for the trained model.

    Side effects
    ------------
    Reads and updates the module-level ``best_score`` / ``best_model``: when
    the new score is strictly greater than ``best_score`` the trained model
    and its score replace them. Both globals must exist before the call.
    """
    global best_score
    global best_model

    # Work on a shallow copy so the integer casts do not leak back into the
    # dict owned by the caller.
    model_kwargs = dict(parameters)
    model_kwargs['num_topics'] = int(model_kwargs['num_topics'])
    model_kwargs['passes'] = int(model_kwargs['passes'])

    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, random_state=400, **model_kwargs)
    coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherencemodel.get_coherence()

    # Remember the best model seen so far across all calls.
    if coherence_score > best_score:
        best_model = model
        best_score = coherence_score

    return coherence_score

def bayesian_optimizer(parameters, dictionary, corpus, texts):
    """hyperopt objective: negated coherence of one LDA configuration.

    ``fmin`` minimises its objective, so the coherence score returned by
    ``compute_coherence_values`` is negated to turn "maximise coherence" into
    "minimise loss".

    Tracking of the best score/model is done entirely inside
    ``compute_coherence_values``; by the time it returns, ``best_score`` is
    already at least the new score, so no second comparison is needed here.

    Returns
    -------
    dict
        ``{'loss': -coherence, 'status': STATUS_OK}``.
    """
    coherence = compute_coherence_values(parameters, dictionary, corpus, texts)
    return {'loss': -coherence, 'status': STATUS_OK}

In [None]:
# Further search dimensions that are currently switched off:
# 'distributed': hp.choice('distributed', [True, False])
# 'chunksize': hp.quniform('chunksize', 10000, 5000, 100000)
#   NOTE(review): hp.quniform takes (label, low, high, q); the bounds above
#   look out of order -- confirm before enabling.
# 'gamma_threshold':hp.loguniform('gamma_threshold', -3, 2),
# 'minimum_phi_value':hp.loguniform('minimum_phi_value', -3, 2),

trials = Trials()
max_evals=100

# Running best, updated by compute_coherence_values during the search.
best_score=0
# None (rather than a number) marks "no model trained yet", so a failed or
# skipped search is not mistaken for a usable model object.
best_model=None

# Search space. quniform yields floats; num_topics and passes are cast to int
# inside compute_coherence_values.
parameters ={'num_topics':hp.quniform('num_topics', 1, 50, 1),
             'passes': hp.quniform('passes', 3, 30, 1),
             'decay':hp.uniform('decay', 0.5, 1),
             'alpha': hp.choice('alpha', ["asymmetric", "auto"])
            }

# Minimise the negated coherence returned by bayesian_optimizer.
best = fmin(lambda x: bayesian_optimizer(parameters=x,dictionary=dictionary, corpus=corpus, texts=text_data), 
            parameters, 
            algo=tpe.suggest, 
            max_evals=max_evals, 
            trials=trials)

In [None]:
# One loss per finished trial, in trial order.
losses = [trial['result']['loss'] for trial in trials.trials]

# Sampled parameter values alongside their loss, lowest loss first
# (the loss is the negated coherence, so the top rows are the best runs).
params = (
    pd.DataFrame(trials.vals)
    .assign(loss=losses)
    .sort_values('loss')
)
params.head(10)

In [None]:
# Persist the sorted trial table without the DataFrame index column.
params.to_csv(path_or_buf="hyper_parameters.csv", index=False)

In [None]:
# Show the five highest-weighted words of each topic of the best model found
# during the search, one topic per line.
topics = best_model.print_topics(num_words=5)
for topic in iter(topics):
    print(topic)

In [None]:
# Load a previously saved model from disk. Raw string: '\M' is not a valid
# escape sequence, so the plain literal triggers an invalid-escape warning on
# current Python versions; the path value itself is unchanged.
lda = gensim.models.ldamodel.LdaModel.load(r'..\Models\model5.gensim')

# Build and render the interactive pyLDAvis view, keeping gensim's topic order.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
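# Raw strings are required for these Windows paths: in a plain literal '\5'
# is an octal escape (not a backslash followed by '5') and '\M' / '\V' are
# invalid escape sequences.
#ldamodel.save(r'..\Models\model5.gensim')
#pyLDAvis.save_html(lda_display, r'..\Visualisations\5 topics.html')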