In [1]:
import pandas as pd
import spacy
from spacy.lang.en import English
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk
import re
import gensim
from gensim import corpora
import pickle
from collections import OrderedDict
import pyLDAvis.gensim
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()


def _build_mask_pattern(terms):
    """
    Compile a regex that matches any of `terms` as whole words.

    Terms are normalised the same way tokenize() normalises its input
    (lower-cased, every run of non-letters collapsed to one space), because
    the pattern is applied to text that only contains a-z and single spaces.

    Parameters:
    ----------
    terms : iterable of strings (non-string entries such as NaN are skipped)

    Return:
    ----------
    compiled regular expression; matches nothing when no usable term is left
    """
    cleaned = set()
    for term in terms:
        if not isinstance(term, str):
            continue  # e.g. NaN from an empty CSV cell
        term = re.sub('[^a-z]+', ' ', term.lower()).strip()
        if term:  # an empty alternative would match at every position
            cleaned.add(term)
    if not cleaned:
        return re.compile(r'(?!)')  # negative look-ahead on nothing: never matches
    # Regex alternation takes the first alternative that matches, not the
    # longest, so longer terms must be listed before their own prefixes.
    ordered = sorted(cleaned, key=lambda t: (-len(t), t))
    # \b keeps a short term from matching inside an unrelated longer word.
    return re.compile(r'\b(?:' + '|'.join(map(re.escape, ordered)) + r')\b')


### Load updated stop words list
# Forward slashes are accepted on Windows as well as POSIX systems.
stop_words = pd.read_csv('../Data/stop_words.csv')
stop_words = set(stop_words['stop_words'])

### Load station names list
station_names = pd.read_csv('../Data/station_names.csv')
station = _build_mask_pattern(station_names['Station'])

photo_names = ['svg','png','jpeg','jpg', 'photo','pictures','picture','photos']
photo = _build_mask_pattern(photo_names)

### ================================ Function ================================

def flatten(x):
    """
    Function to flatten out nested list

    Any element that is itself iterable is expanded recursively. Text-like
    leaves (str and bytes) are kept whole instead of being split into
    characters / integer byte values.

    Parameters:
    ----------
    x : nested list (or any iterable whose items may themselves be iterables)

    Return:
    ----------
    [list elements removed from nested list], in their original left-to-right order
    """
    result = []
    for el in x:
        # str and bytes are iterable too, but they are leaves here
        if hasattr(el, "__iter__") and not isinstance(el, (str, bytes)):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result


def get_keywords(text):
    """
    Function to extract chunks of key nouns and verbs

    Collects the lemma of every noun chunk whose root is a subject, direct
    object or prepositional object, plus the lemma of every verb, then breaks
    those phrases into single words and drops stop words and '-PRON-'
    placeholders.

    Parameters:
    ----------
    text : parsed document (tokenize() passes the result of nlp(...)), NOT a
           raw string -- it must expose .noun_chunks and yield tokens with
           .pos_ and .lemma_ when iterated

    Return:
    ----------
    [list of unigram keywords ]
    """
    main_phrases = [
        chunk.lemma_
        for chunk in text.noun_chunks
        if chunk.root.dep_ in ('nsubj', 'dobj', 'pobj')
    ]
    main_phrases.extend(word.lemma_ for word in text if word.pos_ == 'VERB')
    # split() without an argument never yields empty strings, so repeated
    # spaces inside a lemma phrase cannot produce a '' keyword.
    return [
        w
        for phrase in main_phrases
        for w in phrase.split()
        if w not in stop_words and '-PRON-' not in w
    ]

def tokenize(text):
    """
    Function to pre-process string

    Steps: mask URLs as 'urllink', replace every run of non-letters with a
    space and lower-case, mask station names as 'ttcstation' and image words
    as 'photo', then run the spaCy pipeline to get lemmas and keywords.

    Parameters:
    ----------
    text : comment string (None / NaN, as pandas gives for an empty cell,
           is treated as an empty string)
    Return:
    ----------
    [processed string, [list of keywords]]
    """
    # Missing values would otherwise raise TypeError in the string operations below
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    ### 1. Masking common strings
    # Applied unconditionally: gating on 'https://' left http:// links unmasked
    # although the pattern covers both schemes. No match means no change.
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 'urllink', text, flags=re.MULTILINE)
    processed_text = re.sub('[^A-Za-z]+', ' ', text).lower()
    processed_text = station.sub("ttcstation", processed_text)
    processed_text = photo.sub("photo", processed_text)
    ### 2. Get Lemma and conduct POS tagging
    input_str = nlp(processed_text)
    lemma_str = [token.lemma_ for token in input_str]
    filtered_str = ' '.join([w for w in lemma_str if w not in stop_words])
    return [filtered_str, get_keywords(input_str)]
           

ModuleNotFoundError: No module named 'spacy'

In [2]:
# Forward slashes resolve on Windows and POSIX alike; the backslash form only worked on Windows.
reddit_df = pd.read_csv('../Data/reddit_data_raw.csv')
reddit_df.columns

Index(['title', 'score', 'id', 'url', 'comms_num', 'created', 'body',
       'subreddit', 'Presto_label', 'keywords'],
      dtype='object')

In [3]:
# Pre-process every post body; tokenize returns [filtered string, keyword list] per row
processed_list = reddit_df['body'].apply(tokenize)

# Only the keyword lists feed the topic model
text_data = [keywords for _filtered, keywords in processed_list]


In [35]:
# Map each keyword to an integer id, then encode every document as a bag of words
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
# Context manager guarantees the pickle file is flushed and closed
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1, passes=15, random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in range(start, limit, step)
    and scored with a 'c_v' CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in range())
    start : First num of topics to try
    step : Increment between successive topic counts
    passes : Number of training passes handed to each LdaModel (default 15, as before)
    random_state : Seed forwarded to each LdaModel; None (default) leaves
                   training unseeded as before, an int makes the sweep repeatable

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Sweep settings are defined once so the training call and the x-axis cannot drift apart
start, limit, step = 2, 40, 6
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=text_data, start=start, limit=limit, step=step)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Trailing comma makes this a one-element tuple; a bare parenthesised string is
# iterated character by character, so the label would have been just "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [20]:
# Re-run of the coherence sweep. Settings are defined once so the training
# call and the x-axis cannot drift apart.
start, limit, step = 2, 40, 6
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=text_data, start=start, limit=limit, step=step)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Trailing comma makes this a one-element tuple; a bare parenthesised string is
# iterated character by character, so the label would have been just "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

KeyboardInterrupt: 

In [37]:
### Fit the final LDA model and print each topic with its 5 highest-weighted words
# NOTE(review): training is unseeded, so topics can differ between runs -- confirm whether that is intended
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.028*"urllink" + 0.014*"gas" + 0.009*"stations" + 0.008*"school" + 0.006*"ttcstation" + 0.006*"stickers" + 0.005*"kids" + 0.004*"food" + 0.004*"students" + 0.004*"free"')
(1, '0.033*"transit" + 0.023*"people" + 0.014*"public" + 0.012*"city" + 0.010*"toronto" + 0.007*"money" + 0.006*"tax" + 0.006*"think" + 0.006*"cost" + 0.006*"pay"')
(2, '0.034*"ttc" + 0.016*"presto" + 0.013*"people" + 0.013*"fare" + 0.009*"card" + 0.007*"police" + 0.006*"go" + 0.006*"think" + 0.005*"pay" + 0.005*"person"')
(3, '0.044*"ttcstation" + 0.026*"subway" + 0.016*"go" + 0.014*"ttc" + 0.012*"people" + 0.011*"bus" + 0.010*"train" + 0.008*"stations" + 0.007*"transit" + 0.007*"going"')
(4, '0.023*"transit" + 0.013*"toronto" + 0.013*"ford" + 0.012*"subway" + 0.012*"city" + 0.011*"government" + 0.011*"plan" + 0.010*"ontario" + 0.008*"province" + 0.006*"new"')


In [38]:
# Forward slashes: in a normal (non-raw) string '\M' and '\m' are invalid escape
# sequences that only work by accident, and backslash paths are Windows-only.
lda = gensim.models.ldamodel.LdaModel.load('../Models/model5.gensim')
# NOTE(review): assumes `corpus` and `dictionary` are the ones this saved model was trained on -- confirm
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [44]:
# Disabled export of the trained model and of the pyLDAvis page; uncomment to write them.
# Paths use forward slashes: in a normal string literal '\5' is an octal escape
# (character 0x05), so the backslash form would not name '5 topics.html'.
#ldamodel.save('../Models/model5.gensim')
#pyLDAvis.save_html(lda_display, '../Visualisations/5 topics.html')