### Obtained Perplexity: -7.07, Coherence: 0.49, Best Number of Topics= 45

In [1]:
#Basic Libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from topic_model_function import *



In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
num_topics=26
year='2019'
#Load Dataset
df=pd.read_csv('/home/smriti/Smriti/MITACS/Anxiety/Data/CSV/Medium/medium_text_2019.csv')
#getting rid of NaN
df=df.replace(np.nan, '', regex=True)
#Combining title and text
df["Post"] = df["title"] + df["text"]
#Now that we don't need Title or Text, we drop those columns before saving the file
df=df.drop(['title', 'text'], axis = 1)

In [4]:
df=df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.head()

Unnamed: 0,year,Post
0,2019,"My Year Without BoozeDrinking was my routine, ..."
1,2019,Reflections From the Bathroom FloorAs I lay in...
2,2019,OCD Medications Work (But You’ll Have to Try a...
3,2019,"Anxiety, a fight of two loversAnxiety is the i..."
4,2019,The Old Torture Chamber Of The HeartTo dismant...


In [6]:
# Convert to list
data=df.Post.values.tolist()
# Remove new line characters
data=[re.sub('\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data=[re.sub("\'", "", sent) for sent in data]

In [7]:
#Function to clean up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [8]:
#Clean up text
data_words=list(sent_to_words(data))
print(data_words[:1])

[['my', 'year', 'without', 'boozedrinking', 'was', 'my', 'routine', 'booze', 'was', 'my', 'pattern', 'red', 'wine', 'while', 'cooked', 'dinner', 'whiskey', 'while', 'got', 'ready', 'to', 'go', 'out', 'more', 'wine', 'for', 'an', 'afternoon', 'in', 'the', 'park', 'every', 'occasion', 'revolved', 'around', 'drinking', 'felt', 'fatigued', 'around', 'the', 'holidays', 'many', 'peers', 'started', 'talking', 'about', 'doing', 'sober', 'january', 'the', 'first', 'thought', 'in', 'my', 'head', 'was', 'wow', 'that', 'sounds', 'really', 'hard', 'that', 'thought', 'led', 'to', 'the', 'realization', 'that', 'if', 'it', 'seemed', 'hard', 'maybe', 'it', 'was', 'necessary', 'this', 'time', 'last', 'year', 'decided', 'to', 'attempt', 'sober', 'january', 'as', 'january', 'started', 'yoga', 'classes', 'replaced', 'the', 'hours', 'would', 'have', 'spent', 'drinking', 'with', 'friends', 'spent', 'my', 'th', 'birthday', 'on', 'saturday', 'january', 'th', 'organizing', 'my', 'kitchen', 'and', 'listening', '

In [9]:
# Build the bigram and trigram models
bigram=gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram=gensim.models.Phrases(bigram[data_words], threshold=100)

In [10]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod=gensim.models.phrases.Phraser(bigram)
trigram_mod=gensim.models.phrases.Phraser(trigram)

In [11]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['my', 'year', 'without', 'boozedrinking', 'was', 'my', 'routine', 'booze', 'was', 'my', 'pattern', 'red_wine', 'while', 'cooked', 'dinner', 'whiskey', 'while', 'got', 'ready', 'to', 'go', 'out', 'more', 'wine', 'for', 'an', 'afternoon', 'in', 'the', 'park', 'every', 'occasion', 'revolved_around', 'drinking', 'felt', 'fatigued', 'around', 'the', 'holidays', 'many', 'peers', 'started', 'talking', 'about', 'doing', 'sober', 'january', 'the', 'first', 'thought', 'in', 'my', 'head', 'was', 'wow', 'that', 'sounds', 'really', 'hard', 'that', 'thought', 'led', 'to', 'the', 'realization', 'that', 'if', 'it', 'seemed', 'hard', 'maybe', 'it', 'was', 'necessary', 'this', 'time', 'last', 'year', 'decided', 'to', 'attempt', 'sober', 'january', 'as', 'january', 'started', 'yoga', 'classes', 'replaced', 'the', 'hours', 'would', 'have', 'spent', 'drinking', 'with', 'friends', 'spent', 'my', 'th_birthday', 'on', 'saturday', 'january_th', 'organizing', 'my', 'kitchen', 'and', 'listening', 'to', 'podcast

In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


In [13]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
#----- CHANGED ------#

data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")

# 1. Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

# 2. Create Dictionary needed for topic modelling
id2word = corpora.Dictionary(data_lemmatized)

# 3. Create Corpus
texts = data_lemmatized

# 4. Term Document Frequency and Create a bag of words
bow_corpus = bow(dictionary=id2word, processed_docs=texts)

# 5. Calculate low_tfidf_words
# Keep only words with tfidf ranking <= x * len(dictionary)
x = 0.2
total_word_count, DictDocFreq = tf_df(bow_corpus, id2word)
sorted_TFIDF = sort_tfidf(bow_corpus, total_word_count, DictDocFreq)
low_tfidf_words = get_low_tfidf_words(x, id2word, sorted_TFIDF)

# 6. Filter out least frequently used words
no_below = 0.01
keep_n = 10000
dict_least_freq_filtered = filter_least_frequent(id2word, texts, 
                                                 no_below, keep_n)

# 7. Filter out most commonly used words (i.e. words with low TF-IDF score)
dict_tfidf_filtered = filter_most_common(dict_least_freq_filtered, low_tfidf_words)

# 8. Create the second bag of words - bow_corpus_TFIDFfiltered, 
# created after least frequently and most commonly used words were filtered out.
corpus = bow(dict_tfidf_filtered, texts)

# View
[[(dict_tfidf_filtered[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dict_tfidf_filtered,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

[['year', 'boozedrinke', 'routine', 'booze', 'pattern', 'cook', 'dinner', 'whiskey', 'get', 'ready', 'go', 'wine', 'afternoon', 'park', 'occasion', 'revolved_around', 'drinking', 'feel', 'fatigued', 'holiday', 'many', 'peer', 'start', 'talk', 'sober', 'january', 'first', 'think', 'head', 'sound', 'really', 'hard', 'thought', 'lead', 'realization', 'seem', 'hard', 'maybe', 'necessary', 'time', 'last', 'year', 'decide', 'attempt', 'sober', 'january', 'january', 'start', 'yoga', 'class', 'replace', 'hour', 'spend', 'drink', 'friend', 'spend', 'th_birthday', 'saturday', 'organizing', 'kitchen', 'listening', 'podcast', 'feel', 'draw', 'philosophical', 'content', 'krista', 'tippett', 'get', 'enough', 'social', 'justice', 'news', 'pod', 'save', 'people', 'deray', 'mckesson', 'read', 'book', 'even', 'voraciously', 'favorite', 'start', 'sing', 'unburied', 'sing', 'jesmyn', 'ward', 'red', 'clock', 'leni', 'zumas', 'immortalist', 'chloe', 'benjamin', 'see', 'complete', 'reading', 'list', 'book', 

In [15]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(25,
  '0.174*"god" + 0.072*"faith" + 0.045*"plane" + 0.037*"pray" + 0.035*"peace" '
  '+ 0.035*"church" + 0.033*"prayer" + 0.028*"fly" + 0.028*"lord" + '
  '0.024*"christian"'),
 (11,
  '0.186*"dog" + 0.149*"job" + 0.078*"cat" + 0.062*"pet" + 0.056*"interview" + '
  '0.041*"animal" + 0.037*"company" + 0.029*"america" + 0.028*"owner" + '
  '0.027*"employee"'),
 (21,
  '0.081*"therapy" + 0.079*"therapist" + 0.075*"student" + 0.060*"school" + '
  '0.050*"class" + 0.039*"college" + 0.026*"university" + 0.026*"session" + '
  '0.026*"teacher" + 0.023*"exam"'),
 (23,
  '0.146*"disorder" + 0.094*"symptom" + 0.039*"social" + 0.023*"treatment" + '
  '0.023*"cause" + 0.022*"person" + 0.021*"experience" + 0.020*"suffer" + '
  '0.019*"common" + 0.017*"condition"'),
 (14,
  '0.086*"medication" + 0.073*"doctor" + 0.051*"patient" + 0.047*"treatment" + '
  '0.038*"drug" + 0.028*"therapy" + 0.025*"pill" + 0.025*"treat" + '
  '0.024*"pain" + 0.023*"medical"'),
 (5,
  '0.055*"book" + 0.039*"goal" + 0.03

In [16]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.810747149991645


In [17]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4765324078666899


In [18]:
topic_words = []
for i in range(num_topics):
    tt = lda_model.get_topic_terms(i,10)
    topic_words.append([id2word[pair[0]] for pair in tt])

In [19]:
for i in range(0,num_topics):
    print(topic_words[i])

['feeling', 'self', 'control', 'situation', 'learn', 'change', 'thought', 'anxious', 'happen', 'become']
['fear', 'panic_attack', 'panic', 'attack', 'brain', 'trigger', 'ocd', 'afraid', 'threat', 'danger']
['brain', 'study', 'increase', 'effect', 'research', 'show', 'also', 'technology', 'use', 'report']
['start', 'new', 'change', 'keep', 'year', 'list', 'give', 'many', 'spend', 'also']
['child', 'kid', 'parent', 'school', 'family', 'mother', 'young', 'mom', 'year', 'old']
['book', 'goal', 'business', 'decision', 'failure', 'value', 'coach', 'career', 'money', 'success']
['experience', 'also', 'many', 'however', 'often', 'state', 'form', 'issue', 'example', 'follow']
['love', 'relationship', 'self', 'woman', 'care', 'man', 'trauma', 'pain', 'partner', 'emotional']
['look', 'walk', 'room', 'back', 'turn', 'smile', 'eye', 'leave', 'sit', 'light']
['com', 'social_media', 'post', 'online', 'instagram', 'app', 'video', 'phone', 'facebook', 'content']
['stress', 'body', 'exercise', 'physical

In [20]:
df=pd.DataFrame({'Year':[],'Source':[],'Topic_ID':[],'Most_freq_words':[]})
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words


In [21]:
df['Most_freq_words']=topic_words

In [22]:
df = df.assign(Year=year)
df = df.assign(Source='Medium')
df.shape

(26, 4)

In [23]:
ls=[]
for i in range(0,num_topics):
    ls.append(i)
df['Topic_ID']=ls

In [24]:
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words
0,2019,Medium,0,"[feeling, self, control, situation, learn, cha..."
1,2019,Medium,1,"[fear, panic_attack, panic, attack, brain, tri..."
2,2019,Medium,2,"[brain, study, increase, effect, research, sho..."
3,2019,Medium,3,"[start, new, change, keep, year, list, give, m..."
4,2019,Medium,4,"[child, kid, parent, school, family, mother, y..."


In [25]:
df.to_csv("topic_words_m2019.csv")