In [1]:
#Basic Libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from topic_model_function import *



In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
num_topics=17
#Load Dataset
df=pd.read_csv('/home/smriti/Smriti/MITACS/Anxiety/Data/CSV/Reddit/rAnxiety16.csv')
#getting rid of NaN
df=df.replace(np.nan, '', regex=True)
#getting rid of deleted values
df['Text']=df['Text'].replace('[deleted]','')
#Combining title and text
df["Post"] = df["Title"] + df["Text"]
#Now that we don't need Title or Text, we drop those columns before saving the file
df=df.drop(['Title', 'Text'], axis = 1)

In [4]:
df=df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.head()

Unnamed: 0,Number of Comments,Date Posted,Post
0,53,2016-01-06 20:10:45 EST-0500,You know you have social anxiety when you say ...
1,4,2016-01-01 02:22:08 EST-0500,"15 TED talks on anxiety, fear, and mental well..."
2,89,2016-01-02 02:24:49 EST-0500,Anyone else strangely calm in actual emergenci...
3,16,2016-01-03 07:58:52 EST-0500,"Fuck social anxiety, this is going to be the y..."
4,36,2016-01-02 19:38:52 EST-0500,DAE feel like they are constantly feeling watc...


In [6]:
# Convert to list
data=df.Post.values.tolist()
# Remove new line characters
data=[re.sub('\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data=[re.sub("\'", "", sent) for sent in data]

In [7]:
#Function to clean up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [8]:
#Clean up text
data_words=list(sent_to_words(data))
print(data_words[:1])

[['you', 'know', 'you', 'have', 'social', 'anxiety', 'when', 'you', 'say', 'hope', 'did', 'ok', 'after', 'youve', 'had', 'conversation', 'with', 'someone']]


In [9]:
# Build the bigram and trigram models
bigram=gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram=gensim.models.Phrases(bigram[data_words], threshold=100)

In [10]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod=gensim.models.phrases.Phraser(bigram)
trigram_mod=gensim.models.phrases.Phraser(trigram)

In [11]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['you', 'know', 'you', 'have', 'social', 'anxiety', 'when', 'you', 'say', 'hope', 'did', 'ok', 'after', 'youve', 'had', 'conversation', 'with', 'someone']


In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


In [13]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
#----- CHANGED ------#

data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")

# 1. Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

# 2. Create Dictionary needed for topic modelling
id2word = corpora.Dictionary(data_lemmatized)

# 3. Create Corpus
texts = data_lemmatized

# 4. Term Document Frequency and Create a bag of words
bow_corpus = bow(dictionary=id2word, processed_docs=texts)

# 5. Calculate low_tfidf_words
# Keep only words with tfidf ranking <= x * len(dictionary)
x = 0.2
total_word_count, DictDocFreq = tf_df(bow_corpus, id2word)
sorted_TFIDF = sort_tfidf(bow_corpus, total_word_count, DictDocFreq)
low_tfidf_words = get_low_tfidf_words(x, id2word, sorted_TFIDF)

# 6. Filter out least frequently used words
no_below = 0.01
keep_n = 10000
dict_least_freq_filtered = filter_least_frequent(id2word, texts, 
                                                 no_below, keep_n)

# 7. Filter out most commonly used words (i.e. words with low TF-IDF score)
dict_tfidf_filtered = filter_most_common(dict_least_freq_filtered, low_tfidf_words)

# 8. Create the second bag of words - bow_corpus_TFIDFfiltered, 
# created after least frequently and most commonly used words were filtered out.
corpus = bow(dict_tfidf_filtered, texts)

# View
[[(dict_tfidf_filtered[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dict_tfidf_filtered,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

[['know', 'social', 'anxiety', 'say', 'hope', 've', 'conversation']]
There are 842 unique words in the dictionary, 842 remain after filtering out lest frequent.
842 remain after filtering out most commonly used words based on tfidf scores.


In [15]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.052*"go" + 0.028*"m" + 0.023*"day" + 0.022*"life" + 0.018*"think" + '
  '0.018*"make" + 0.016*"year" + 0.015*"time" + 0.014*"thing" + 0.012*"get"'),
 (1,
  '0.098*"happy" + 0.073*"open" + 0.065*"movie" + 0.065*"door" + 0.059*"love" '
  '+ 0.048*"light" + 0.046*"name" + 0.038*"true" + 0.038*"draw" + '
  '0.035*"allow"'),
 (2,
  '0.034*"work" + 0.033*"week" + 0.027*"help" + 0.027*"take" + 0.024*"doctor" '
  '+ 0.021*"change" + 0.020*"therapy" + 0.017*"therapist" + 0.017*"experience" '
  '+ 0.017*"medication"'),
 (3,
  '0.065*"grow" + 0.061*"away" + 0.059*"family" + 0.058*"man" + '
  '0.055*"outside" + 0.054*"early" + 0.048*"move" + 0.039*"scared" + '
  '0.037*"stay" + 0.036*"parent"'),
 (4,
  '0.058*"feel" + 0.047*"people" + 0.033*"make" + 0.031*"know" + 0.027*"want" '
  '+ 0.026*"even" + 0.026*"always" + 0.025*"m" + 0.023*"think" + 0.022*"talk"'),
 (5,
  '0.177*"exercise" + 0.170*"meditation" + 0.097*"alcohol" + 0.088*"great" + '
  '0.065*"intrusive_thought" + 0.058*"self" + 0

In [16]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -6.698158400337519


In [17]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.33735637119195017


In [18]:
topic_words = []
for i in range(num_topics):
    tt = lda_model.get_topic_terms(i,10)
    topic_words.append([id2word[pair[0]] for pair in tt])

In [19]:
for i in range(0,num_topics):
    print(topic_words[i])

['go', 'm', 'day', 'life', 'think', 'make', 'year', 'time', 'thing', 'get']
['happy', 'open', 'movie', 'door', 'love', 'light', 'name', 'true', 'draw', 'allow']
['work', 'week', 'help', 'take', 'doctor', 'change', 'therapy', 'therapist', 'experience', 'medication']
['grow', 'away', 'family', 'man', 'outside', 'early', 'move', 'scared', 'stay', 'parent']
['feel', 'people', 'make', 'know', 'want', 'even', 'always', 'm', 'think', 'talk']
['exercise', 'meditation', 'alcohol', 'great', 'intrusive_thought', 'self', 'important', 'pm', 'day', 'strong']
['sleep', 'hour', 'night', 'day', 'wake', 'bed', 'work', 'week', 'car', 'yesterday']
['anyone_else', 'rather', 'loud', 'stand', 'phone', 'music', 'mess', 'noise', 'remove', 'animal']
['job', 'work', 'money', 'due', 'pay', 'decision', 'full', 'interact', 'interaction', 'parent']
['fight', 'breathe', 'breath', 'respond', 'breathing', 'destroy', 'control', 'catch', 'lack', 'race']
['thought', 'laugh', 'thinking', 'negative', 'eat', 'plan', 'present

In [20]:
df=pd.DataFrame({'Year':[],'Source':[],'Topic_ID':[],'Most_freq_words':[]})
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words


In [21]:
df['Most_freq_words']=topic_words

In [22]:
df = df.assign(Year='2016')
df = df.assign(Source='Reddit')

In [23]:
ls=[]
for i in range(0,num_topics):
    ls.append(i)
df['Topic_ID']=ls

In [24]:
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words
0,2016,Reddit,0,"[go, m, day, life, think, make, year, time, th..."
1,2016,Reddit,1,"[happy, open, movie, door, love, light, name, ..."
2,2016,Reddit,2,"[work, week, help, take, doctor, change, thera..."
3,2016,Reddit,3,"[grow, away, family, man, outside, early, move..."
4,2016,Reddit,4,"[feel, people, make, know, want, even, always,..."


In [25]:
df.to_csv("topic_words_r2016.csv")