In [1]:
#Basic Libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from topic_model_function import *



In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
num_topics=35
#Load Dataset
df=pd.read_csv('/home/smriti/Smriti/MITACS/Anxiety/Data/CSV/Reddit/rAnxiety20.csv')
#getting rid of NaN
df=df.replace(np.nan, '', regex=True)
#getting rid of deleted values
df['Text']=df['Text'].replace('[deleted]','')
#Combining title and text
df["Post"] = df["Title"] + df["Text"]
#Now that we don't need Title or Text, we drop those columns before saving the file
df=df.drop(['Title', 'Text'], axis = 1)

In [4]:
df=df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.head()

Unnamed: 0,Number of Comments,Date Posted,Post
0,55,2020-04-15 15:12:25 EDT-0400,When you write a freaking essay on here and th...
1,95,2020-04-16 09:38:14 EDT-0400,I wish I knew what it was like to not have anx...
2,62,2020-04-16 01:41:14 EDT-0400,Probably the most exhausting part about anxiet...
3,161,2020-04-16 21:48:01 EDT-0400,Has anyone else ever exerienced a period where...
4,122,2020-04-15 06:34:07 EDT-0400,I fucking love ProzacIt calms me down sooo muc...


In [6]:
# Convert to list
data=df.Post.values.tolist()
# Remove new line characters
data=[re.sub('\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data=[re.sub("\'", "", sent) for sent in data]

In [7]:
#Function to clean up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [8]:
#Clean up text
data_words=list(sent_to_words(data))
print(data_words[:1])

[['when', 'you', 'write', 'freaking', 'essay', 'on', 'here', 'and', 'then', 'delete', 'it', 'because', 'you', 'know', 'anxiety']]


In [9]:
# Build the bigram and trigram models
bigram=gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram=gensim.models.Phrases(bigram[data_words], threshold=100)

In [10]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod=gensim.models.phrases.Phraser(bigram)
trigram_mod=gensim.models.phrases.Phraser(trigram)

In [11]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['when', 'you', 'write', 'freaking', 'essay', 'on', 'here', 'and', 'then', 'delete', 'it', 'because', 'you', 'know', 'anxiety']


In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


In [13]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [17]:
#----- CHANGED ------#

data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
!python3 -m spacy download en
nlp = spacy.load('en')

# 1. Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

# 2. Create Dictionary needed for topic modelling
id2word = corpora.Dictionary(data_lemmatized)

# 3. Create Corpus
texts = data_lemmatized

# 4. Term Document Frequency and Create a bag of words
bow_corpus = bow(dictionary=id2word, processed_docs=texts)

# 5. Calculate low_tfidf_words
# Keep only words with tfidf ranking <= x * len(dictionary)
x = 0.2
total_word_count, DictDocFreq = tf_df(bow_corpus, id2word)
sorted_TFIDF = sort_tfidf(bow_corpus, total_word_count, DictDocFreq)
low_tfidf_words = get_low_tfidf_words(x, id2word, sorted_TFIDF)

# 6. Filter out least frequently used words
no_below = 0.01
keep_n = 10000
dict_least_freq_filtered = filter_least_frequent(id2word, texts, 
                                                 no_below, keep_n)

# 7. Filter out most commonly used words (i.e. words with low TF-IDF score)
dict_tfidf_filtered = filter_most_common(dict_least_freq_filtered, low_tfidf_words)

# 8. Create the second bag of words - bow_corpus_TFIDFfiltered, 
# created after least frequently and most commonly used words were filtered out.
corpus = bow(dict_tfidf_filtered, texts)

# View
[[(dict_tfidf_filtered[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dict_tfidf_filtered,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 10.7 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.1.0-py3-none-any.whl size=11074433 sha256=eec185cf31a95f0a46c7f219f209ab0c3f4ca72ec763f13254112993804e9c4e
  Stored in directory: /tmp/pip-ephem-wheel-cache-bdj_vf11/wheels/83/63/50/6467284e1c2d50ffcf02e6582680fd16c6375748a94f75c1a0
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.0.0
    Uninstalling en-core-web-sm-3.0.0:
      Successfully uninstalled en-core-web-sm-3.0.0
Successfully installed en-core-web-sm-2.1.0
[38;5;2m✔ Download and insta

In [18]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(24,
  '0.001*"annoy" + 0.001*"guess" + 0.001*"meet" + 0.001*"old" + 0.001*"panic" '
  '+ 0.001*"stand" + 0.001*"terrify" + 0.001*"treat" + 0.001*"reality" + '
  '0.001*"anymore"'),
 (11,
  '0.001*"annoy" + 0.001*"guess" + 0.001*"meet" + 0.001*"old" + 0.001*"panic" '
  '+ 0.001*"stand" + 0.001*"terrify" + 0.001*"treat" + 0.001*"reality" + '
  '0.001*"anymore"'),
 (10,
  '0.433*"write" + 0.326*"freak" + 0.085*"delete" + 0.000*"know" + '
  '0.000*"terrify" + 0.000*"old" + 0.000*"reality" + 0.000*"treat" + '
  '0.000*"extra" + 0.000*"panic"'),
 (29,
  '0.258*"leave" + 0.189*"husband" + 0.159*"offer" + 0.137*"nee" + '
  '0.102*"part" + 0.000*"go" + 0.000*"treat" + 0.000*"stand" + 0.000*"reality" '
  '+ 0.000*"lack"'),
 (33,
  '0.334*"test" + 0.300*"covid" + 0.196*"remove" + 0.039*"positive" + '
  '0.000*"parent" + 0.000*"reality" + 0.000*"annoy" + 0.000*"extra" + '
  '0.000*"stand" + 0.000*"treat"'),
 (18,
  '0.231*"close" + 0.214*"constant" + 0.095*"family" + 0.091*"question" + '
  '0.07

In [19]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.6688319699720005


In [20]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3004801663714596


In [21]:
topic_words = []
for i in range(num_topics):
    tt = lda_model.get_topic_terms(i,10)
    topic_words.append([id2word[pair[0]] for pair in tt])

In [22]:
for i in range(0,num_topics):
    print(topic_words[i])

['year', 'house', 'anymore', 'old', 'panic_attack', 'panic', 'reach', 'disorder', 'ago', 'grow']
['anxious', 'afraid', 'reason', 'anyone_else', 'health', 'new', 'cause', 'message', 'cycle', 'problem']
['brain', 'body', 'right', 'lately', 'severe', 'guess', 'extra', 'panic_attack', 'suddenly', 'hard']
['call', 'something', 'constantly', 'happen', 'wrong', 'phone', 'worried', 'get', 'say', 'sometimes']
['anyone', 'experience', 'someone', 'advice', 'super', 'see', 'nervous', 'sound', 'least', 'shake']
['talk', 'stop', 'support', 'think', 'wanna', 'hate', 'literally', 'pretty', 'nothing', 'far']
['finally', 'week', 'put', 'hour', 'take', 'day', 'today', 'home', 'start', 'panic_attack']
['sleep', 'eat', 'night', 'morning', 'tomorrow', 'terrible', 'vent', 'sick', 'allow', 'today']
['post', 'high', 'side', 'nausea', 'anyone', 'pandemic', 'struggle', 'helpful', 'pill', 'graduate']
['attack', 'thinking', 'arm', 'throat', 'heart_rate', 'tight', 'usually', 'chance', 'love', 'increase']
['write', 

In [23]:
df=pd.DataFrame({'Year':[],'Source':[],'Topic_ID':[],'Most_freq_words':[]})
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words


In [24]:
df['Most_freq_words']=topic_words

In [25]:
df = df.assign(Year='2020')
df = df.assign(Source='Reddit')

In [26]:
ls=[]
for i in range(0,num_topics):
    ls.append(i)
df['Topic_ID']=ls

In [27]:
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words
0,2020,Reddit,0,"[year, house, anymore, old, panic_attack, pani..."
1,2020,Reddit,1,"[anxious, afraid, reason, anyone_else, health,..."
2,2020,Reddit,2,"[brain, body, right, lately, severe, guess, ex..."
3,2020,Reddit,3,"[call, something, constantly, happen, wrong, p..."
4,2020,Reddit,4,"[anyone, experience, someone, advice, super, s..."


In [28]:
df.to_csv("topic_words_r2020.csv")