In [1]:
#Basic Libraries
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from topic_model_function import *



In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
#Load Dataset
df=pd.read_csv('/home/smriti/Smriti/MITACS/Anxiety/Data/CSV/Academic/Acad_2020.csv')
#getting rid of NaN
df=df.replace(np.nan, '', regex=True)
#Combining title and text
df["Text"] = df["Title"] + df["Abstract"]
#Now that we don't need Title or Text, we drop those columns before saving the file
df=df.drop(['Title', 'Abstract'], axis = 1)

In [4]:
df=df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [5]:
df.head()

Unnamed: 0,StoreId,documentType,year,Text
0,2445966824.0,Journal Article,2020,Threat rapidly disrupts reward reversal learni...
1,2445966027.0,Journal Article,2020,Fear in the context of pain: Lessons learned f...
2,2445965242.0,Journal Article,2020,The effects of positive interpretation bias on...
3,2435222091.0,"Evidence Based Healthcare , Journal Article",2020,Implementation and effectiveness of adolescent...
4,2445966078.0,Journal Article,2020,The effects of age and trait anxiety on avoida...


In [6]:
# Convert to list
data=df.Text.values.tolist()
# Remove new line characters
data=[re.sub('\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data=[re.sub("\'", "", sent) for sent in data]

In [7]:
#Function to clean up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) 

In [8]:
#Clean up text
data_words=list(sent_to_words(data))
print(data_words[:1])

[['threat', 'rapidly', 'disrupts', 'reward', 'reversal', 'learningthreat', 'changes', 'cognition', 'and', 'facilitates', 'adaptive', 'coping', 'however', 'when', 'threat', 'becomes', 'overwhelming', 'it', 'may', 'be', 'deleterious', 'to', 'mental', 'health', 'especially', 'for', 'vulnerable', 'individuals', 'flexible', 'decision', 'making', 'was', 'probed', 'with', 'reward', 'reversal', 'task', 'to', 'investigate', 'how', 'well', 'healthy', 'participants', 'can', 'adapt', 'to', 'changes', 'in', 'reward', 'contingency', 'when', 'they', 'expect', 'adverse', 'events', 'electric', 'shocks', 'in', 'comparison', 'to', 'safe', 'control', 'condition', 'the', 'threat', 'of', 'shock', 'significantly', 'impaired', 'reward', 'reversal', 'learning', 'moreover', 'enhanced', 'self', 'reported', 'threat', 'ratings', 'and', 'elevated', 'skin', 'conductance', 'levels', 'support', 'the', 'successful', 'induction', 'of', 'stressful', 'and', 'aversive', 'apprehensions', 'the', 'findings', 'are', 'in', 'lin

In [9]:
# Build the bigram and trigram models
bigram=gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram=gensim.models.Phrases(bigram[data_words], threshold=100)

In [10]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod=gensim.models.phrases.Phraser(bigram)
trigram_mod=gensim.models.phrases.Phraser(trigram)

In [11]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['threat', 'rapidly', 'disrupts', 'reward', 'reversal', 'learningthreat', 'changes', 'cognition', 'and', 'facilitates', 'adaptive', 'coping', 'however', 'when', 'threat', 'becomes', 'overwhelming', 'it', 'may', 'be', 'deleterious', 'to', 'mental', 'health', 'especially', 'for', 'vulnerable', 'individuals', 'flexible', 'decision_making', 'was', 'probed', 'with', 'reward', 'reversal', 'task', 'to', 'investigate', 'how', 'well', 'healthy', 'participants', 'can', 'adapt', 'to', 'changes', 'in', 'reward', 'contingency', 'when', 'they', 'expect', 'adverse_events', 'electric', 'shocks', 'in', 'comparison', 'to', 'safe', 'control', 'condition', 'the', 'threat', 'of', 'shock', 'significantly', 'impaired', 'reward', 'reversal', 'learning', 'moreover', 'enhanced', 'self', 'reported', 'threat', 'ratings', 'and', 'elevated', 'skin_conductance', 'levels', 'support', 'the', 'successful', 'induction', 'of', 'stressful', 'and', 'aversive', 'apprehensions', 'the', 'findings', 'are', 'in', 'line', 'with'

In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


In [13]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
#! python3 -m spacy download en_core_web_sm

In [15]:
#----- CHANGED ------#

data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")

# 1. Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

# 2. Create Dictionary needed for topic modelling
id2word = corpora.Dictionary(data_lemmatized)

# 3. Create Corpus
texts = data_lemmatized

# 4. Term Document Frequency and Create a bag of words
bow_corpus = bow(dictionary=id2word, processed_docs=texts)

# 5. Calculate low_tfidf_words
# Keep only words with tfidf ranking <= x * len(dictionary)
x = 0.2
total_word_count, DictDocFreq = tf_df(bow_corpus, id2word)
sorted_TFIDF = sort_tfidf(bow_corpus, total_word_count, DictDocFreq)
low_tfidf_words = get_low_tfidf_words(x, id2word, sorted_TFIDF)

# 6. Filter out least frequently used words
no_below = 0.01
keep_n = 10000
dict_least_freq_filtered = filter_least_frequent(id2word, texts, 
                                                 no_below, keep_n)

# 7. Filter out most commonly used words (i.e. words with low TF-IDF score)
dict_tfidf_filtered = filter_most_common(dict_least_freq_filtered, low_tfidf_words)

# 8. Create the second bag of words - bow_corpus_TFIDFfiltered, 
# created after least frequently and most commonly used words were filtered out.
corpus = bow(dict_tfidf_filtered, texts)

# View
[[(dict_tfidf_filtered[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dict_tfidf_filtered,
                                           num_topics=18,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

[['threat', 'rapidly', 'disrupt', 'reward', 'reversal', 'learningthreat', 'change', 'cognition', 'facilitate', 'adaptive', 'cope', 'however', 'threat', 'become', 'overwhelming', 'deleterious', 'mental', 'health', 'especially', 'vulnerable', 'individual', 'flexible', 'decision_making', 'probe', 'reward', 'reversal', 'task', 'investigate', 'well', 'healthy', 'participant', 'adapt', 'change', 'reward', 'contingency', 'expect', 'adverse_event', 'electric', 'shock', 'comparison', 'safe', 'control', 'condition', 'threat', 'shock', 'significantly', 'impair', 'reward', 'reversal', 'learn', 'moreover', 'enhanced', 'self', 'report', 'threat', 'rating', 'elevate', 'level', 'support', 'successful', 'induction', 'stressful', 'aversive', 'apprehension', 'finding', 'line', 'literature', 'show', 'stress', 'induce', 'inhibition', 'goal_directe', 'behavior', 'advantage', 'reflexive', 'habitual', 'response', 'style', 'notably', 'reversal', 'learning', 'rapidly', 'restore', 'omission', 'threat', 'several'

In [16]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.048*"scale" + 0.026*"measure" + 0.024*"score" + 0.023*"gad" + '
  '0.022*"item" + 0.022*"age" + 0.021*"assess" + 0.018*"analysis" + '
  '0.018*"factor" + 0.015*"sex"'),
 (1,
  '0.052*"risk" + 0.036*"factor" + 0.033*"association" + 0.024*"limitation" + '
  '0.017*"early" + 0.016*"associate" + 0.016*"ptsd" + 0.015*"follow" + '
  '0.015*"psychiatric" + 0.014*"datum"'),
 (2,
  '0.074*"covid" + 0.032*"psychological" + 0.030*"ci" + 0.030*"pandemic" + '
  '0.029*"prevalence" + 0.023*"stress" + 0.020*"factor" + 0.017*"impact" + '
  '0.017*"survey" + 0.017*"high"'),
 (3,
  '0.030*"sleep" + 0.023*"effect" + 0.020*"response" + 0.014*"increase" + '
  '0.013*"network" + 0.013*"change" + 0.012*"level" + 0.012*"time" + '
  '0.012*"suggest" + 0.011*"brain"'),
 (4,
  '0.091*"woman" + 0.075*"depressive" + 0.067*"student" + 0.064*"pregnancy" + '
  '0.051*"year" + 0.034*"medical" + 0.025*"high" + 0.025*"ci" + '
  '0.024*"perinatal" + 0.023*"stressor"'),
 (5,
  '0.071*"treatment" + 0.048*"interve

In [17]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.393260939353818


In [18]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.35345393129538905


In [19]:
num_topics = 18
topic_words = []
for i in range(num_topics):
    tt = lda_model.get_topic_terms(i,10)
    topic_words.append([id2word[pair[0]] for pair in tt])

In [20]:
for i in range(0,18):
    print(topic_words[i])

['scale', 'measure', 'score', 'gad', 'item', 'age', 'assess', 'analysis', 'factor', 'sex']
['risk', 'factor', 'association', 'limitation', 'early', 'associate', 'ptsd', 'follow', 'psychiatric', 'datum']
['covid', 'psychological', 'ci', 'pandemic', 'prevalence', 'stress', 'factor', 'impact', 'survey', 'high']
['sleep', 'effect', 'response', 'increase', 'network', 'change', 'level', 'time', 'suggest', 'brain']
['woman', 'depressive', 'student', 'pregnancy', 'year', 'medical', 'high', 'ci', 'perinatal', 'stressor']
['treatment', 'intervention', 'group', 'therapy', 'week', 'participant', 'post', 'follow', 'change', 'trial']
['distress', 'psychological', 'cope', 'worker', 'vulnerability', 'transdiagnostic', 'resilience', 'staff', 'pain', 'medical']
['patient', 'clinical', 'high', 'compare', 'depressive', 'non', 'associate', 'score', 'severity', 'characteristic']
['alcohol', 'family', 'loneliness', 'share', 'people', 'case', 'report', 'community', 'question', 'error']
['cognitive', 'trait', 

In [21]:
df=pd.DataFrame({'Year':[],'Source':[],'Topic_ID':[],'Most_freq_words':[]})
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words


In [22]:
df['Most_freq_words']=topic_words

In [23]:
df = df.assign(Year='2020')
df = df.assign(Source='Academic')
df.shape

(18, 4)

In [24]:
ls=[]
for i in range(0,18):
    ls.append(i)
df['Topic_ID']=ls

In [25]:
df.head()

Unnamed: 0,Year,Source,Topic_ID,Most_freq_words
0,2020,Academic,0,"[scale, measure, score, gad, item, age, assess..."
1,2020,Academic,1,"[risk, factor, association, limitation, early,..."
2,2020,Academic,2,"[covid, psychological, ci, pandemic, prevalenc..."
3,2020,Academic,3,"[sleep, effect, response, increase, network, c..."
4,2020,Academic,4,"[woman, depressive, student, pregnancy, year, ..."


In [26]:
df.to_csv("topic_words_j2020.csv")