# Topic Modelling using LDA (Latent  Dirichlet Allocation)

In [1]:
import eland as ed
from eland.conftest import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from nltk.corpus import stopwords
import pyLDAvis.gensim
import pickle 
import pyLDAvis
from pprint import pprint
import gensim.corpora as corpora
from gensim.models import CoherenceModel, Phrases
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

### Importing the data from Elasticsearch

In [9]:
ed_df = ed.DataFrame('localhost', 'twitter', columns=['full_text_processed', 'user_id', 'verified', 'name', 'location', 'entities.hashtags.text', 'entities.user_mentions.name'])

# defining the full-text query we need: Retrieving records for full_text_processed with the condition is_retweet=False and is_quote_status=False
query_unique = {
    "bool": {
        "must": {
            "term":{"is_retweet":"false"},
        },
        "filter": {
            "term":{"is_quote_status":"false"}
        },
    }
}
# using full-text search capabilities with Eland:
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [10]:
df_tweets.head()

Unnamed: 0,full_text_processed,user_id,verified,name,location,entities.hashtags.text,entities.user_mentions.name
1264160647002103808,praying everyone affected condolence family vi...,1256622599364214786,False,The Meraaki,"Ahmadabad City, India",AmphanSuperCyclone,
1264160609668599808,cyclone ampan people satkhira upset due lack w...,1251934220345208832,False,Newspapers,Dhaka,,
1264121161589415936,cyclone amphan ha completely destroyed agricul...,1251934220345208832,False,Newspapers,Dhaka,,
1264160569315209216,amphan cyclone ​​cm mamta demand ban labor spe...,1113075640499036160,False,netvani,,,
1264114187346874368,amfan storm caused devastation bengal mp nusra...,1113075640499036160,False,netvani,,,


## Tokenising and removing short tweets (less than 4 words) 

In [11]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [12]:
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: remove_emoji(x))
df_tweets['full_text_tokens'] = df_tweets['full_text_processed'].apply(lambda x: [w for w in x.split()])
df_tweets['length'] = df_tweets['full_text_tokens'].apply(lambda x: len(x))
df_tweets = df_tweets[df_tweets['length']>4]

In [13]:
df_tweets.head()

Unnamed: 0,full_text_processed,user_id,verified,name,location,entities.hashtags.text,entities.user_mentions.name,full_text_tokens,length
1264160647002103808,praying everyone affected condolence family vi...,1256622599364214786,False,The Meraaki,"Ahmadabad City, India",AmphanSuperCyclone,,"[praying, everyone, affected, condolence, fami...",8
1264160609668599808,cyclone ampan people satkhira upset due lack w...,1251934220345208832,False,Newspapers,Dhaka,,,"[cyclone, ampan, people, satkhira, upset, due,...",9
1264121161589415936,cyclone amphan ha completely destroyed agricul...,1251934220345208832,False,Newspapers,Dhaka,,,"[cyclone, amphan, ha, completely, destroyed, a...",9
1264160569315209216,amphan cyclone ​​cm mamta demand ban labor spe...,1113075640499036160,False,netvani,,,,"[amphan, cyclone, ​​cm, mamta, demand, ban, la...",15
1264114187346874368,amfan storm caused devastation bengal mp nusra...,1113075640499036160,False,netvani,,,,"[amfan, storm, caused, devastation, bengal, mp...",9


## Building Bigram and Trigram models

In [15]:
stop_words = stopwords.words('english')
stop_words.extend(['from','not', 'would', 'say', 'could', '_', 'be', 'go', 'do', 'rather', 'seem', 'due', 'via', 'done', 'said'])

tweets_list = df_tweets.full_text_tokens.to_list()
tweet_ids = df_tweets.index.to_list()

# Build the bigram and trigram models

bigram = Phrases(tweets_list, min_count=10, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[tweets_list], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [16]:
tweets = [[word for word in gensim.utils.simple_preprocess(str(tweet))] for tweet in tweets_list]
tweets = [bigram_mod[tweet] for tweet in tweets]
tweets = [trigram_mod[bigram_mod[tweet]] for tweet in tweets]

## Building the Topic Model

In [17]:
# Create Dictionary
tweets_dict = corpora.Dictionary(tweets)

# Filtering extremes by removing tokens occuring in less than 10 tweets and have occured in more than 90% tweets
tweets_dict.filter_extremes(no_below=10, no_above=0.9)

# Create Corpus: Term Document Frequency
corpus = [tweets_dict.doc2bow(twt) for twt in tweets]

# Adding the TF-IDF for better insight 
tfidf = gensim.models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

## Based on Hyperparameter optimization - trying 2 approaches:
- Topics = 6, Alpha = 0.01
- Topics = 10, Alpha = 1

In [29]:
# LDA Model Parameters

NUM_TOPICS_1 = 10
ALPHA_1 = 1
NUM_TOPICS_2 = 6
ALPHA_2 = 0.01

In [30]:
def lda_model_build(corpus, dictionary, topics, alpha, texts):
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=topics, 
                                            random_state=100,
                                            passes=10,
                                            alpha=alpha,
                                            per_word_topics=True)
    
    print("\nModel, Topics=",topics)
    pprint(lda_model.print_topics())
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    return lda_model, coherence_lda

In [20]:
# Build first model - Topics=10
lda_model_1, score_1 = lda_model_build(corpus=tfidf_corpus, dictionary=tweets_dict, topics=NUM_TOPICS_1, alpha=ALPHA_1, texts=tweets)

# Build first model - Topics=6
lda_model_2, score_2 = lda_model_build(corpus=tfidf_corpus, dictionary=tweets_dict, topics=NUM_TOPICS_2, alpha=ALPHA_2, texts=tweets)


Model, Topics= 10
[(0,
  '0.038*"kolkata" + 0.023*"hurricane" + 0.022*"day" + 0.021*"disaster" + '
  '0.019*"indian" + 0.018*"devastated" + 0.017*"many" + 0.015*"see" + '
  '0.013*"part" + 0.013*"loss"'),
 (1,
  '0.040*"ha" + 0.032*"amp" + 0.031*"help" + 0.025*"people" + 0.020*"bjp" + '
  '0.018*"please" + 0.015*"home" + 0.015*"family" + 0.013*"victim" + '
  '0.012*"get"'),
 (2,
  '0.038*"time" + 0.033*"covid" + 0.016*"house" + 0.016*"due" + 0.015*"crisis" '
  '+ 0.013*"coming" + 0.012*"lockdown" + 0.011*"devastating" + 0.011*"made" + '
  '0.010*"already"'),
 (3,
  '0.036*"via" + 0.032*"damage" + 0.030*"relief" + 0.027*"government" + '
  '0.026*"amphan" + 0.026*"cyclone" + 0.025*"caused" + 0.023*"devastation" + '
  '0.022*"live" + 0.021*"update"'),
 (4,
  '0.083*"india" + 0.063*"bangladesh" + 0.048*"cyclone" + 0.033*"amphan" + '
  '0.025*"dead" + 0.025*"people" + 0.024*"million" + 0.023*"make" + '
  '0.023*"landfall" + 0.020*"least"'),
 (5,
  '0.021*"corona" + 0.017*"like" + 0.013*"co

In [22]:
## Coherence Scores

print("Model 1 - Topics = 10, Score =",score_1)
print("Model 2 - Topics = 6, Score =",score_2)

Model 1 - Topics = 10, Score = 0.4445514737374384


## Further analysis on Model 1 (Topics = 10)
Addressing certain questions and extracting more information out of the topics

## Extracting Dominant Topic for each tweet and its percentage contribution

In [23]:
def get_dominant_topic(lda_model, corpus):
    tweet_topics = []
    tweet_topics_percent = []
    for tweet in tfidf_corpus:
        topics_dist = lda_model.get_document_topics(tweet)
        dom_topic, percent = max(topics_dist, key=lambda item:item[1])
        tweet_topics.append(dom_topic)
        tweet_topics_percent.append(percent)
    return tweet_topics, tweet_topics_percent

In [41]:
tweet_topics, tweet_topics_percent = get_dominant_topic(lda_model_1, tfidf_corpus) ## Storing the topic assignments for each tweet

In [42]:
tweet_topics_df = pd.DataFrame(list(zip(tweet_topics, tweet_topics_percent)), columns=['Topic', 'Percentage Contribution'], index=tweet_ids)

In [43]:
tweet_topics_df.tail()

Unnamed: 0,Topic,Percentage Contribution
1268051227734085632,3,0.148246
1268051184629223424,6,0.140369
1268051099086356480,0,0.120297
1268051025891430400,0,0.15002
1268050995482963968,1,0.211865


In [46]:
tweet_topics_df = pd.concat([df_tweets, tweet_topics_df], axis=1)

In [47]:
tweet_topics_df.head()

Unnamed: 0,full_text_processed,user_id,verified,name,location,entities.hashtags.text,entities.user_mentions.name,full_text_tokens,length,Topic,Percentage Contribution
1264160647002103808,praying everyone affected condolence family vi...,1256622599364214786,False,The Meraaki,"Ahmadabad City, India",AmphanSuperCyclone,,"[praying, everyone, affected, condolence, fami...",8,1,0.158338
1264160609668599808,cyclone ampan people satkhira upset due lack w...,1251934220345208832,False,Newspapers,Dhaka,,,"[cyclone, ampan, people, satkhira, upset, due,...",9,0,0.178125
1264121161589415936,cyclone amphan ha completely destroyed agricul...,1251934220345208832,False,Newspapers,Dhaka,,,"[cyclone, amphan, ha, completely, destroyed, a...",9,6,0.186731
1264160569315209216,amphan cyclone ​​cm mamta demand ban labor spe...,1113075640499036160,False,netvani,,,,"[amphan, cyclone, ​​cm, mamta, demand, ban, la...",15,7,0.182278
1264114187346874368,amfan storm caused devastation bengal mp nusra...,1113075640499036160,False,netvani,,,,"[amfan, storm, caused, devastation, bengal, mp...",9,6,0.163401


In [201]:
## Saving the dataframe as a csv for future analysis

tweet_topics_df.to_csv('../data/interim/tweet_topics_data_lda.csv')

## Number of Tweets for Each Topic

In [204]:
top_topics = tweet_topics_df.groupby('Topic')\
    .size()\
    .to_frame()\
    .reset_index()\
    .rename(columns={0:'Count', 'Topic':'Topic'})\
    .set_index('Topic')\
    .nlargest(20, 'Count')

top_topics

Unnamed: 0_level_0,Count
Topic,Unnamed: 1_level_1
9,11110
4,10793
5,10188
1,10181
7,9696
6,9664
3,8984
8,8537
0,8508
2,7676


## Most Representative tweet for each topic 
Tweet with highest contribution by the corresponding topic

In [88]:
g = tweet_topics_df.groupby('Topic').apply(lambda x: x.sort_values('Percentage Contribution', ascending=False)).reset_index(drop=True)

In [214]:
g.groupby('Topic').head(1)[['Topic', 'full_text_processed', 'name']].set_index('Topic')

Unnamed: 0_level_0,full_text_processed,name
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,despite early warning hurricane amfan mamata a...,Chowkidar Archya Midya
1,first cyclone hard situation back front ha arr...,Moloy Ghatak
2,thank sharing detail apology delay restricted ...,Tata Sky
3,npr budget 3941 cr delhi power corridor 20000 ...,Citizen Bapan Das
4,least 12 dead three million evacuee india bang...,Síntesis TV
5,786rizwankhan ja bhai apna kam kar kisne bola ...,Mohammed kaif
6,climate crisis isnt coming cyclone wa latest d...,Carmel Boyd
7,tomorrow pm narendra modi ji travel west benga...,PadmalochanPanda
8,year saw corona watched earthquake saw amphons...,aniltripathi
9,sucs amphan 120 km east paradip odisha 1030 is...,मौसम


## Top N users for each topic
Extracting the top N users for each topic by count of tweets for every user attributed to a topic

In [239]:
topic_users = tweet_topics_df.groupby(['Topic'])['name'].apply(lambda x: x.value_counts().head(10)).to_frame()
topic_users

Unnamed: 0_level_0,Unnamed: 1_level_0,name
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,The Wealth Home - Let's start building wealth,253
0,MEDIAonINDIA,140
0,ABP Ananda,121
0,Oneindia Bengali,37
0,News18Bangla,35
0,Hindustan Times,35
0,CPI(M) WEST BENGAL,32
0,sujoy pal,31
0,S Newz,26
0,Newspapers,23


## Visualize the topics


In [23]:
pyLDAvis.enable_notebook()
LDAvis_prepared_1 = pyLDAvis.gensim.prepare(lda_model_1, tfidf_corpus, tweets_dict)
LDAvis_prepared_2 = pyLDAvis.gensim.prepare(lda_model_2, tfidf_corpus, tweets_dict)

In [24]:
## Saving the HTML
pyLDAvis.save_html(LDAvis_prepared_1, '../reports/figures/LDA_topic_10.html')
pyLDAvis.save_html(LDAvis_prepared_2, '../reports/figures/LDA_topic_6.html')