# LDA Model Building 

## Topic modelling using tweets of all users


In [1]:
import pandas as pd
import numpy as np
import re

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

import spacy
import en_core_web_lg

from tqdm import tqdm_notebook as tqdm


# Visualisations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

## Loading Data

In [19]:
# Read the pre-processed climate-change tweets from CSV into a DataFrame
tweet_df = pd.read_csv(filepath_or_buffer='Data/tweet_climate_change_processed.csv')
# Bare expression so the notebook renders a preview of the frame
tweet_df

Unnamed: 0,ID,Name,Tweet,Processed_text,Processed_token,Bigram
0,18257804,stateless,"we’re pretty lucky, all things considered, whe...",pretty lucky thing consider compare place clim...,"['pretty', 'lucky', 'thing', 'consider', 'comp...","['pretty', 'lucky', 'thing', 'consider', 'comp..."
1,1246325069841723392,TsaiJilly,#UN75 survey found that respondents in all reg...,un survey find respondent region identify cl...,"['un', 'survey', 'find', 'respondent', 'region...","['find', 'respondent', 'region', 'identify', '..."
2,1248988647812222978,Beatric54184322,"All hat, no policy #climatechange #insiders ht...",hat policy climatechange insider,"['hat', 'policy', 'climatechange', 'insider']","['hat', 'policy', 'climatechange', 'insider']"
3,240072798,LugubriousLarry,Two great stories on #Maine this weekend: firs...,great story maine weekend important piece jan...,"['great', 'story', 'maine', 'weekend', 'import...","['great', 'story', 'maine', 'weekend', 'import..."
4,1124447266205503488,All435Reps,The evidence is right in front of us. Temperat...,evidence right temperature get hot climatechan...,"['evidence', 'right', 'temperature', 'get', 'h...","['evidence', 'right', 'temperature', 'get', 'h..."
...,...,...,...,...,...,...
67401,1329260903527718913,Jillian18277886,"Once we get through #COVID, let's not forget w...",covid let forget go die climatechange,"['covid', 'let', 'forget', 'go', 'die', 'clima...","['let', 'forget', 'go', 'die', 'climatechange']"
67402,19435213,wildweatherdan,"At the end of the last Ice Age, people changed...",end ice age people change clothe literally cli...,"['end', 'ice', 'age', 'people', 'change', 'clo...","['end', 'ice', 'age', 'people', 'change', 'clo..."
67403,1401488848635502592,AnonWatchers,"Coral #reefs ,rainforest of the ocean\n🐠\n#cli...",coral reef rainforest ocean climatechange co...,"['coral', 'reef', 'rainforest', 'ocean', 'clim...","['coral_reef', 'rainforest', 'climatechange', ..."
67404,18027211,dennissweatt,"What is #climatechange? \n\nWets are wetter, d...",climatechange wet wetter dry dryer hot hot col...,"['climatechange', 'wet', 'wetter', 'dry', 'dry...","['climatechange', 'wet', 'wetter', 'dry', 'dry..."


## Data Transformation : Corpus and Dictionary 

In [135]:
# Convert each stringified token list back into a real Python list.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# CSV cell cannot execute arbitrary code the way eval() would.
# The isinstance guard leaves non-string values (e.g. lists that were already
# parsed) untouched, so re-running this cell does not raise.
import ast

tweet_df['Processed_token'] = tweet_df['Processed_token'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [137]:
# Build the gensim Dictionary (token <-> integer id) from the token lists
id2word = corpora.Dictionary(documents=tweet_df['Processed_token'])
# Vocabulary size before any filtering
print(len(id2word))

36785


In [138]:
# Prune the vocabulary in place: keep tokens found in at least 3 tweets
# and in no more than 99% of all tweets
id2word.filter_extremes(no_below=3, no_above=0.99)
# Vocabulary size after pruning
print(len(id2word))

20608


In [139]:
# Encode every tweet as a bag of words: a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, tweet_df['Processed_token']))

### Bigram 

In [20]:
# Convert each stringified bigram-token list back into a real Python list.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# CSV cell cannot execute arbitrary code the way eval() would.
# The isinstance guard leaves non-string values (e.g. lists that were already
# parsed) untouched, so re-running this cell does not raise.
import ast

tweet_df['Bigram'] = tweet_df['Bigram'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [21]:
# Build the gensim Dictionary (token <-> integer id) from the bigram token lists
id2word_bi = corpora.Dictionary(documents=tweet_df['Bigram'])
# Vocabulary size before any filtering
print(len(id2word_bi))

28650


In [22]:
# Prune the bigram vocabulary in place: keep tokens found in at least 3 tweets
# and in no more than 99% of all tweets
id2word_bi.filter_extremes(no_below=3, no_above=0.99)
# Vocabulary size after pruning
print(len(id2word_bi))

16988


In [23]:
# Encode every tweet's bigram tokens as a bag of words: (token_id, count) pairs
corpus_bi = list(map(id2word_bi.doc2bow, tweet_df['Bigram']))

## Base Model 

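- Dirichlet hyperparameter alpha: Document-Topic Density = 'symmetric' (gensim default, i.e. 1.0 / num_topics, which is 0.1 for the 10 topics used here)

- Dirichlet hyperparameter beta (`eta` in gensim): Word-Topic Density = 'symmetric' (gensim default, i.e. 1.0 / num_topics, which is 0.1 for the 10 topics used here)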


In [12]:
# Baseline LDA: 10 topics, alpha/eta left at gensim's defaults,
# fixed seed so the run is repeatable
base_lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    chunksize=100,
    random_state=100,
    per_word_topics=True,
)

In [13]:
# print_topics() yields (topic_id, formatted_string) pairs; the terms appear
# inside double quotes in the string, so pull out every quoted substring
words = [
    re.findall(r'"([^"]*)"', description)
    for _, description in base_lda.print_topics()
]

In [14]:
# One space-separated string per topic, built from at most its first 10 terms
topics = [' '.join(terms[:10]) for terms in words]

In [15]:
# Print every topic with its index.
# The loop variable is named topic_id rather than `id`, which would shadow
# the builtin id() for the rest of the notebook session.
for topic_id, topic_terms in enumerate(topics):
    print(f"------Topic {topic_id} ------")
    print(topic_terms, end="\n\n")

------Topic 0 ------
sustainability environment energy nature climateaction renewableenergy climateemergency renewable actonclimate climatecrisis

------Topic 1 ------
climate climatecrisis change climateaction people c climateemergency global world cause

------Topic 2 ------
climate heat climatecrisis heatwave change record year extreme weather temperature

------Topic 3 ------
need fuel study fossil human climate eu carbon burn g

------Topic 4 ------
cop renewable june state ev energy tesla finance renewableenergy implement

------Topic 5 ------
health space know good action news tackle reach set environment

------Topic 6 ------
normal sea rise level world discuss w decade fight unep

------Topic 7 ------
climate change science impact solution join community esg july new

------Topic 8 ------
carbon forest climate change tree co world emission climatecrisis plant

------Topic 9 ------
water drought wildfire fire crisis ocean climatecrisis lead heat process



### Model Visualisation

In [17]:
# Creating Topic Distance Visualization
# enable_notebook() switches pyLDAvis to inline notebook rendering
pyLDAvis.enable_notebook()
# Last expression of the cell, so the notebook displays the prepared view
gensimvis.prepare(base_lda, corpus, id2word)

### Model Evaluation

In [140]:
# Compute the per-word likelihood bound.
# NOTE: gensim's log_perplexity() returns the variational per-word likelihood
# bound, not the perplexity itself (perplexity = 2 ** (-bound)). A HIGHER
# (less negative) value therefore indicates a better fit, even though the
# printed label says "Perplexity".
base_perplexity = base_lda.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score (c_v) of the base model against the tokenised tweets
coherence_model = CoherenceModel(model=base_lda, texts= tweet_df['Processed_token'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -8.534126351796964

Coherence Score:  0.329901875555909


### Bigram 

In [25]:
# Baseline LDA on the bigram corpus, with the same settings as the unigram base model
base_lda_bi = LdaMulticore(corpus = corpus_bi, num_topics = 10, 
                       id2word = id2word_bi, random_state = 100,
                       chunksize = 100, passes = 10, 
                        per_word_topics = True)

# Compute the per-word likelihood bound.
# NOTE: gensim's log_perplexity() returns the variational per-word likelihood
# bound, not the perplexity itself (perplexity = 2 ** (-bound)). A HIGHER
# (less negative) value therefore indicates a better fit, even though the
# printed label says "Perplexity".
base_perplexity_bi = base_lda_bi.log_perplexity(corpus_bi)
print('\nPerplexity: ', base_perplexity_bi) 

# Compute Coherence Score (c_v) against the bigram-tokenised tweets
coherence_model_bi = CoherenceModel(model=base_lda_bi, texts= tweet_df['Bigram'], 
                                   dictionary=id2word_bi, coherence='c_v')
coherence_lda_model_base_bi = coherence_model_bi.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base_bi)

# Creating Topic Distance Visualization
# Kept as the LAST expression of the cell: a notebook only renders the value
# of a cell's final expression, so calling prepare() mid-cell computed the
# visualisation and then silently discarded it.
pyLDAvis.enable_notebook()
gensimvis.prepare(base_lda_bi, corpus_bi, id2word_bi)


Perplexity:  -8.085208900341186

Coherence Score:  0.33407135777190855


## Hyperparameter Tuning

### Parameters to tune

1. Number of Topics (K)

2. Dirichlet hyperparameter alpha: Document-Topic Density (a)

3. Dirichlet hyperparameter beta: Word-Topic Density (b)

In [141]:
def coherence_score(corpus, id2word, k, a, b, texts=None):
    """Train an LDA model for one hyperparameter setting and return its c_v coherence.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        id2word: gensim Dictionary used by both the model and the coherence
            computation.
        k: Number of topics.
        a: Value passed to LdaMulticore as ``alpha`` (document-topic prior).
        b: Value passed to LdaMulticore as ``eta`` (word-topic prior).
        texts: Tokenised documents used to compute c_v coherence. They should
            be the same documents ``corpus`` was built from. When omitted,
            falls back to ``tweet_df['Processed_token']``, which is what this
            function always used before the parameter existed.

    Returns:
        The c_v coherence score of the trained model.
    """
    if texts is None:
        # Backward-compatible default: the unigram token column
        texts = tweet_df['Processed_token']

    # Fixed random_state keeps runs comparable across hyperparameter settings
    lda_model = LdaMulticore(corpus = corpus,  
                             id2word = id2word,
                             num_topics = k,
                             alpha = a,
                             eta = b,
                             random_state = 100,
                             chunksize = 100, 
                             passes = 10)
    coherence_model_lda = CoherenceModel(model = lda_model,
                                        texts = texts,
                                        dictionary = id2word,
                                        coherence = 'c_v')
    
    return coherence_model_lda.get_coherence()

In [142]:
import itertools

# tqdm.tqdm_notebook is deprecated; tqdm itself asks for tqdm.notebook.tqdm
from tqdm.notebook import tqdm

# Topic number k range (range() excludes max_topics, so k runs up to max_topics - 1)
min_topics = 3
max_topics = 12
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha: numeric priors plus gensim's named presets
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta: numeric priors plus gensim's named preset
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

model_results_ = {'Topics' : [],
                'Alpha' : [],
                'Beta' : [],
                'Coherence' : []
                }

# Every (k, alpha, beta) combination, in the same order as nested
# k -> alpha -> beta loops, so result row indices are unchanged
param_grid = list(itertools.product(topics_range, alpha, beta))

# Size the progress bar from the grid itself, so it stays correct
# if any of the ranges above are edited
pbar = tqdm(total = len(param_grid))
try:
    for k, a, b in param_grid:
        c = coherence_score(corpus = corpus, 
                            id2word = id2word,
                            k = k, a = a, b = b)

        model_results_['Topics'].append(k)
        model_results_['Alpha'].append(a)
        model_results_['Beta'].append(b)
        model_results_['Coherence'].append(c)

        pbar.update(1)
finally:
    # Close the bar even if a run fails or the cell is interrupted
    pbar.close()

model_result_ = pd.DataFrame(model_results_)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  pbar = tqdm(total = 270)


  0%|          | 0/270 [00:00<?, ?it/s]

In [144]:
# Rank the unigram tuning runs from highest to lowest coherence
model_result_.sort_values(by='Coherence', ascending=False)

Unnamed: 0,Topics,Alpha,Beta,Coherence
177,8,asymmetric,0.61,0.468267
98,6,0.31,0.91,0.445448
86,5,asymmetric,0.31,0.431366
243,11,0.01,0.91,0.428462
113,6,symmetric,0.91,0.423610
...,...,...,...,...
1,3,0.01,0.31,0.233670
0,3,0.01,0.01,0.230794
8,3,0.31,0.91,0.203296
2,3,0.01,0.61,0.200856


In [146]:
# Persist the unigram tuning results without the DataFrame index
model_result_.to_csv(path_or_buf='Model_results/LDA_tuning_results.csv', index=False)

### Bigram

In [27]:
def coherence_score(corpus, id2word, k, a, b, texts=None):
    """Train an LDA model for one hyperparameter setting and return its c_v coherence.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        id2word: gensim Dictionary used by both the model and the coherence
            computation.
        k: Number of topics.
        a: Value passed to LdaMulticore as ``alpha`` (document-topic prior).
        b: Value passed to LdaMulticore as ``eta`` (word-topic prior).
        texts: Tokenised documents used to compute c_v coherence. They should
            be the same documents ``corpus`` was built from. When omitted,
            falls back to ``tweet_df['Bigram']``, which is what this
            function always used before the parameter existed.

    Returns:
        The c_v coherence score of the trained model.
    """
    if texts is None:
        # Backward-compatible default: the bigram token column
        texts = tweet_df['Bigram']

    # Fixed random_state keeps runs comparable across hyperparameter settings
    lda_model = LdaMulticore(corpus = corpus,  
                             id2word = id2word,
                             num_topics = k,
                             alpha = a,
                             eta = b,
                             random_state = 100,
                             chunksize = 100, 
                             passes = 10)
    coherence_model_lda = CoherenceModel(model = lda_model,
                                        texts = texts,
                                        dictionary = id2word,
                                        coherence = 'c_v')
    
    return coherence_model_lda.get_coherence()

In [45]:
import itertools

# tqdm.tqdm_notebook is deprecated; tqdm itself asks for tqdm.notebook.tqdm
from tqdm.notebook import tqdm

# Topic number k range (range() excludes max_topics, so k runs up to max_topics - 1)
min_topics = 3
max_topics = 12
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha: numeric priors plus gensim's named presets
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta: numeric priors plus gensim's named preset
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

model_results = {'Topics' : [],
                'Alpha' : [],
                'Beta' : [],
                'Coherence' : []
                }

# Every (k, alpha, beta) combination, in the same order as nested
# k -> alpha -> beta loops, so result row indices are unchanged
param_grid = list(itertools.product(topics_range, alpha, beta))

# Size the progress bar from the grid itself, so it stays correct
# if any of the ranges above are edited
pbar = tqdm(total = len(param_grid))
try:
    for k, a, b in param_grid:
        c = coherence_score(corpus = corpus_bi, 
                            id2word = id2word_bi,
                            k = k, a = a, b = b)

        model_results['Topics'].append(k)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(c)

        pbar.update(1)
finally:
    # Close the bar even if a run fails or the cell is interrupted
    pbar.close()

model_result = pd.DataFrame(model_results)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  pbar = tqdm(total = 270)


  0%|          | 0/270 [00:00<?, ?it/s]

In [145]:
# Rebuild the bigram results frame from the collected lists and persist it
# without the DataFrame index
model_result = pd.DataFrame(data=model_results)
model_result.to_csv(path_or_buf='Model_results/LDA_tuning_results_bi.csv', index=False)

In [97]:
# Rank the bigram tuning runs from highest to lowest coherence
model_result.sort_values(by='Coherence', ascending=False)

Unnamed: 0,Topics,Alpha,Beta,Coherence
44,4,0.61,symmetric,0.388452
117,6,asymmetric,0.61,0.382405
131,7,0.61,0.31,0.374976
191,9,0.61,0.31,0.374794
76,5,0.91,0.31,0.371659
...,...,...,...,...
26,3,asymmetric,0.31,0.281213
61,5,0.01,0.31,0.276921
64,5,0.01,symmetric,0.272813
27,3,asymmetric,0.61,0.261185


## Best Model

In [165]:
# Final unigram model, using the setting in the top row of the unigram
# tuning table shown in this notebook (8 topics, asymmetric alpha, eta 0.61)
best_lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    alpha='asymmetric',
    eta=0.61,
    passes=10,
    chunksize=100,
    random_state=100,
)

# Prepare the topic-distance view and keep it in `vis` for display and export
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(best_lda, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [166]:
# Bare expression so the notebook renders the prepared visualisation held in `vis`
vis

In [167]:
# Export the visualisation currently held in `vis` to an HTML file
pyLDAvis.save_html(vis, 'Visualisation/best_model_vis.html')

### Bigram

In [98]:
# Bigram model using the setting in the top row of the bigram tuning table
# shown in this notebook (4 topics, alpha 0.61, symmetric eta)
best_lda_bi = LdaMulticore(
    corpus=corpus_bi,
    id2word=id2word_bi,
    num_topics=4,
    alpha=0.61,
    eta='symmetric',
    passes=10,
    chunksize=100,
    random_state=100,
)

# Topic-distance view; last expression of the cell, so the notebook displays it
pyLDAvis.enable_notebook()
gensimvis.prepare(best_lda_bi, corpus_bi, id2word_bi)

In [128]:
# Bigram model using the setting in the second row of the bigram tuning table
# shown in this notebook (6 topics, asymmetric alpha, eta 0.61)
best2_lda_bi = LdaMulticore(
    corpus=corpus_bi,
    id2word=id2word_bi,
    num_topics=6,
    alpha='asymmetric',
    eta=0.61,
    passes=10,
    chunksize=100,
    random_state=100,
)

# Topic-distance view for this model.
# NOTE(review): this rebinds `vis`, which the unigram best-model cells also
# use; re-running the save_html cell afterwards would export this model instead.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(best2_lda_bi, corpus_bi, id2word_bi)
vis