# Topic modelling 

In this notebook, we apply topic modelling algorithms to our data. 

## Import statements

In [59]:
import numpy as np 
import pandas as pd 
import nltk

from nltk.stem import WordNetLemmatizer
from gensim import models, matutils
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
nltk.download('omw-1.4') #  Open Multilingual Wordnet
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /Users/yinghu/nltk_data...
[nltk_data] Downloading package wordnet to /Users/yinghu/nltk_data...


True

# Model 1: Latent Dirichlet Allocation (LDA)

#### Load the data done from the previous work

In [2]:
pwd

'/Users/yinghu/Documents/GitHub/fourthBrain/glg-project/week_8_models'

In [11]:
extended_df = pd.read_csv('../datasets/extended_df.csv')
sentences_df = pd.read_csv('../datasets/sentences_v2.csv')

In [4]:
extended_df.head()

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,POS,Tag,WordLength,Capital,Non-Punctuation,StopWord,IsNER
0,0,Sentence: 1,Thousands,NNS,O,9,True,True,False,0
1,1,,of,IN,O,2,False,True,True,0
2,2,,demonstrators,NNS,O,13,False,True,False,0
3,3,,have,VBP,O,4,False,True,True,0
4,4,,marched,VBN,O,7,False,True,False,0


In [12]:
sentences_df.head()

Unnamed: 0.1,Unnamed: 0,Sentence Length,Sentence#,Content,Tagged Words,Shortened Sentences
0,0,24,1,"['Thousands', 'of', 'demonstrators', 'have', '...","['London', 'Iraq', 'British']",Thousands demonstrators marched London pro...
1,1,30,2,"['Families', 'of', 'soldiers', 'killed', 'in',...",['Bush'],Families soldiers killed conflict joined p...
2,2,14,3,"['They', 'marched', 'from', 'the', 'Houses', '...","['Hyde', 'Park']",marched Houses Parliament rally Hyde Pa...
3,3,15,4,"['Police', 'put', 'the', 'number', 'of', 'marc...",,"Police number marchers 10,000 organizers ..."
4,4,25,5,"['The', 'protest', 'comes', 'on', 'the', 'eve'...","['Britain', 'Labor', 'Party', 'English', 'Brig...",protest comes eve annual conference Brit...


#### Vectorization and Lemmatization

In [20]:
shortened_sent = sentences_df['Shortened Sentences']
shortened_sent[shortened_sent.isna()]

8411    NaN
Name: Shortened Sentences, dtype: object

Drop sentence 8411

In [21]:
shortened_sent.drop(8411, inplace=True)
shortened_sent.isna().sum()

0

Define the `LemmaTokenizer` class

In [22]:
class LemmaTokenizer:
    def __init__(self) -> None:
        self.wnl = WordNetLemmatizer()
    
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in doc.lower().split(' ')]

In [23]:
shortened_sent.shape

(47958,)

In [26]:
bow_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())

In [27]:
shortened_sent_bow = bow_vectorizer.fit_transform(shortened_sent)

In [28]:
shortened_sent_bow.shape

(47958, 28760)

In [60]:
id2word = {idx: word for word, idx in bow_vectorizer.vocabulary_.items()}
corpus = shortened_sent_bow.transpose()
corpus = matutils.Sparse2Corpus(corpus)

In [41]:
corpus.shape 

(28760, 47958)

In [61]:
lad = models.LdaModel(corpus, num_topics=10, id2word=id2word)

In [63]:
lad.print_topics()

[(0,
  '0.166*"charge" + 0.033*"destroyed" + 0.029*"denies" + 0.014*"highly" + 0.008*"bombed" + 0.007*"warplane" + 0.005*"classified" + 0.004*"garang" + 0.003*"hectare" + 0.002*"zuma"'),
 (1,
  '0.329*"" + 0.046*"." + 0.041*"," + 0.024*"president" + 0.019*"minister" + 0.017*"mr." + 0.009*"prime" + 0.007*"leader" + 0.006*"bush" + 0.006*"talk"'),
 (2,
  '0.467*"" + 0.051*"." + 0.036*"," + 0.006*"said" + 0.004*"year" + 0.004*"say" + 0.004*"official" + 0.003*"government" + 0.003*"country" + 0.003*"people"'),
 (3,
  '0.048*"pakistan" + 0.039*"deputy" + 0.032*"injury" + 0.031*"damage" + 0.027*"tribal" + 0.027*"paul" + 0.021*"jet" + 0.018*"region" + 0.017*"waziristan" + 0.016*"northwestern"'),
 (4,
  '0.038*"demonstration" + 0.036*"olympic" + 0.022*"consumer" + 0.021*"index" + 0.017*"staged" + 0.015*"prominent" + 0.015*"confidence" + 0.014*"illegally" + 0.013*"winter" + 0.011*"slightly"'),
 (5,
  '0.273*"" + 0.047*"," + 0.040*"." + 0.019*"election" + 0.015*"party" + 0.012*")" + 0.012*"(" + 0.