# LDA models different decades (speeches as input)

In [29]:
#System
import os
import sys
from tqdm import tqdm 

#Data structure manipulation
import pandas as pd
import numpy as np

#text cleaning 
import re
import string

#nlp
import nltk
from nltk.corpus import stopwords
import spacy

#gensim
import gensim
from gensim import corpora
from gensim.test.utils import datapath
# unlist nested lists
from itertools import chain

# count word frequencies
from collections import defaultdict

# vizualization
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

# timer function
import time

## Reading the data

In [2]:
#Path to where the raw text files are stored within the session folders, i.e, converted sessions.
origin_path = "C:/DATA/convSes"

In [3]:
#A function for reading in the speeches
def read_text(file_path, file):
    
    '''Reading the text files'''
    
    with open(file_path, 'r', encoding='utf-8') as file:
        doc=file.read()
    return doc

In [4]:
#Bringing in all the speeches
doc_set = []
for i in range (0,50):
    year = 1970 + i
    session = "session " + str(25+i)+ " - "  + str(year)
    data_path = f"{origin_path}\\{session}"
    os.chdir(data_path)
    for file in os.listdir():
        if file[-4:]=='.txt':
            file_path = f"{data_path}\\{file}"
            doc_set.append({'Year': year, 'ISO_Code': file[:3] , 'text': read_text(file_path, file)})
        else:
            print(file)
            pass

word2vec.model


In [5]:
columns=['Year', 'ISO_Code', 'text']
dataset = pd.concat([pd.DataFrame([i], columns=columns) for i in tqdm(doc_set)], ignore_index=True)
dataset.head()

100%|██████████| 8288/8288 [00:04<00:00, 1946.49it/s]


Unnamed: 0,Year,ISO_Code,text
0,1970,ALB,33: May I first convey to our President the co...
1,1970,ARG,177.\t : It is a fortunate coincidence that pr...
2,1970,AUS,100.\t It is a pleasure for me to extend to y...
3,1970,AUT,155.\t May I begin by expressing to Ambassado...
4,1970,BEL,"176. No doubt each of us, before coming up to ..."


**Creating subset including only the G20 states**

In [6]:
# only select states belonging to the G20 group (minus south afrika and EU representatives)
g20 =  dataset.ISO_Code.isin(['CAN','FRA', 'DEU', 'USA', 'GBR', 'ITA', 'JPN','ARG', 'Aus', 'BRA', 'IND', 'IDN', 
                        'CAN', 'MEX', 'RUS', 'SAU', 'KOR', 'TUR', 'CHN'])
G20 = dataset[g20]

# reset ascending index for subset dataset
G20.reset_index(inplace = True, drop = True)
G20.head()

Unnamed: 0,Year,ISO_Code,text
0,1970,ARG,177.\t : It is a fortunate coincidence that pr...
1,1970,BRA,"1.\tMr. President, I should like, first of all..."
2,1970,CAN,The General Assembly is fortunate indeed to ha...
3,1970,FRA,"84.\t Within one month, when we celebrate the..."
4,1970,GBR,"110.\t Mr. President, I should like first to s..."


In [7]:
# testing a random sample of speeches
G20.sample(10)

Unnamed: 0,Year,ISO_Code,text
629,2008,USA,I am pleased to be here to \naddress the Gener...
404,1995,IDN,On behalf of the Indonesian\ndelegation I shou...
417,1996,DEU,﻿Last year we\ntook stock of half a century of...
132,1978,JPN,﻿\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
564,2004,USA,Thank you for the honour of\naddressing the Ge...
679,2012,BRA,"﻿Once again, a woman’s\nvoice is opening the g..."
512,2001,TUR,﻿This meeting is being held at\na most difficu...
238,1985,FRA,I should like first to congratulate the Presid...
721,2014,JPN,"Humankind faces serious, \nunprecedented crise..."
135,1978,TUR,"﻿\n1.\tMr. President, on behalf of my Governme..."


**Creating subsets for the 70s,80s,90s,2000s,2010s**

In [8]:
G20_70s = G20[(G20['Year'] >= 1970) & (G20['Year'] < 1980)] 
G20_80s = G20[(G20['Year'] >= 1980) & (G20['Year'] < 1990)]
G20_90s = G20[(G20['Year'] >= 1990) & (G20['Year'] < 2000)]
G20_2000s = G20[(G20['Year'] >= 2000) & (G20['Year'] < 2010)]
G20_2010s = G20[(G20['Year'] >= 2010) & (G20['Year'] < 2020)]

## Pre-processing

**Defining/loading in the needed functions --> can we do this in a cleaner way??**

In [11]:
nlp = spacy.load('en_core_web_lg')#run in conda to download the library --> python -m download en_core_web_lg 

In [12]:
def init_proc(text, stop_words=[]):
    
    '''Pre-processing the input in single list'''
    
    stops = stopwords.words("english")
    stops.extend(stop_words)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\-', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'[0-9]+','', text)
    text = re.sub(r'[^\w\s]', '', text)
    lower = text.lower()
    doc = nlp(lower)
    words = []
    for token in doc:
        lemma = token.lemma_
        if lemma not in stops:
                words.append(lemma)
    return words

In [13]:
stop_words =['general', 'assembly', 'conference', 'session', 'congratulations', 
             'congratulate', 'secretarygeneral','members', 'member', 'united', 'nations', 
             'nation', 'statement', 'honour','every', 'sir', 'majesty', 'president', 
             'minister', 'prime', 'ambassador', 'thank', 'thanks', 'world', 'international', 
             'states', 'we', 'us', 'they', 'system', 'organization','say', 'think', 'know', 
             'want', 'need', 'let', 'ask', 'go', 'look', 'stand', 'open', 'give', 'see', 'come', 
             'make', 'made', 'meet','act', 'use', 'take', 'bring', 'ensure', 'able', 'assume', 
             'continue', 'change', 'progress', 'process', 'year', 'years', 'time', 'today',  
             'would', 'will', 'might', 'together', 'common', 'future', 'one', 'order', 'end', 
             'new', 'necessary', 'major', 'minor', 'many', 'people', 'peoples', 'appropriate', 
             'historic', 'adequate', 'best', 'better', 'confident', 'important', 'special',
             'great', 'therefore', 'thus', 'hence', 'like', 'particularly', 'many', 'much', 
             'greater', 'especially', 'towards', 'always', 'whether', 'around',
             'possible', 'clear', 'simply', 'must', 'also', 'however', 'mr',
             'united', 'kingdom', 'great', 'britain', 'france', 'germany','italy', 'japan',
             'canada', 'usa', 'argentina', 'australia', 'china', 'brazil', 'india', 'indonesia',  
             'mexico', 'russia', 'saudi', 'arabia', 'south', 'korea', 'turkey','liechtenstein',
             'I', ' ', '  ']

In [18]:
#loading the bigram models from the bigram_trigram_model_notebooks
#temp_file1 = datapath("C:/DATA/DTM/phrasers/bigram_speeches")
#temp_file2 = datapath("C:/DATA/DTM/phrasers/trigram_speeches")
from gensim.models.phrases import Phrases
bigram_phraser = Phrases.load("C:/DATA/DTM/phrasers/bigram_speeches")
trigam_phraser = Phrases.load("C:/DATA/DTM/phrasers/trigram_speeches")

In [20]:
#Function to preprocess speeches and also detect bi and trigrams
def pre_proc_comb(corpus,stop_words=[]):
    
    '''Looping the pre-processing over a list. Also checks if input is correct'''
    
    l = []
    if isinstance(corpus, str):
        l.append(init_proc(corpus,stop_words))
    elif all(isinstance(s, str) for s in corpus):    
        for item in tqdm(corpus):
            l.append(init_proc(item,stop_words))
    else:
        print("Error: This function only accepts strings or a list of strings.")
    bigram_token = []
    for j in l:
        bigram_token.append(bigram_phraser[j])
    trigram_token = []
    for i in bigram_token:
        trigram_token.append(trigam_phraser[i])
    return trigram_token

**Preprocessing the 5 dataslices**

In [23]:
#70s
text_corpus_70s = G20_70s['text'].values.tolist()
processed_corpus_70s = pre_proc_comb(text_corpus_70s,stop_words)

100%|██████████| 153/153 [01:51<00:00,  1.37it/s]


In [24]:
#80s
text_corpus_80s = G20_80s['text'].values.tolist()
processed_corpus_80s = pre_proc_comb(text_corpus_80s,stop_words)

100%|██████████| 160/160 [02:05<00:00,  1.27it/s]


In [25]:
#90s
text_corpus_90s = G20_90s['text'].values.tolist()
processed_corpus_90s = pre_proc_comb(text_corpus_90s,stop_words)

100%|██████████| 167/167 [01:27<00:00,  1.91it/s]


In [26]:
#2000s
text_corpus_2000s = G20_2000s['text'].values.tolist()
processed_corpus_2000s = pre_proc_comb(text_corpus_2000s,stop_words)

100%|██████████| 166/166 [00:53<00:00,  3.13it/s]


In [27]:
#2010s
text_corpus_2010s = G20_2010s['text'].values.tolist()
processed_corpus_2010s = pre_proc_comb(text_corpus_2010s,stop_words)

100%|██████████| 166/166 [01:04<00:00,  2.59it/s]


## Building the LDA models for every decade sice 

### 70s

In [33]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_70s = corpora.Dictionary(processed_corpus_70s)
doc_term_matrix_70s = [dictionary_70s.doc2bow(speech) for speech in processed_corpus_70s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_70s = LDA(corpus = doc_term_matrix_70s, id2word = dictionary_70s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 30min 11s


In [34]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/70s")
lda_model_70s.save(temp_file)

In [35]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/70s")
lda_model_70s = LDA.load(temp_file)

In [None]:
#investigating the word distribution over the different topics
#lda_model_70s.print_topics()

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [None]:
vis70s = gensimvis.prepare(lda_model_70s,doc_term_matrix_70s,dictionary_70s,mds= 'mmds', R=30)
vis70s

### 80s

In [37]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_80s = corpora.Dictionary(processed_corpus_80s)
doc_term_matrix_80s = [dictionary_80s.doc2bow(speech) for speech in processed_corpus_80s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_80s = LDA(corpus = doc_term_matrix_80s, id2word = dictionary_80s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 32min 45s


In [38]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/80s")
lda_model_80s.save(temp_file)

In [39]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/80s")
lda_model_80s = LDA.load(temp_file)

In [None]:
#investigating the word distribution over the different topics
#lda_model_80s.print_topics()

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [41]:
vis80s = gensimvis.prepare(lda_model_80s,doc_term_matrix_80s,dictionary_80s,mds= 'mmds', R=30)
vis80s

### 90s

In [42]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_90s = corpora.Dictionary(processed_corpus_90s)
doc_term_matrix_90s = [dictionary_90s.doc2bow(speech) for speech in processed_corpus_90s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_90s = LDA(corpus = doc_term_matrix_90s, id2word = dictionary_90s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 25min 3s


In [43]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/90s")
lda_model_90s.save(temp_file)

In [44]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/90s")
lda_model_90s = LDA.load(temp_file)

In [None]:
#investigating the word distribution over the different topics
#lda_model_90s.print_topics()

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [45]:
vis90s = gensimvis.prepare(lda_model_90s,doc_term_matrix_90s,dictionary_90s,mds= 'mmds', R=30)
vis90s

### 2000s

In [46]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_2000s = corpora.Dictionary(processed_corpus_2000s)
doc_term_matrix_2000s = [dictionary_2000s.doc2bow(speech) for speech in processed_corpus_2000s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_2000s = LDA(corpus = doc_term_matrix_2000s, id2word = dictionary_2000s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 13min 52s


In [47]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2000s")
lda_model_2000s.save(temp_file)

In [48]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2000s")
lda_model_2000s = LDA.load(temp_file)

In [49]:
#investigating the word distribution over the different topics
#lda_model_2000s.print_topics()

In [50]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [51]:
vis2000s = gensimvis.prepare(lda_model_2000s,doc_term_matrix_2000s,dictionary_2000s,mds= 'mmds', R=30)
vis2000s

### 2010s

In [52]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_2010s = corpora.Dictionary(processed_corpus_2010s)
doc_term_matrix_2010s = [dictionary_2010s.doc2bow(speech) for speech in processed_corpus_2010s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_2010s = LDA(corpus = doc_term_matrix_2010s, id2word = dictionary_2010s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 18min 4s


In [53]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2010s")
lda_model_2010s.save(temp_file)

In [54]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2010s")
lda_model_2010s = LDA.load(temp_file)

In [55]:
#investigating the word distribution over the different topics
#lda_model_2010s.print_topics()

In [56]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [57]:
vis2010s = gensimvis.prepare(lda_model_2010s,doc_term_matrix_2010s,dictionary_2010s,mds= 'mmds', R=30)
vis2010s