# LDA models different decades (paragraphs as input)

In [2]:
#System
import os
import sys
from tqdm import tqdm 

#Data structure manipulation
import pandas as pd
import numpy as np

#text cleaning 
import re
import string

#nlp
import nltk
from nltk.corpus import stopwords
import spacy

#gensim
import gensim
from gensim import corpora
from gensim.test.utils import datapath
# unlist nested lists
from itertools import chain

# count word frequencies
from collections import defaultdict

# vizualization
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings('ignore')

# timer function
import time

## Reading the data

In [3]:
#Path to where the raw text files are stored within the session folders, i.e, converted sessions.
origin_path = "C:/DATA/convSes"

In [4]:
#A function for reading in the speeches
def read_text(file_path, file):
    
    '''Reading the text files'''
    
    with open(file_path, 'r', encoding='utf-8') as file:
        doc=file.read()
    return doc

In [5]:
#Bringing in all the speeches
doc_set = []
for i in range (0,50):
    year = 1970 + i
    session = "session " + str(25+i)+ " - "  + str(year)
    data_path = f"{origin_path}\\{session}"
    os.chdir(data_path)
    for file in os.listdir():
        if file[-4:]=='.txt':
            file_path = f"{data_path}\\{file}"
            doc_set.append({'Year': year, 'ISO_Code': file[:3] , 'text': read_text(file_path, file)})
        else:
            print(file)
            pass

word2vec.model


In [6]:
columns=['Year', 'ISO_Code', 'text']
dataset = pd.concat([pd.DataFrame([i], columns=columns) for i in tqdm(doc_set)], ignore_index=True)
dataset.head()

100%|██████████| 8288/8288 [00:04<00:00, 1837.03it/s]


Unnamed: 0,Year,ISO_Code,text
0,1970,ALB,33: May I first convey to our President the co...
1,1970,ARG,177.\t : It is a fortunate coincidence that pr...
2,1970,AUS,100.\t It is a pleasure for me to extend to y...
3,1970,AUT,155.\t May I begin by expressing to Ambassado...
4,1970,BEL,"176. No doubt each of us, before coming up to ..."


**Creating subset including only the G20 states**

In [7]:
# only select states belonging to the G20 group (minus south afrika and EU representatives)
g20 =  dataset.ISO_Code.isin(['CAN','FRA', 'DEU', 'USA', 'GBR', 'ITA', 'JPN','ARG', 'Aus', 'BRA', 'IND', 'IDN', 
                        'CAN', 'MEX', 'RUS', 'SAU', 'KOR', 'TUR', 'CHN'])
G20 = dataset[g20]

# reset ascending index for subset dataset
G20.reset_index(inplace = True, drop = True)
G20.head()

Unnamed: 0,Year,ISO_Code,text
0,1970,ARG,177.\t : It is a fortunate coincidence that pr...
1,1970,BRA,"1.\tMr. President, I should like, first of all..."
2,1970,CAN,The General Assembly is fortunate indeed to ha...
3,1970,FRA,"84.\t Within one month, when we celebrate the..."
4,1970,GBR,"110.\t Mr. President, I should like first to s..."


In [8]:
# testing a random sample of speeches
G20.sample(10)

Unnamed: 0,Year,ISO_Code,text
486,2000,GBR,I am conscious that\nthe central problem for a...
463,1999,ARG,We\nare pleased that an important figure in Na...
5,1970,IDN,"101.\tOn behalf of the Indonesian delegation, ..."
734,2015,IDN,Let me begin by congratulating Mr. Mogens Lykk...
514,2002,ARG,"﻿First, I wish to congratulate you, Mr. Presid..."
435,1997,FRA,"﻿May I first say, Sir, how pleased my country ..."
672,2011,JPN,I would like to begin \nby congratulating Mr. ...
485,2000,FRA,I have\nthe honour of speaking this year on be...
427,1996,SAU,"﻿It gives me pleasure, as we begin the work of..."
282,1988,BRA,"﻿\nMr. President, a tradition dating back to t..."


**Creating subsets for the 70s,80s,90s,2000s,2010s**

In [9]:
G20_70s = G20[(G20['Year'] >= 1970) & (G20['Year'] < 1980)] 
G20_80s = G20[(G20['Year'] >= 1980) & (G20['Year'] < 1990)]
G20_90s = G20[(G20['Year'] >= 1990) & (G20['Year'] < 2000)]
G20_2000s = G20[(G20['Year'] >= 2000) & (G20['Year'] < 2010)]
G20_2010s = G20[(G20['Year'] >= 2010) & (G20['Year'] < 2020)]

**Split speeches for every decade slice into paragraphs**

In [10]:
#70s

# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20_70s['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20_70s = G20_70s.drop('text', 1) # drop original text column
G20_70s = temporary_file.merge(G20_70s, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20_70s['paragraph_index'] = np.arange(len(G20_70s))
cols = G20_70s.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20_70s = G20_70s[cols].set_index('paragraph_index')
G20_70s = G20_70s[1:]

In [11]:
#80s

# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20_80s['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20_80s = G20_80s.drop('text', 1) # drop original text column
G20_80s = temporary_file.merge(G20_80s, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20_80s['paragraph_index'] = np.arange(len(G20_80s))
cols = G20_80s.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20_80s = G20_80s[cols].set_index('paragraph_index')
G20_80s = G20_80s[1:]

In [12]:
#90s

# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20_90s['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20_90s = G20_90s.drop('text', 1) # drop original text column
G20_90s = temporary_file.merge(G20_90s, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20_90s['paragraph_index'] = np.arange(len(G20_90s))
cols = G20_90s.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20_90s = G20_90s[cols].set_index('paragraph_index')
G20_90s = G20_90s[1:]

In [13]:
#2000s

# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20_2000s['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20_2000s = G20_2000s.drop('text', 1) # drop original text column
G20_2000s = temporary_file.merge(G20_2000s, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20_2000s['paragraph_index'] = np.arange(len(G20_2000s))
cols = G20_2000s.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20_2000s = G20_2000s[cols].set_index('paragraph_index')
G20_2000s = G20_2000s[1:]

In [14]:
#2010s

# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20_2010s['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20_2010s = G20_2010s.drop('text', 1) # drop original text column
G20_2010s = temporary_file.merge(G20_2010s, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20_2010s['paragraph_index'] = np.arange(len(G20_2010s))
cols = G20_2010s.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20_2010s = G20_2010s[cols].set_index('paragraph_index')
G20_2010s = G20_2010s[1:]

## Pre-processing

**Defining/loading in the needed functions --> can we do this in a cleaner way??**

In [15]:
nlp = spacy.load('en_core_web_lg')#run in conda to download the library --> python -m download en_core_web_lg 

In [16]:
def init_proc(text, stop_words=[]):
    
    '''Pre-processing the input in single list'''
    
    stops = stopwords.words("english")
    stops.extend(stop_words)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\-', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'[0-9]+','', text)
    text = re.sub(r'[^\w\s]', '', text)
    lower = text.lower()
    doc = nlp(lower)
    words = []
    for token in doc:
        lemma = token.lemma_
        if lemma not in stops:
                words.append(lemma)
    return words

In [17]:
stop_words =['general', 'assembly', 'conference', 'session', 'congratulations', 
             'congratulate', 'secretarygeneral','members', 'member', 'united', 'nations', 
             'nation', 'statement', 'honour','every', 'sir', 'majesty', 'president', 
             'minister', 'prime', 'ambassador', 'thank', 'thanks', 'world', 'international', 
             'states', 'we', 'us', 'they', 'system', 'organization','say', 'think', 'know', 
             'want', 'need', 'let', 'ask', 'go', 'look', 'stand', 'open', 'give', 'see', 'come', 
             'make', 'made', 'meet','act', 'use', 'take', 'bring', 'ensure', 'able', 'assume', 
             'continue', 'change', 'progress', 'process', 'year', 'years', 'time', 'today',  
             'would', 'will', 'might', 'together', 'common', 'future', 'one', 'order', 'end', 
             'new', 'necessary', 'major', 'minor', 'many', 'people', 'peoples', 'appropriate', 
             'historic', 'adequate', 'best', 'better', 'confident', 'important', 'special',
             'great', 'therefore', 'thus', 'hence', 'like', 'particularly', 'many', 'much', 
             'greater', 'especially', 'towards', 'always', 'whether', 'around',
             'possible', 'clear', 'simply', 'must', 'also', 'however', 'mr',
             'united', 'kingdom', 'great', 'britain', 'france', 'germany','italy', 'japan',
             'canada', 'usa', 'argentina', 'australia', 'china', 'brazil', 'india', 'indonesia',  
             'mexico', 'russia', 'saudi', 'arabia', 'south', 'korea', 'turkey','liechtenstein',
             'I', ' ', '  ']

In [18]:
#loading the bigram models from the bigram_trigram_model_notebooks
#temp_file1 = datapath("C:/DATA/DTM/phrasers/bigram_speeches")
#temp_file2 = datapath("C:/DATA/DTM/phrasers/trigram_speeches")
from gensim.models.phrases import Phrases
bigram_phraser = Phrases.load("C:/DATA/DTM/phrasers/bigram_speeches")
trigam_phraser = Phrases.load("C:/DATA/DTM/phrasers/trigram_speeches")

In [19]:
#Function to preprocess speeches and also detect bi and trigrams
def pre_proc_comb(corpus,stop_words=[]):
    
    '''Looping the pre-processing over a list. Also checks if input is correct'''
    
    l = []
    if isinstance(corpus, str):
        l.append(init_proc(corpus,stop_words))
    elif all(isinstance(s, str) for s in corpus):    
        for item in tqdm(corpus):
            l.append(init_proc(item,stop_words))
    else:
        print("Error: This function only accepts strings or a list of strings.")
    bigram_token = []
    for j in l:
        bigram_token.append(bigram_phraser[j])
    trigram_token = []
    for i in bigram_token:
        trigram_token.append(trigam_phraser[i])
    return trigram_token

**Preprocessing the 5 dataslices**

In [20]:
#70s
text_corpus_70s = G20_70s['text'].values.tolist()
processed_corpus_70s = pre_proc_comb(text_corpus_70s,stop_words)

100%|██████████| 7730/7730 [02:27<00:00, 52.46it/s]


In [21]:
#80s
text_corpus_80s = G20_80s['text'].values.tolist()
processed_corpus_80s = pre_proc_comb(text_corpus_80s,stop_words)

100%|██████████| 8461/8461 [02:47<00:00, 50.60it/s]


In [22]:
#90s
text_corpus_90s = G20_90s['text'].values.tolist()
processed_corpus_90s = pre_proc_comb(text_corpus_90s,stop_words)

100%|██████████| 8020/8020 [02:02<00:00, 65.71it/s]


In [23]:
#2000s
text_corpus_2000s = G20_2000s['text'].values.tolist()
processed_corpus_2000s = pre_proc_comb(text_corpus_2000s,stop_words)

100%|██████████| 5680/5680 [01:18<00:00, 71.92it/s]


In [24]:
#2010s
text_corpus_2010s = G20_2010s['text'].values.tolist()
processed_corpus_2010s = pre_proc_comb(text_corpus_2010s,stop_words)

100%|██████████| 5208/5208 [01:28<00:00, 58.96it/s]


## Building the LDA models for every decade sice 

### 70s

In [25]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_70s = corpora.Dictionary(processed_corpus_70s)
doc_term_matrix_70s = [dictionary_70s.doc2bow(speech) for speech in processed_corpus_70s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_70s = LDA(corpus = doc_term_matrix_70s, id2word = dictionary_70s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 39min 59s


In [26]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/70s")
lda_model_70s.save(temp_file)

In [27]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/70s")
lda_model_70s = LDA.load(temp_file)

In [28]:
#investigating the word distribution over the different topics
#lda_model_70s.print_topics()

In [29]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [30]:
vis70s = gensimvis.prepare(lda_model_70s,doc_term_matrix_70s,dictionary_70s,mds= 'mmds', R=30)
vis70s

### 80s

In [31]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_80s = corpora.Dictionary(processed_corpus_80s)
doc_term_matrix_80s = [dictionary_80s.doc2bow(speech) for speech in processed_corpus_80s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_80s = LDA(corpus = doc_term_matrix_80s, id2word = dictionary_80s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 40min 48s


In [32]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/80s")
lda_model_80s.save(temp_file)

In [33]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/80s")
lda_model_80s = LDA.load(temp_file)

In [34]:
#investigating the word distribution over the different topics
#lda_model_80s.print_topics()

In [35]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [36]:
vis80s = gensimvis.prepare(lda_model_80s,doc_term_matrix_80s,dictionary_80s,mds= 'mmds', R=30)
vis80s

### 90s

In [37]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_90s = corpora.Dictionary(processed_corpus_90s)
doc_term_matrix_90s = [dictionary_90s.doc2bow(speech) for speech in processed_corpus_90s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_90s = LDA(corpus = doc_term_matrix_90s, id2word = dictionary_90s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 32min 1s


In [38]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/90s")
lda_model_90s.save(temp_file)

In [39]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/90s")
lda_model_90s = LDA.load(temp_file)

In [40]:
#investigating the word distribution over the different topics
#lda_model_90s.print_topics()

In [41]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [42]:
vis90s = gensimvis.prepare(lda_model_90s,doc_term_matrix_90s,dictionary_90s,mds= 'mmds', R=30)
vis90s

### 2000s

In [43]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_2000s = corpora.Dictionary(processed_corpus_2000s)
doc_term_matrix_2000s = [dictionary_2000s.doc2bow(speech) for speech in processed_corpus_2000s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_2000s = LDA(corpus = doc_term_matrix_2000s, id2word = dictionary_2000s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 30min 15s


In [44]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2000s")
lda_model_2000s.save(temp_file)

In [45]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2000s")
lda_model_2000s = LDA.load(temp_file)

In [46]:
#investigating the word distribution over the different topics
#lda_model_2000s.print_topics()

In [47]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [48]:
vis2000s = gensimvis.prepare(lda_model_2000s,doc_term_matrix_2000s,dictionary_2000s,mds= 'mmds', R=30)
vis2000s

### 2010s

In [49]:
%%time
# Placing the processed corpus in necessary format for LDA

dictionary_2010s = corpora.Dictionary(processed_corpus_2010s)
doc_term_matrix_2010s = [dictionary_2010s.doc2bow(speech) for speech in processed_corpus_2010s]

# Creating the LDA object 

LDA = gensim.models.ldamodel.LdaModel

#building the model
lda_model_2010s = LDA(corpus = doc_term_matrix_2010s, id2word = dictionary_2010s, num_topics = 10, random_state=100,
               chunksize= 1000, passes = 1000, iterations = 500)

Wall time: 26min 18s


In [50]:
#Saving the model locally 
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2010s")
lda_model_2010s.save(temp_file)

In [51]:
#Loading the model if it is already built
LDA = gensim.models.ldamodel.LdaModel
temp_file = datapath("C:/DATA/DTM/models/LDA_speeches/2010s")
lda_model_2010s = LDA.load(temp_file)

In [52]:
#investigating the word distribution over the different topics
#lda_model_2010s.print_topics()

In [53]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()

In [54]:
vis2010s = gensimvis.prepare(lda_model_2010s,doc_term_matrix_2010s,dictionary_2010s,mds= 'mmds', R=30)
vis2010s