# Constructing a bigram and trigram model (per paragraph)

In [2]:
## Required libraries

#System
import os
from tqdm import tqdm 

#Data structure manipulation
import pandas as pd
import numpy as np

#text cleaning 
import re
import string

#nlp
import nltk
from nltk.corpus import stopwords
import spacy

# unlist nested lists
import itertools
from itertools import chain

# count word frequencies
#from collections import defaultdict

# gensim
import gensim

import warnings
warnings.filterwarnings('ignore')

## Reading in the Data 

In [4]:
#Path to where the raw text files are stored within the session folders, i.e, converted sessions.
origin_path = "C:/DATA/convSes"

In [5]:
#A function for reading in the speeches
def read_text(file_path, file):
    
    '''Reading the text files'''
    
    with open(file_path, 'r', encoding='utf-8') as file:
        doc=file.read()
    return doc

In [6]:
#Bringing in all the speeches
doc_set = []
for i in range (0,50):
    year = 1970 + i
    session = "session " + str(25+i)+ " - "  + str(year)
    data_path = f"{origin_path}\\{session}"
    os.chdir(data_path)
    for file in os.listdir():
        if file[-4:]=='.txt':
            file_path = f"{data_path}\\{file}"
            doc_set.append({'Year': year, 'ISO_Code': file[:3] , 'text': read_text(file_path, file)})
        else:
            print(file)
            pass

word2vec.model


In [7]:
columns=['Year', 'ISO_Code', 'text']
dataset = pd.concat([pd.DataFrame([i], columns=columns) for i in tqdm(doc_set)], ignore_index=True)
dataset.head()

100%|██████████| 8288/8288 [00:04<00:00, 1864.57it/s]


Unnamed: 0,Year,ISO_Code,text
0,1970,ALB,33: May I first convey to our President the co...
1,1970,ARG,177.\t : It is a fortunate coincidence that pr...
2,1970,AUS,100.\t It is a pleasure for me to extend to y...
3,1970,AUT,155.\t May I begin by expressing to Ambassado...
4,1970,BEL,"176. No doubt each of us, before coming up to ..."


**Creating a Subet including only G20 states**

In [9]:
# only select states belonging to the G20 group (minus south afrika and EU representatives)
g20 =  dataset.ISO_Code.isin(['CAN','FRA', 'DEU', 'USA', 'GBR', 'ITA', 'JPN','ARG', 'Aus', 'BRA', 'IND', 'IDN', 
                        'CAN', 'MEX', 'RUS', 'SAU', 'KOR', 'TUR', 'CHN'])
G20 = dataset[g20]

# reset ascending index for subset dataset
G20.reset_index(inplace = True, drop = True)
G20.head()

Unnamed: 0,Year,ISO_Code,text
0,1970,ARG,177.\t : It is a fortunate coincidence that pr...
1,1970,BRA,"1.\tMr. President, I should like, first of all..."
2,1970,CAN,The General Assembly is fortunate indeed to ha...
3,1970,FRA,"84.\t Within one month, when we celebrate the..."
4,1970,GBR,"110.\t Mr. President, I should like first to s..."


**Split Speeches Into Seperate Paragraphs** 

In [10]:
# Split speeches by delimiter \n and...
# ...stack a new row for each paragraph to dataframe... 
# ...with context info from corresponding speech.

temporary_file = G20['text'].str.split('\.\s?\s?\s?\n', expand=True).stack().reset_index(level=0)
temporary_file.rename(columns={'level_0': 'speech_index', 0: 'text'}, inplace=True)
# temporary_file.reset_index().drop('index',1)

# merge context info from speeches with texts of paragraphs
G20 = G20.drop('text', 1) # drop original text column
G20 = temporary_file.merge(G20, right_index=True, left_on='speech_index', how='outer')

# reorder columns in new datset
G20['paragraph_index'] = np.arange(len(G20))
cols = G20.columns.tolist()
cols = cols[-1:] + cols[:-1]
G20 = G20[cols].set_index('paragraph_index')
G20 = G20[1:]

In [11]:
# testing a random sample of paragraphs
G20.sample(10)
# some paragraphs might be empty, this is dealth with below in the preprocessor

Unnamed: 0_level_0,speech_index,text,Year,ISO_Code
paragraph_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7237,145,32.\tThis Assembly is meeting again after one ...,1979,IND
22826,447,The fact that South America is a region in whi...,1998,BRA
23664,466,Cooperation should be strengthened on the basi...,1999,CHN
6865,137,294.\tFunctioning indisputably as the underlyi...,1979,ARG
16873,326,When the sudden aggression of the Iraqi regime...,1990,SAU
14722,281,As part of any global strategy it is essential...,1988,ARG
10445,202,44.\tIf the present crisis is to have a renova...,1983,BRA
19632,380,United Nations action must enjoy the resolute\...,1994,ARG
15763,302,Many developing countries do not always have s...,1989,FRA
21502,417,We are contributing 312 million deutsche mark ...,1996,DEU


In [12]:
G20['text'].shape

(35103,)

## Pre-Processing

In [14]:
text_corpus = G20['text'].values.tolist()

In [15]:
nlp = spacy.load('en_core_web_lg')#run in conda to download the library --> python -m download en_core_web_lg 

In [16]:
#Function to preprocess elements for a single list, e.g. one speech or one paragraph
def init_proc(text, stop_words=[]):
    
    '''Pre-processing the input in single list'''
    
    stops = stopwords.words("english")
    stops.extend(stop_words)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\-', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'[0-9]+','', text)
    text = re.sub(r'[^\w\s]', '', text)
    lower = text.lower()
    doc = nlp(lower)
    words = []
    for token in doc:
        lemma = token.lemma_
        if lemma not in stops:
                words.append(lemma)
    return words

In [17]:
#function that loops the init_proc over a list, e.g. a list of speeches, or a list of paragraphs
def pre_proc_comb(corpus,stop_words=[]):
    
    '''Looping the pre-processing over a list. Also checks if input is correct'''
    
    l = []
    if isinstance(corpus, str):
        l.append(init_proc(corpus,stop_words))
    elif all(isinstance(s, str) for s in corpus):    
        for item in tqdm(corpus):
            l.append(init_proc(item,stop_words))
    else:
        print("Error: This function only accepts strings or a list of strings.")
    return l

In [18]:
#user defined stopwords on top of nltk default stopwords
stop_words =['general', 'assembly', 'conference', 'session', 'congratulations', 
             'congratulate', 'secretarygeneral','members', 'member', 'united', 'nations', 
             'nation', 'statement', 'honour','every', 'sir', 'majesty', 'president', 
             'minister', 'prime', 'ambassador', 'thank', 'thanks', 'world', 'international', 
             'states', 'we', 'us', 'they', 'system', 'organization','say', 'think', 'know', 
             'want', 'need', 'let', 'ask', 'go', 'look', 'stand', 'open', 'give', 'see', 'come', 
             'make', 'made', 'meet','act', 'use', 'take', 'bring', 'ensure', 'able', 'assume', 
             'continue', 'change', 'progress', 'process', 'year', 'years', 'time', 'today',  
             'would', 'will', 'might', 'together', 'common', 'future', 'one', 'order', 'end', 
             'new', 'necessary', 'major', 'minor', 'many', 'people', 'peoples', 'appropriate', 
             'historic', 'adequate', 'best', 'better', 'confident', 'important', 'special',
             'great', 'therefore', 'thus', 'hence', 'like', 'particularly', 'many', 'much', 
             'greater', 'especially', 'towards', 'always', 'whether', 'around',
             'possible', 'clear', 'simply', 'must', 'also', 'however', 'mr',
             'united', 'kingdom', 'great', 'britain', 'france', 'germany','italy', 'japan',
             'canada', 'usa', 'argentina', 'australia', 'china', 'brazil', 'india', 'indonesia',  
             'mexico', 'russia', 'saudi', 'arabia', 'south', 'korea', 'turkey','liechtenstein',
             'I', ' ', '  ']

In [19]:
# preprocessing the whole corpus of speeches (per paragraph) 

In [20]:
processed_corpus = pre_proc_comb(text_corpus, stop_words)

100%|██████████| 35103/35103 [09:55<00:00, 58.93it/s]


In [21]:
bigram_phrases = gensim.models.Phrases(processed_corpus,
                                       min_count = 5,#Ignore all words and bigrams with total collected count lower than this value
                                       threshold = 100)#The minimum score for a bigram to be taken into account.
trigram_phrases = gensim.models.Phrases(bigram_phrases[processed_corpus],threshold = 100)
bigram = gensim.models.phrases.Phraser = (bigram_phrases)
trigram = gensim.models.phrases.Phraser = (trigram_phrases)

bigram.save("C:/DATA/DTM/phrasers/bigram_paragraphs")
trigram.save("C:/DATA/DTM/phrasers/trigram_paragraphs")