## Data Combination and Data Cleaning

**Speech Dataset**

In [2]:
# Load speech dataframe
import os
import numpy as np
import pandas as pd

sessions = np.arange(25, 76)
data=[]

for session in sessions:
    directory = "./TXT/Session "+str(session)+" - "+str(1945+session)
    for filename in os.listdir(directory):
        f = open(os.path.join(directory, filename), encoding="utf8")
        if filename[0]==".": #ignore hidden files
            continue
        splt = filename.split("_")
        data.append([session, 1945+session, splt[0], f.read()])

        
df_speech = pd.DataFrame(data, columns=['Session','Year','ISO-alpha3 Code','Speech'])

**Country-Name Dataset**

In [3]:
# Load UNSD dataframe(basically is the country-name dataset)
n = 16 #define the columns

# Load all the data using lineterminator = '\n' to get all the  
# columns that are misplaced because of the ',' inside them
unsd_df = pd.read_csv('UNSD — Methodology.csv', usecols=range(n), lineterminator='\n')
unsd_df

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed / Developing Countries\r
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,012,DZ,DZA,,,,Developing\r
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,,Developing\r
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,,Developing\r
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,,Developing\r
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,,Developing\r
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1,World,9.0,Oceania,61.0,Polynesia,,,Samoa,882,WS,WSM,,,x,Developing\r
245,1,World,9.0,Oceania,61.0,Polynesia,,,Tokelau,772,TK,TKL,,,,Developing\r
246,1,World,9.0,Oceania,61.0,Polynesia,,,Tonga,776,TO,TON,,,x,Developing\r
247,1,World,9.0,Oceania,61.0,Polynesia,,,Tuvalu,798,TV,TUV,x,,x,Developing\r


In [4]:
# Rename last column - remove the space(\s)
unsd_df.rename(columns={'Developed / Developing Countries\r': 'Developed / Developing Countries'}, inplace=True)

In [5]:
# Complete/Combine the name of the "Country or Area" that was misplaced into  
# the M49 Code column and adjust all other columns

for i,j in unsd_df["M49 Code"].items():
    if(len(j)>3):
        unsd_df.iloc[i, unsd_df.columns.get_loc('Country or Area')] += j
        unsd_df.iloc[i, unsd_df.columns.get_loc('M49 Code')] = unsd_df.iloc[i, unsd_df.columns.get_loc('ISO-alpha2 Code')]
        unsd_df.iloc[i, unsd_df.columns.get_loc('ISO-alpha2 Code')] = unsd_df.iloc[i, unsd_df.columns.get_loc('ISO-alpha3 Code')]
        unsd_df.iloc[i, unsd_df.columns.get_loc('ISO-alpha3 Code')] = unsd_df.iloc[i, unsd_df.columns.get_loc('Least Developed Countries (LDC)')]
        unsd_df.iloc[i, unsd_df.columns.get_loc('Least Developed Countries (LDC)')] = unsd_df.iloc[i, unsd_df.columns.get_loc('Land Locked Developing Countries (LLDC)')]
        unsd_df.iloc[i, unsd_df.columns.get_loc('Land Locked Developing Countries (LLDC)')] = unsd_df.iloc[i, unsd_df.columns.get_loc('Small Island Developing States (SIDS)')]        
        unsd_df.iloc[i, unsd_df.columns.get_loc('Small Island Developing States (SIDS)')] = unsd_df.iloc[i, unsd_df.columns.get_loc('Developed / Developing Countries')]
        unsd_df.iloc[i, unsd_df.columns.get_loc('Developed / Developing Countries')] = "Developing"

In [6]:
# Merge Speech and Country-Name dataframe
speech_and_unsd_df = pd.merge(unsd_df, df_speech, on="ISO-alpha3 Code")

# Select specific columns to the final Speech and Country-Name dataframe
speech_and_countryName_df = speech_and_unsd_df[['Region Name', 'Country or Area', 'Session', 'Year', 'Speech']].copy()
speech_and_countryName_df

Unnamed: 0,Region Name,Country or Area,Session,Year,Speech
0,Africa,Algeria,25,1970,1. The delegation of Algeria is very pleased ...
1,Africa,Algeria,26,1971,\n154.\t : It is not only in order to keep up ...
2,Africa,Algeria,27,1972,"Mr. President, in electing you to preside over..."
3,Africa,Algeria,28,1973,"﻿121.\tMr. President, since a tradition appear..."
4,Africa,Algeria,29,1974,"Mr. President, it would be ungracious of the r..."
...,...,...,...,...,...
8379,Oceania,Tuvalu,71,2016,On behalf of the Government and people of Tuva...
8380,Oceania,Tuvalu,72,2017,"Next week, on 1 October, Tuvalu will mark the ..."
8381,Oceania,Tuvalu,73,2018,"It gives me great pleasure, on behalf of the G..."
8382,Oceania,Tuvalu,74,2019,"On behalf of Tuvalu and on my own behalf, I co..."


**Happiness Dataset**

In [7]:
# Load hapiness dataframe
happinessdataframe = pd.read_excel('DataPanelWHR2021C2.xls', index_col=[0,1])
happinessdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
Country name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,2008,3.723590,7.370100,0.450662,50.799999,0.718114,0.167640,0.881686,0.517637,0.258195
Afghanistan,2009,4.401778,7.539972,0.552308,51.200001,0.678896,0.190099,0.850035,0.583926,0.237092
Afghanistan,2010,4.758381,7.646709,0.539075,51.599998,0.600127,0.120590,0.706766,0.618265,0.275324
Afghanistan,2011,3.831719,7.619532,0.521104,51.919998,0.495901,0.162427,0.731109,0.611387,0.267175
Afghanistan,2012,3.782938,7.705479,0.520637,52.240002,0.530935,0.236032,0.775620,0.710385,0.267919
...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2016,3.735400,7.984372,0.768425,54.400002,0.732971,-0.094634,0.723612,0.737636,0.208555
Zimbabwe,2017,3.638300,8.015738,0.754147,55.000000,0.752826,-0.097645,0.751208,0.806428,0.224051
Zimbabwe,2018,3.616480,8.048798,0.775388,55.599998,0.762675,-0.068427,0.844209,0.710119,0.211726
Zimbabwe,2019,2.693523,7.950132,0.759162,56.200001,0.631908,-0.063791,0.830652,0.716004,0.235354


In [8]:
# happinessdataframe rename index from 'Country name' to'Country or Area'
happinessdataframe.index.names = ['Country or Area', 'Year']

In [9]:
# Merge Speech-Country-Name dataframe with the Happiness dataframe 
# on the columns 'Country or Area' and 'Year' 
all_data_df = pd.merge(speech_and_countryName_df, happinessdataframe, left_on=['Country or Area','Year'], right_on=['Country or Area','Year'], right_index=True)

# Create indexes on the columns 'Country or Area' and 'Year' 
all_data_df = all_data_df.set_index(['Country or Area','Year'])

# Create two dataframes one for speeches tokenized, and one for speeches tokennized and FreqDist
all_data_tokenized_df = all_data_df.copy()
all_data_tokenized_FreqDist_df = all_data_df.copy()
data_word_vector_df = all_data_df[["Speech"]].copy()

# This is the unmerged Speech dataset
data_word_vector_df_unmerged = speech_and_countryName_df
data_word_vector_df_unmerged =data_word_vector_df_unmerged.set_index(["Country or Area","Year"])



Download (in case you haven't already done so)

In [None]:
# import nltk

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')

**OPTION 1) Run this if you want a dataframe merged with happiness**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.probability import FreqDist
import string

# Create all punctuation string variable
punct = '!"#$%&\'()*+0123456789,’-—./:;<=>?@[\\]^_`{}~[\n]'
# Create a mapping table that will have as key the punctuation and as value an empty string
transtab = str.maketrans(dict.fromkeys(punct, ''))

# Loop through all the cells of "Speech" column
for county_year_index,cell in data_word_vector_df["Speech"].items():
    # Remove all punctuations and convert the text to lowercase
    words = word_tokenize(cell.translate(transtab).lower())
    # Create an array that has all the words that don't give information
    notuseful_words = stopwords.words("english")
    # Create and fill an empty array to gather all the important words of every "Speech" cell
    useful_words = []
    for w in words:
        if (w not in notuseful_words) and (len(w) > 2):
            useful_words.append(w)
    # Fill the dataframe with the text of "Speech" for each cell
    data_word_vector_df["Speech"][county_year_index] = ' '.join(useful_words)


**OPTION 2) You can run this instead of the above if you want to make a dataframe with word count from the unmerged example, it will take some minutes to finish around 2-4**

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.probability import FreqDist
import string

# Create all punctuation string variable
punct = '!"#$%&\'()*+0123456789,’-—./:;<=>?@[\\]^_`{}~[\n]'
# Create a mapping table that will have as key the punctuation and as value an empty string
transtab = str.maketrans(dict.fromkeys(punct, ''))

# Loop through all the cells of "Speech" column
for county_year_index,cell in data_word_vector_df_unmerged["Speech"].items():
    # Remove all punctuations and convert the text to lowercase
    words = word_tokenize(cell.translate(transtab).lower())
    # Create an array that has all the words that don't give information
    notuseful_words = stopwords.words("english")
    # Create and fill an empty array to gather all the important words of every "Speech" cell
    useful_words = []
    for w in words:
        if (w not in notuseful_words) and (len(w) > 2):
            useful_words.append(w)
    # Fill the dataframe with the text of "Speech" for each cell
    data_word_vector_df_unmerged["Speech"][county_year_index] = ' '.join(useful_words)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_word_vector_df_unmerged["Speech"][county_year_index] = ' '.join(useful_words)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer


count_vect = CountVectorizer()

word_count_df = count_vect.fit_transform(data_word_vector_df["Speech"])
speechOnlyDf = pd.DataFrame(word_count_df.toarray() ,columns= count_vect.get_feature_names())

word_count_df_unmerged = count_vect.fit_transform(data_word_vector_df_unmerged["Speech"])
speechOnlyDfUnmerged = pd.DataFrame.sparse.from_spmatrix(word_count_df_unmerged,columns=count_vect.get_feature_names())


**This is used to create a happines and speech dataframe. It creates a dataframe with the whole speech merged with happiness**\
**You NEED to run this if you want the following cells to play |OR| You can skip some cells below, there are comments to find it**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.probability import FreqDist
import string

# Create all punctuation string variable
punct = '!"#$%&\'()*+0123456789,’-—./:;<=>?@[\\]^_`{}~[\n]'
# Create a mapping table that will have as key the punctuation and as value an empty string
transtab = str.maketrans(dict.fromkeys(punct, ''))

# Loop through all the cells of "Speech" column
for i,j in all_data_df["Speech"].items():
    # Remove all punctuations and convert the text to lowercase
    words = word_tokenize(j.translate(transtab).lower())
    # Create an array that has all the words that don't give information
    sw = stopwords.words("english")
    # Create and fill an empty array to gather all the important words of every "Speech" cell
    no_sw = []
    for w in words:
        if (w not in sw) and (len(w) > 2):
            no_sw.append(w)
    # Fill the dataframe with the tokenized "Speech" for each cell
    all_data_tokenized_df["Speech"][i] = no_sw
    # Fill the dataframe with the word-count of the tokenized "Speech" for each cell
    all_data_tokenized_FreqDist_df["Speech"][i] = FreqDist(no_sw)

Just some 1 visualization for better understanding and some useful keywords

In [None]:
all_data_tokenized_FreqDist_df["Speech"][1].plot(20)

**Data Cleaning**

In [None]:
# Observe the mean values for each numerical column
all_data_tokenized_FreqDist_df.describe()


In [None]:
# Count how many NaN values we have per column
all_data_tokenized_FreqDist_df.isnull().sum()


In [None]:
# Here is depicted that the null values are all float64 type 
all_data_tokenized_FreqDist_df.dtypes

Keep one of the two approaches !!!

In [None]:
# Approach 2

# Remove all NaN values
all_data_tokenized_FreqDist_df =all_data_tokenized_FreqDist_df.dropna()

all_data_tokenized_df =all_data_tokenized_df.dropna()

In [None]:
# The only column that we should consider if it worthy to remove duplicates is "Session"

# Food for thought
# It is possible that there are two sessions rows with the same session for two different countries

# all_data_tokenized_FreqDist_mean_df = all_data_tokenized_FreqDist_mean_df.drop_duplicates(subset=['Session'])
# len(all_data_tokenized_df)

In [None]:
# Print the available values in column "Session"
all_data_tokenized_FreqDist_df['Session'].unique()

Removing Outliers

In [None]:
from scipy import stats

# A basic way to remove outliers with Z-score
# Reference : https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame

# I am not sure if we should remove the outliers ????

# I do not think we should remove any outliers

numeric_df = all_data_tokenized_FreqDist_df[['Life Ladder', 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect']].copy()
all_data_tokenized_FreqDist_outliers_df = all_data_tokenized_FreqDist_df[(np.abs(stats.zscore(numeric_df)) < 3).all(axis=1)]


**----SKIP HERE IF YOU DIDN'T RUN THE CELL MENTIONED ABOVE----**

In [12]:
#Join to create a very nice and handy dataframe of all words with index the country and year

countryYearWordsUnmerged = pd.DataFrame()
countryYearWordsUnmerged = speech_and_countryName_df.join(speechOnlyDfUnmerged)
countryClassifierWordsUnmerged = speech_and_countryName_df.join(speechOnlyDfUnmerged)


In [13]:
# Drop the speech column cause it contains all the info we dont need anymore
countryYearWordsUnmerged= countryYearWordsUnmerged.drop(["Speech"], axis = 1)
countryClassifierWordsUnmerged= countryClassifierWordsUnmerged.drop(["Speech"], axis = 1)


In [13]:
# Index by country and year

countryYearWordsUnmerged.set_index(["Country or Area", "Year"],inplace=True)
countryYearWordsUnmerged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Region Name,Session,aaa,aac,aachen,aacknowledged,aacrev,aadd,aadda,aaddi,...,сөйлемек,тhomson,хxi,шмс,шоп,шьа,ьол,қарекет,қылмақ,ﬂagrant
Country or Area,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Algeria,1970,Africa,25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Algeria,1971,Africa,26,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Algeria,1972,Africa,27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Algeria,1973,Africa,28,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Algeria,1974,Africa,29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Drop Session which is useless and reset the indexes

In [34]:

countryYearWordsUnmerged =  countryYearWordsUnmerged.drop(["Session"], axis= 1)
countryYearWordsUnmerged.reset_index(drop=True,inplace=True)
countryYearWordsUnmerged.head()

Unnamed: 0,Region Name,aaa,aac,aachen,aacknowledged,aacrev,aadd,aadda,aaddi,aaddj,...,сөйлемек,тhomson,хxi,шмс,шоп,шьа,ьол,қарекет,қылмақ,ﬂagrant
0,Africa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Africa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Africa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Africa,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Africa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Turn Africa in 1 and other regions to 0, so we can see if we can predict African speeches

In [35]:
replace_values = { "Africa" : 1, "Americas" : 0,"Europe" : 0, "Asia" : 0, "Oceania" : 0}

countryYearWordsUnmerged.replace({"Region Name" : replace_values}, inplace=True)
countryYearWordsUnmerged.tail()

Unnamed: 0,Region Name,aaa,aac,aachen,aacknowledged,aacrev,aadd,aadda,aaddi,aaddj,...,сөйлемек,тhomson,хxi,шмс,шоп,шьа,ьол,қарекет,қылмақ,ﬂagrant
8379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Create the target, which is our region classification, and the data, which is our words**

In [36]:
target= countryYearWordsUnmerged["Region Name"]
inputs = countryYearWordsUnmerged.drop("Region Name", axis="columns")

Make a split to our data set


In [42]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)


Info about our test and train sets

In [43]:
len(x_test)

839

Import MN Naive Bayes and fit our model

In [53]:
from sklearn.naive_bayes import MultinomialNB

model= MultinomialNB(alpha= 0.01, fit_prior=False)
model.fit(x_train,y_train)

MultinomialNB(alpha=0.01, fit_prior=False)

Our score


In [54]:
model.score(x_test,y_test)

0.9570917759237187

Hyperparameter Tuning with GridSearch



In [51]:
## DO NOT RUN THIS, IT TAKES 6 HOURS
from sklearn.model_selection import GridSearchCV

parameters = {
    'alpha': (1, 1e-1, 1e-2, 1e-3),
    'fit_prior': (True, False)
}
grid_model = GridSearchCV(estimator= model, param_grid= parameters, cv=5);
grid_model = grid_model.fit(x_train,y_train)

Best possible parameters and score for our model

In [52]:
print(grid_model.best_score_)
print(grid_model.best_params_)

0.9610337972166997
{'alpha': 0.01, 'fit_prior': False}


------------ComplementNB implementation for country. This is the implementation for **Country Classification**------

In [64]:
#Find the mode of countries

countries = countryClassifierWordsUnmerged.loc[:,"Country or Area"].values
countries = np.unique(countries)
countriesOccuranceMode = countryClassifierWordsUnmerged["Country or Area"].value_counts().mode
countriesOccuranceMode

<bound method Series.mode of Congo                         51
Belarus                       51
Iran (Islamic Republic of)    51
Iceland                       51
Indonesia                     51
                              ..
Kiribati                      18
Holy See                      18
Montenegro                    15
Serbia                        15
South Sudan                    9
Name: Country or Area, Length: 195, dtype: int64>

In [97]:
# Place the mode to filter countries that only fit that mode
countriesUnique = countryClassifierWordsUnmerged["Country or Area"].value_counts()
countriesThatFitMode = countriesUnique[countriesUnique == 51] #The 51 is the mode we found earlier
#Get a random country from the list
randCountry = countriesThatFitMode.sample(n=1, random_state= 10)
#Get country name
randCountry = randCountry.index[0]
#Add countries in a dictionary 
replaceDictionaryForCountries = {}
for country in countries:
  if country == randCountry:
    replaceDictionaryForCountries[country] = 1
  else:
    replaceDictionaryForCountries[country] = 0
# Split the set into target and input
countryClassifierWordsUnmerged.replace({"Country or Area" : replaceDictionaryForCountries}, inplace=True)
classTargets = countryClassifierWordsUnmerged["Country or Area"]

classInputs = countryClassifierWordsUnmerged.drop(["Region Name", "Session", "Year","Country or Area"], axis=1)




0       0
1       0
2       0
3       0
4       0
       ..
8379    0
8380    0
8381    0
8382    0
8383    0
Name: Country or Area, Length: 8384, dtype: int64
      aaa  aac  aachen  aacknowledged  aacrev  aadd  aadda  aaddi  aaddj  \
0       0    0       0              0       0     0      0      0      0   
1       0    0       0              0       0     0      0      0      0   
2       0    0       0              0       0     0      0      0      0   
3       0    0       0              0       0     1      0      0      0   
4       0    0       0              0       0     0      0      0      0   
...   ...  ...     ...            ...     ...   ...    ...    ...    ...   
8379    0    0       0              0       0     0      0      0      0   
8380    0    0       0              0       0     0      0      0      0   
8381    0    0       0              0       0     0      0      0      0   
8382    0    0       0              0       0     0      0      0      0   
8383

Split the countries set into data set and train set


In [105]:
from sklearn.model_selection import train_test_split

x_words_train, x_words_test, y_country_train, y_country_test = train_test_split(classInputs, classTargets, test_size=0.3)

Implement the Complement model


In [106]:
from sklearn.naive_bayes import ComplementNB

complementNBModel = ComplementNB(alpha = 0.01)
complementNBModel.fit(x_words_train,y_country_train)

ComplementNB(alpha=0.01)

In [107]:
complementNBModel.score(x_words_test,y_country_test)

0.9904610492845787

Implement the Multinomial model

In [108]:
from sklearn.naive_bayes import MultinomialNB

multinomialNBModel= MultinomialNB(alpha= 0.01)
multinomialNBModel.fit(x_words_train,y_country_train)

MultinomialNB(alpha=0.01)

In [109]:
multinomialNBModel.score(x_words_test,y_country_test)

0.9912559618441972