In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import re
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [2]:
#peak into data
data = pd.read_csv('un-general-debates.csv')
data.head()

Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [3]:
#source of iso code database: https://unstats.un.org/unsd/methodology/m49/overview/
#data set represents country names with their 3-letter ISO-alpha
# To convert the ISO codes into country names, performed a left join
#dataset also includes region(continent) to enable filtering of African countries
#remove a duplicate column, "ISO-alpha3 Code"
data = pd.read_csv('un-general-debates.csv').drop('session', axis=1)
country_names = pd.read_excel ('UNSD — Methodology.xlsx')
data = pd.merge(data, country_names[['Region Name','Country or Area','ISO-alpha3 Code']],
             how='left', left_on='country', right_on='ISO-alpha3 Code')
data.drop('ISO-alpha3 Code',axis=1, inplace=True)
data.rename(columns = {'Country or Area': 'country_name'}, inplace = True)
data.head()



Unnamed: 0,year,country,text,Region Name,country_name
0,1989,MDV,﻿It is indeed a pleasure for me and the member...,Asia,Maldives
1,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ...",Europe,Finland
2,1989,NER,"﻿\nMr. President, it is a particular pleasure ...",Africa,Niger
3,1989,URY,﻿\nDuring the debate at the fortieth session o...,Americas,Uruguay
4,1989,ZWE,﻿I should like at the outset to express my del...,Africa,Zimbabwe


In [4]:
#Limiting country of interests to the  Africa continent
data = data.loc [data['Region Name']=='Africa']
data.nunique()

year              46
country           54
text            2159
Region Name        1
country_name      54
dtype: int64

In [5]:
def clean(s):    
    # Remove any tags:
    cleaned = re.sub(r"(?s)<.?>", " ", s)
    # Keep only regular chars:
    cleaned = re.sub(r"[^A-Za-z0-9(),*!?\'\`]", " ", cleaned)
    # Remove unicode chars
    cleaned = re.sub("\\\\u(.){4}", " ", cleaned)
    return cleaned.strip()

# clean text
data['text'] = data.text.apply(lambda x: clean(x))

In [6]:
# remove data with null value in year column
data = data[data['year'].notnull()]

# Group data by country and into 5 year periods
data['year'] = (data['year'] / 5).astype(int)*5
data = data.groupby(['country', 'year', 'country_name'])['text'].apply(list)
data = data.apply(lambda x: ''.join(x))
data = data.reset_index(drop=False)

data[:20]

Unnamed: 0,country,year,country_name,text
0,AGO,1975,Angola,1 On analysing the agenda of the thirty third...
1,AGO,1980,Angola,"A few days ago, we had the legitimate satisfac..."
2,AGO,1985,Angola,"Mr President, today I have the honour of addr..."
3,AGO,1990,Angola,Allow me first to congratulate Mr Shihabi on ...
4,AGO,1995,Angola,Allow me at the outset to congratulate Mr Ism...
5,AGO,2000,Angola,"Allow me, on behalf of my Government and in my..."
6,AGO,2005,Angola,I am particularly honoured to address the Gen...
7,AGO,2010,Angola,On behalf of the President of the Republic of...
8,AGO,2015,Angola,"At the outset, on behalf of the President of A..."
9,BDI,1970,Burundi,"1 Mr President, this great Assembly made a v..."


In [7]:
# Create 5000 TF-IDF features, using 3-gram
num_features = 5000
tfidf = TfidfVectorizer(max_features = num_features, strip_accents='unicode',
                        lowercase=True, stop_words='english', ngram_range=(1,3))
print('Fitting Data...')
tfidf.fit(data['text'].values.astype('U'))

print('Starting Transform...')
text_tfidf = tfidf.transform(data['text'])

print('Label and Incorporate TF-IDF...')
data_array = pd.DataFrame(text_tfidf.toarray())
feature_names = tfidf.get_feature_names()

for i in range(num_features):
    feature_names[i] = 'TF_' + feature_names[i]

data_array.columns = feature_names
data = pd.concat([data, data_array], axis=1)

data[:2]

Fitting Data...
Starting Transform...
Label and Incorporate TF-IDF...


Unnamed: 0,country,year,country_name,text,TF_000,TF_06,TF_09,TF_10,TF_10 years,TF_100,...,TF_yugoslavia,TF_zaire,TF_zambia,TF_zimbabwe,TF_zimbabwe namibia,TF_zionism,TF_zionist,TF_zone,TF_zone peace,TF_zones
0,AGO,1975,Angola,1 On analysing the agenda of the thirty third...,0.006471,0.0,0.0,0.004709,0.0,0.0,...,0.0,0.012208,0.017587,0.073176,0.012412,0.037237,0.0,0.013562,0.019426,0.0
1,AGO,1980,Angola,"A few days ago, we had the legitimate satisfac...",0.007096,0.0,0.0,0.008607,0.0,0.00278,...,0.0,0.0,0.006428,0.010699,0.0,0.013611,0.004159,0.004957,0.007101,0.0


In [8]:
# create list of TF-IDF features, confirm they make sense
features = data.columns.tolist()
for i in [ 'year', 'country_name','text']:
    features.remove(i)
features

['country',
 'TF_000',
 'TF_06',
 'TF_09',
 'TF_10',
 'TF_10 years',
 'TF_100',
 'TF_103',
 'TF_104',
 'TF_105',
 'TF_106',
 'TF_107',
 'TF_108',
 'TF_109',
 'TF_11',
 'TF_11 september',
 'TF_110',
 'TF_111',
 'TF_112',
 'TF_113',
 'TF_114',
 'TF_115',
 'TF_116',
 'TF_117',
 'TF_118',
 'TF_119',
 'TF_12',
 'TF_120',
 'TF_122',
 'TF_123',
 'TF_124',
 'TF_125',
 'TF_126',
 'TF_128',
 'TF_129',
 'TF_13',
 'TF_130',
 'TF_135',
 'TF_138',
 'TF_14',
 'TF_15',
 'TF_150',
 'TF_1514',
 'TF_1514 xv',
 'TF_16',
 'TF_17',
 'TF_174',
 'TF_18',
 'TF_181',
 'TF_19',
 'TF_1945',
 'TF_1960',
 'TF_1967',
 'TF_1967 338',
 'TF_1967 338 1973',
 'TF_1970',
 'TF_1971',
 'TF_1972',
 'TF_1973',
 'TF_1974',
 'TF_1975',
 'TF_1976',
 'TF_1977',
 'TF_1978',
 'TF_1979',
 'TF_1980',
 'TF_1980s',
 'TF_1981',
 'TF_1982',
 'TF_1983',
 'TF_1984',
 'TF_1985',
 'TF_1986',
 'TF_1987',
 'TF_1988',
 'TF_1989',
 'TF_1990',
 'TF_1990s',
 'TF_1991',
 'TF_1992',
 'TF_1993',
 'TF_1994',
 'TF_1995',
 'TF_1996',
 'TF_1997',
 'TF_19

In [12]:
def similarity(year, count=10, same_year=True):
    '''finds most similar and least similiar countries'''
    df = data.copy()
    df['total'] = df[features].sum(axis=1).abs()
    df = df.sort_values(by='total', ascending=True).reset_index(drop=True)
    if same_year == True:
        df = df[df['year'] == year]
        most_similar = df[1:1 + count].country_name 
        least_similar = df[-count:].country_name
    else:
        most_similar = df[1:1 + count].country_name + [' ' for i in range(count)] + ['(' + str(i) + ')' for i in df[1:1 + count].year]
        least_similar = df[-count:].country_name + [' ' for i in range(count)] + ['(' + str(i) + ')' for i in df[-count:].year]
    print(str(year))
    print('most similar:')
    print(most_similar.values)
    print()
    print('least similar:')
    print(least_similar.values)
    print()
    print()
    return ''
similarity(1995)

1995
most similar:
['Sao Tome and Principe' 'Rwanda' 'Eritrea' 'Equatorial Guinea' 'Malawi'
 'Central African Republic' 'Burundi' 'Guinea-Bissau' 'Ethiopia'
 'Sierra Leone']

least similar:
['Namibia' 'Cabo Verde' 'Mauritania' 'United Republic of Tanzania'
 'Algeria' 'Senegal' 'Egypt' 'Ghana' 'Djibouti' 'Mozambique']




''

In [13]:
similarity(2005)

2005
most similar:
['Burkina Faso' 'Sierra Leone' 'Somalia' 'Eswatini' 'Tunisia' 'Ethiopia'
 'Eritrea' 'Cabo Verde' 'Seychelles' 'Djibouti']

least similar:
['Lesotho' 'Zimbabwe' 'Congo' 'Botswana' 'Benin' 'Mozambique' 'Gambia'
 'Algeria' 'Angola' 'Guinea']




''