In [None]:
import os
import numpy as np
import pandas as pd
import nltk

# Fetch the NLTK resources requested by this notebook (tokenizer models,
# stop-word lists and the VADER sentiment lexicon).
for resource in ('punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(resource)

# Session numbers 25..75; the year of a session is computed as 1945 + session,
# so the folders read below cover 1970..2020.
sessions = np.arange(25, 76)
data = []

for session in sessions:
    directory = "./TXT/Session " + str(session) + " - " + str(1945 + session)
    for filename in os.listdir(directory):
        # Skip hidden files *before* opening them, so no file handle is
        # created for entries that are never read.
        if filename[0] == ".":
            continue
        # The part of the file name before the first "_" is stored in the
        # 'ISO-alpha3 Code' column below.
        splt = filename.split("_")
        # The context manager closes each speech file right after reading it.
        with open(os.path.join(directory, filename), encoding='utf-8') as f:
            data.append([session, 1945 + session, splt[0], f.read()])

# One row per speech file: session number, year, country code, full text.
df_speech = pd.DataFrame(data, columns=['Session', 'Year', 'ISO-alpha3 Code', 'Speech'])


In [None]:
# Country/region lookup table; merged with the speeches in the next cell.
df_codes = pd.read_csv(filepath_or_buffer='UNSD — Methodology.csv')

In [None]:
# Join the country metadata with the speeches (pandas merges on the columns
# the two frames have in common), keep the columns used by the analysis and
# index every speech by (Year, ISO-alpha3 Code).
df_un_merged = (
    df_codes.merge(df_speech)[
        [
            "Country or Area",
            "Region Name",
            "Sub-region Name",
            "ISO-alpha3 Code",
            "Developed / Developing Countries",
            "Session",
            "Year",
            "Speech",
        ]
    ]
    .set_index(["Year", "ISO-alpha3 Code"])
)
df_un_merged

In [None]:
from nltk.probability import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords

def preprocess(words, stop_words=None):
    """Lowercase tokens and drop stop words and non-alphabetic tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens to clean.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, the English stop-word list
        from ``nltk.corpus.stopwords`` is used.

    Returns
    -------
    list of str
        The lowercased tokens that are purely alphabetic and are not stop
        words, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests instead of scanning a list
    # for every token.
    stop_set = set(stop_words)

    no_sw = []
    for w in words:
        lowered = w.lower()
        # Compare the *lowercased* token with the stop words: otherwise
        # capitalised stop words such as "The" slip through the filter.
        if w.isalpha() and lowered not in stop_set:
            no_sw.append(lowered)
    return no_sw


def word_occureces(word):
    """Return how often ``word`` occurs in all speeches of the global ``df``.

    Occurrences are counted with ``str.count`` on the raw text of the
    ``Speech`` column, i.e. as case-sensitive substring matches.
    """
    per_speech_counts = df['Speech'].map(lambda speech: speech.count(word))
    return per_speech_counts.sum()

def word_occureces_country_year(word, country, year):
    """Count the tokens equal to ``word`` in a single speech.

    The speech is looked up in the global ``df_un_merged`` under the index
    entry ``(year, country)``, tokenized with ``word_tokenize`` and cleaned
    with ``preprocess``; the number of cleaned tokens that equal ``word`` is
    returned.
    """
    speech_row = df_un_merged.loc[year, country]
    tokens = word_tokenize(speech_row["Speech"])
    cleaned_tokens = preprocess(tokens)
    return cleaned_tokens.count(word)

def build_dict(word):
    """Collect the per-country occurrence counts of ``word`` for every year.

    Iterates over the ``(year, country)`` entries of the global ``df`` index
    and counts ``word`` in each speech with ``word_occureces_country_year``.

    Returns
    -------
    dict
        Maps each year to a list of single-entry dicts ``{country: count}``,
        in index order. Every year from 1970 to 2020 is present, with an
        empty list when the index holds no speech for that year.
    """
    # Pre-seed all years so callers can index any year in 1970..2020.
    occ_dict = {year: [] for year in range(1970, 2021)}

    for year, country in df.index:
        count = word_occureces_country_year(word, country, year)
        # setdefault keeps a speech from an unexpected year instead of
        # failing with a KeyError.
        occ_dict.setdefault(year, []).append({country: count})
    return occ_dict

# build_dict reads the global name `df`; point it at the merged speeches.
df = df_un_merged

# One {year: [{country: count}, ...]} table per search word.
refugee_occ = build_dict("refugee")
emigrant_occ = build_dict('emigrant')
# The search word was misspelled as 'migrantion', so this table was counting
# a word that is not the one its name refers to.
migration_occ = build_dict('migration')
migrant_occ = build_dict('migrant')
immigration_occ = build_dict('immigration')
immigrant_occ = build_dict('immigrant')
    

In [None]:
# total_occ[year] is a list of {'Country': code, 'Occurence': n} records, where
# n adds up the counts of all six search words for that country and year.
total_occ = {}
total_year = {}
i = 0

word_tables = (refugee_occ, emigrant_occ, migration_occ,
               migrant_occ, immigration_occ, immigrant_occ)

for year in range(1970, 2021):
    # One row of per-country counts per search word; all tables list the
    # countries of a year in the same order, so the columns line up.
    counts_per_word = [
        [list(entry.values())[0] for entry in table[year]]
        for table in word_tables
    ]
    summed_counts = np.array(counts_per_word).sum(axis=0)

    year_records = []
    for i in range(len(summed_counts)):
        country_code = list(refugee_occ[year][i].keys())[0]
        year_records.append({'Country': country_code, 'Occurence': summed_counts[i]})
    total_occ[year] = year_records


In [None]:

def _yearly_totals(occ_by_year, years):
    """Return, for each year in ``years``, the sum of all counts in ``occ_by_year[year]``.

    ``occ_by_year`` maps a year to a list of ``{country: count}`` dicts.
    """
    return [
        sum(value for entry in occ_by_year[year] for value in entry.values())
        for year in years
    ]


# Years on the x-axis of the time-series plots.
y = [year for year in range(1970, 2021)]

# Yearly totals over all countries, one list per search word.
x1 = _yearly_totals(refugee_occ, y)
x2 = _yearly_totals(emigrant_occ, y)
x3 = _yearly_totals(migration_occ, y)
x4 = _yearly_totals(migrant_occ, y)
x5 = _yearly_totals(immigration_occ, y)
x6 = _yearly_totals(immigrant_occ, y)

# Yearly total over all six search words.
x_tot = [sum(per_word) for per_word in zip(x1, x2, x3, x4, x5, x6)]



In [None]:
# Imported here because this cell comes before the cell that imports
# matplotlib; without it `plt` is undefined when the notebook is run top to
# bottom on a fresh kernel.
import matplotlib.pyplot as plt

# Yearly occurrences of "refugee" compared with the total over all search words.
fig, ax = plt.subplots()
ax.plot(y, x1, label="refugee")
ax.plot(y, x_tot, label='total')

plt.legend()
ax.set(xlabel='Year', ylabel='Occurence',
       title='Occurence of the words over time (1970-2020)')
ax.grid()
# two spikes in 1979-1982 and 2015-2018 with high peaks in 1982, and 2015-2016
plt.show()

In [None]:
import matplotlib.pyplot as plt
# One line per search word, all on the same axes.
fig, ax = plt.subplots()
labelled_series = (
    ("refugee", x1),
    ('emigrant', x2),
    ('migration', x3),
    ('migrant', x4),
    ('immigration', x5),
    ('immigrant', x6),
)
for series_label, yearly_counts in labelled_series:
    ax.plot(y, yearly_counts, label=series_label)
plt.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Occurence')
ax.set_title('Occurence of the words over time (1970-2020)')
ax.grid()

plt.show()
# two spikes in 1979-1982 and 2015-2018 with high peaks in 1982, and 2015-2016

In [None]:
# Bar chart of the 20 countries with the highest combined count in `year`.
year = 2018
# `year` is an int, so it must be converted before concatenating it to the
# title string.
title = 'Occurence of words related to Refugees by Country in ' + str(year)
sorted_total = sorted(total_occ[year], key=lambda item: item['Occurence'], reverse=True)[:20]
labels = [record['Country'] for record in sorted_total]
values = [record['Occurence'] for record in sorted_total]

fig, ax = plt.subplots()
ax.bar(labels, values)
ax.set(ylabel='Occurence',
       title=title)
fig.set_figwidth(10)
plt.show()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Negative-sentiment score of the speeches of the 20 countries with the
# highest combined count in `year`.
year = 2018
sia = SentimentIntensityAnalyzer()

sorted_total = sorted(total_occ[year], key=lambda item: item['Occurence'], reverse=True)[:20]
labels = [record['Country'] for record in sorted_total]
# 'neg' is the negative component of the scores returned by polarity_scores.
values = [sia.polarity_scores(df_un_merged.loc(axis=0)[year, country]["Speech"])['neg'] for country in labels]

fig, ax = plt.subplots()
ax.bar(labels, values)
# The bars show sentiment scores, not word counts, so the labels copied from
# the occurrence charts were replaced.
ax.set(ylabel='Negative sentiment score',
       title='Negative sentiment of speeches by Country in ' + str(year))
fig.set_figwidth(10)
plt.show()


In [None]:
# Refugee population table; the first 14 lines of the file are skipped
# (presumably a descriptive header before the column row -- confirm against
# the CSV).
df_ref = pd.read_csv('population.csv', skiprows=14)
df_ref = df_ref[['Year', 'Country of origin (ISO)', 'Country of asylum (ISO)', "Refugees under UNHCR's mandate"]].set_index(["Year"])

years = range(1970, 2021)
# Sum only the refugee column (the ISO columns are not meant to be summed)
# and align the result with the plotted years by label, so a missing or
# extra year in the CSV cannot shift the curve or break the plot.
refugees_per_year = df_ref.groupby(['Year'])["Refugees under UNHCR's mandate"].sum()
x_ref = list(refugees_per_year.reindex(years))

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Year')
ax1.set_title('Occurence of words related to Refugees Vs Number of Refugees')
ax1.set_ylabel('Occurences of words related to Refugees', color=color)
ax1.plot(years, x_tot, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('number of Refugees (1e7)', color=color)  # we already handled the x-label with ax1
ax2.plot(years, x_ref, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()


In [None]:
# `fig` is whichever figure was created last by the cells run before this one.
fig.savefig(fname='Occurence of words related to Refugees Vs Number of Refugees.png')