In [1]:
import pandas as pd
from functools import reduce

# Let's explore the dataset 

## Number of refugees 

In [168]:
num_ref = pd.read_csv('Data/asylum_seekers/population.csv', skiprows=14)
#we are only interested in number of refugees
num_ref = num_ref[num_ref.columns[[0,1,2,5]]]
num_ref.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Refugees under UNHCR's mandate
0,1996,Afghanistan,AFG,2674234
1,1996,Albania,ALB,5784
2,1996,Algeria,DZA,2245
3,1996,Angola,AGO,249686
4,1996,Egypt,EGY,1198


## Total populations numbers 

In [177]:
pop = pd.read_csv('Data/Total_Population/Population.csv', skiprows=4, encoding = "ISO-8859-1")
#exclude years until 1996
rem_cols = [c for c in pop.columns if c[:2] == "19"][:-4]
pop.drop(columns = rem_cols, inplace = True)
#Drop indicator code and indicator name
rem_cols = [c for c in pop.columns if c[:3] == "Ind"]
pop.drop(columns = rem_cols, inplace = True)
#last two columns are NaN
pop.drop(columns = pop.columns[-2:], inplace = True)
#now convert columns into rows
pop = pop.melt(id_vars=["Country Name", "Country Code"], 
        var_name="Year", 
        value_name="Population")
pop = pop.sort_values(by=['Country Name', 'Year'])# we can sort by country or by year
pop.head()

Unnamed: 0,Country Name,Country Code,Year,Population
1,Afghanistan,AFG,1996,18853437.0
265,Afghanistan,AFG,1997,19357126.0
529,Afghanistan,AFG,1998,19737765.0
793,Afghanistan,AFG,1999,20170844.0
1057,Afghanistan,AFG,2000,20779953.0


## Disasters 

In [233]:
#Here we will use only the number of deaths summed up over one year, Nan values are replaced with zero
disasters = pd.read_excel('Data/Disaster/disasters.xlsx', engine='openpyxl', skiprows = 6)
disasters['Country Code'] = [code[-3:] for code in disasters['Dis No']]
#move country code to the first position
cols = disasters.columns.tolist()
cols.insert(1, cols.pop(-1))# move it to the right position
disasters = disasters[cols]
#We sum all the disasters up for one year
#to get the best representation we will use total deaths and total affected since deaths are not included in that
tot_aff = disasters.groupby(['Country Code','Year'])['Total Affected'].sum().reset_index()
tot_deaths = disasters.groupby(['Country Code','Year'])['Total Deaths'].sum().reset_index()
disasters = pd.concat([tot_aff, tot_deaths], axis = 1)
#remove columns that occer twice
disasters = tot_aff.join(tot_deaths['Total Deaths'])
disasters.head()

Unnamed: 0,Country Code,Year,Total Affected,Total Deaths
0,AFG,1996,13230.0,130.0
1,AFG,1997,20830.0,229.0
2,AFG,1998,165836.0,7353.0
3,AFG,1999,113162.0,205.0
4,AFG,2000,2582268.0,634.0


## Human Development Index 

In [310]:
hdi = pd.read_csv('Data/HDI/HDI.csv', skiprows=5,encoding = "ISO-8859-1")
#Remove all unnamed columns
cols = [c for c in hdi.columns if c[:7] != 'Unnamed']
hdi = hdi[cols]
hdi.head()

Unnamed: 0,HDI Rank,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,169,Afghanistan,0.302,0.307,0.316,0.312,0.307,0.331,0.335,0.339,...,0.472,0.477,0.489,0.496,0.5,0.5,0.502,0.506,0.509,0.511
1,69,Albania,0.650,0.631,0.615,0.618,0.624,0.637,0.646,0.645,...,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.79,0.792,0.795
2,91,Algeria,0.572,0.576,0.582,0.586,0.590,0.595,0.602,0.611,...,0.721,0.728,0.728,0.729,0.736,0.74,0.743,0.745,0.746,0.748
3,36,Andorra,..,..,..,..,..,..,..,..,...,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863,0.867,0.868
4,148,Angola,..,..,..,..,..,..,..,..,...,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582,0.582,0.581


As we can see for some countries the hdi index is missing for earlier years, so either exclude this as training samples or
replace it with mean?? which is done below

In [311]:
year_cols = hdi.columns[2:]
hdi[year_cols] = hdi[year_cols].apply(pd.to_numeric, errors = 'coerce')
hdi = hdi.T.fillna(hdi.mean(axis=1)).T
#now convert columns into rows
hdi = hdi.melt(id_vars=["HDI Rank", "Country"], 
        var_name="Year", 
        value_name="HDI")
hdi = hdi.sort_values(by=['Country', 'Year'])
hdi.head()

Unnamed: 0,HDI Rank,Country,Year,HDI
0,169,Afghanistan,1990,0.302
207,169,Afghanistan,1991,0.307
414,169,Afghanistan,1992,0.316
621,169,Afghanistan,1993,0.312
828,169,Afghanistan,1994,0.307


In [319]:
hdi['Country'].unique()[:5] #some countries have white spaces in the name!!!

array([' Afghanistan', ' Albania', ' Algeria', ' Andorra', ' Angola'],
      dtype=object)

In [323]:
hdi['Country'] = hdi['Country'].str.replace(' ', '')

In [325]:
hdi['Country'].unique()[:5] #thats better

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola'],
      dtype=object)

We have also to add ISO 3 letter country code to concatenate the dataframes

## Precipitation (maybe exclude that) 

In [328]:
precipitation = pd.read_csv('Data/Precipitation/Precipitation.csv', skiprows=4)
#precipitation.head() #almost only nan values, probably better to exclude or find new dataset

## Temperature 

In [331]:
temp = pd.read_csv('Data/Temperature/Temp_noflags.csv',encoding = "ISO-8859-1")[:8398]#last entry that contains country info
#temperature change not absolute values
#exclude years before 1996
rem_cols = [c for c in temp.columns if c[:3] == "Y19"][:-4]
temp.drop(columns = rem_cols, inplace = True)
#Remove the Y in front of years
columns_old = temp.columns[7:]
columns_new = [c[1:] for c in temp.columns[7:]]
temp.rename(columns = dict(zip(columns_old, columns_new)),
          inplace=True, errors='raise')
#temp
temp['Months'].unique()#here we will have to sum over the months/seasons to get annualy data

array(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December',
       'Dec\x96Jan\x96Feb', 'Mar\x96Apr\x96May', 'Jun\x96Jul\x96Aug',
       'Sep\x96Oct\x96Nov', 'Meteorological year'], dtype=object)

## Governance Indices 

In [330]:
wgi = pd.read_csv('Data/WGI/WGIData.csv')
#last column is unnamed and empty
wgi.drop(columns = wgi.columns[-1:], inplace = True)
#years 1997 and 2001 are missing, lets average over the following and preceding year
wgi['1997'] = wgi[['1996', '1998']].mean(axis = 1)
cols = wgi.columns.tolist()
cols.insert(5, cols.pop(-1))# move it to the right position
wgi = wgi[cols]
wgi['2001'] = wgi[['2000', '2002']].mean(axis = 1)
cols = wgi.columns.tolist()
cols.insert(8, cols.pop(-1))# move it to the right position
wgi = wgi[cols]
wgi.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1996,1997,1998,2000,2001,2002,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,AFG,Control of Corruption: Estimate,CC.EST,-1.291705,-1.236276,-1.180848,-1.29538,-1.279373,-1.263366,...,-1.635723,-1.579179,-1.419888,-1.436761,-1.354784,-1.341994,-1.526352,-1.517361,-1.496834,-1.401076
1,Afghanistan,AFG,Control of Corruption: Number of Sources,CC.NO.SRC,2.0,2.0,2.0,2.0,2.0,2.0,...,9.0,9.0,10.0,11.0,11.0,11.0,10.0,10.0,10.0,10.0
2,Afghanistan,AFG,Control of Corruption: Percentile Rank,CC.PER.RNK,4.301075,7.047445,9.793815,5.076142,5.063324,5.050505,...,0.952381,0.947867,2.369668,1.895735,5.288462,6.25,3.365385,3.846154,4.326923,6.730769
3,Afghanistan,AFG,"Control of Corruption: Percentile Rank, Lower ...",CC.PER.RNK.LOWER,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.480769,1.442308,0.0,0.0,0.0,1.923077
4,Afghanistan,AFG,"Control of Corruption: Percentile Rank, Upper ...",CC.PER.RNK.UPPER,27.41936,29.43133,31.4433,29.44162,30.6299,31.81818,...,3.809524,5.687204,11.84834,9.952606,12.5,12.5,9.615385,9.615385,9.615385,12.01923


This has also to be formatted like the other dataframes, Indicator Name should become an index and the years should be all in one column