In [207]:
import pandas as pd
import dtale
from pandas import DataFrame

In [208]:
def dataframe_difference(df1: DataFrame, df2: DataFrame, which=None):
    """Find rows which are different between two DataFrames."""
    comparison_df = df1.merge(
        df2,
        indicator=True,
        how='outer'
    )
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    diff_df.to_csv('data/diff.csv')
    return diff_df

# Let's explore the dataset 

## Number of refugees 

In [531]:
num_ref0 = pd.read_csv('Data/asylum_seekers/population.csv', skiprows=14)
#we are only interested in number of refugees
num_ref0 = num_ref0[num_ref0.columns[[0,1,2,5]]]
num_ref0 = num_ref0.rename(
    columns={num_ref0.columns[1]: 'Country',num_ref0.columns[2]: 'ISO', num_ref0.columns[3]:'Refugees'})
num_ref0.head()

Unnamed: 0,Year,Country,ISO,Refugees
0,1996,Afghanistan,AFG,2674234
1,1996,Albania,ALB,5784
2,1996,Algeria,DZA,2245
3,1996,Angola,AGO,249686
4,1996,Egypt,EGY,1198


In [533]:
null_mask = num_ref0['ISO'].isnull()
num_ref0[null_mask]#These are high numbers for unknown countries, but we have to drop them
num_ref = num_ref0[~null_mask]
num_ref = num_ref[num_ref.columns[[0,2,3]]]#
num_ref.head()

Unnamed: 0,Year,ISO,Refugees
0,1996,AFG,2674234
1,1996,ALB,5784
2,1996,DZA,2245
3,1996,AGO,249686
4,1996,EGY,1198


## Total populations numbers 

In [534]:
pop0 = pd.read_csv('Data/Total_Population/Population.csv', skiprows=4, encoding = "ISO-8859-1")
#exclude years until 1996
rem_cols = [c for c in pop0.columns if c[:2] == "19"][:-4]
pop0.drop(columns = rem_cols, inplace = True)
#Drop indicator code and indicator name
rem_cols = [c for c in pop0.columns if c[:3] == "Ind"]
pop0.drop(columns = rem_cols, inplace = True)
#last two columns are NaN
pop0.drop(columns = pop0.columns[-2:], inplace = True)
#now convert columns into rows
pop0 = pop0.melt(id_vars=["Country Name", "Country Code"], 
        var_name="Year", 
        value_name="Population")
pop0 = pop0.sort_values(by=['Country Name', 'Year'])# we can sort by country or by year
pop0 = pop0.rename(
    columns={pop0.columns[0]: 'Country',pop0.columns[1]: 'ISO'})
pop0.head()

Unnamed: 0,Country,ISO,Year,Population
1,Afghanistan,AFG,1996,18853437.0
265,Afghanistan,AFG,1997,19357126.0
529,Afghanistan,AFG,1998,19737765.0
793,Afghanistan,AFG,1999,20170844.0
1057,Afghanistan,AFG,2000,20779953.0


In [535]:
#turn years into integers
pop0['Year'] = pop0['Year'].apply(pd.to_numeric, errors = 'coerce')

In [536]:
#a`re there country codes which are nan?
np.count_nonzero(pop0['ISO'].isnull())
#nope we can drop the Country column
pop = pop0.drop(columns='Country')
pop.head()

Unnamed: 0,ISO,Year,Population
1,AFG,1996,18853437.0
265,AFG,1997,19357126.0
529,AFG,1998,19737765.0
793,AFG,1999,20170844.0
1057,AFG,2000,20779953.0


In [537]:
pop['ISO'].nunique()

264

In [538]:
Input0 = pop.merge(num_ref, how = 'inner', on = ['ISO', 'Year']) #this dataframe contains all the input
Input0.head()

Unnamed: 0,ISO,Year,Population,Refugees
0,AFG,1996,18853437.0,2674234
1,AFG,1997,19357126.0,2676675
2,AFG,1998,19737765.0,2667118
3,AFG,1999,20170844.0,2601690
4,AFG,2000,20779953.0,3587327


## Disasters 

In [540]:
#Here we will use only the number of deaths summed up over one year, Nan values are replaced with zero
disasters = pd.read_excel('Data/Disaster/disasters.xlsx', engine='openpyxl', skiprows = 6)
disasters['Country Code'] = [code[-3:] for code in disasters['Dis No']]
#move country code to the first position
cols = disasters.columns.tolist()
cols.insert(1, cols.pop(-1))# move it to the right position
disasters = disasters[cols]
#We sum all the disasters up for one year
#to get the best representation we will use total deaths and total affected since deaths are not included in that
tot_aff = disasters.groupby(['Country Code','Year'])['Total Affected'].sum().reset_index()
tot_deaths = disasters.groupby(['Country Code','Year'])['Total Deaths'].sum().reset_index()
disasters = pd.concat([tot_aff, tot_deaths], axis = 1)
#remove columns that occer twice
disasters = tot_aff.join(tot_deaths['Total Deaths'])
disasters = disasters.rename(
    columns={disasters.columns[0]: 'ISO'})
disasters.head()

Unnamed: 0,ISO,Year,Total Affected,Total Deaths
0,AFG,1996,13230.0,130.0
1,AFG,1997,20830.0,229.0
2,AFG,1998,165836.0,7353.0
3,AFG,1999,113162.0,205.0
4,AFG,2000,2582268.0,634.0


In [None]:
disasters['ISO'].nunique()

In [None]:
Input1 = disasters.merge(Input0, how = 'inner', on = ['ISO', 'Year'])
Input1.head()

In [507]:
Input1['ISO'].nunique()

193

## Human Development Index 

In [541]:
hdi0 = pd.read_csv('Data/HDI/HDI.csv', skiprows=5,encoding = "ISO-8859-1")
#Remove all unnamed columns
cols = [c for c in hdi0.columns if (c[:7] != 'Unnamed' and c!='HDI Rank')]
hdi0 = hdi0[cols]
hdi0.head()

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,0.302,0.307,0.316,0.312,0.307,0.331,0.335,0.339,0.344,...,0.472,0.477,0.489,0.496,0.5,0.5,0.502,0.506,0.509,0.511
1,Albania,0.650,0.631,0.615,0.618,0.624,0.637,0.646,0.645,0.655,...,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.79,0.792,0.795
2,Algeria,0.572,0.576,0.582,0.586,0.590,0.595,0.602,0.611,0.621,...,0.721,0.728,0.728,0.729,0.736,0.74,0.743,0.745,0.746,0.748
3,Andorra,..,..,..,..,..,..,..,..,..,...,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863,0.867,0.868
4,Angola,..,..,..,..,..,..,..,..,..,...,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582,0.582,0.581


As we can see for some countries the hdi index is missing for earlier years, so either exclude this as training samples or
replace it with mean?? which is done below

In [542]:
year_cols = hdi0.columns[1:]
hdi0[year_cols] = hdi0[year_cols].apply(pd.to_numeric, errors = 'coerce')
hdi1 = hdi0.T.fillna(hdi0.mean(axis=1)).T
#now convert columns into rows
hdi1 = hdi1.melt(id_vars=["Country"], 
        var_name="Year", 
        value_name="HDI")
hdi1 = hdi1.sort_values(by=['Country', 'Year'])
hdi1.head()

Unnamed: 0,Country,Year,HDI
0,Afghanistan,1990,0.302
207,Afghanistan,1991,0.307
414,Afghanistan,1992,0.316
621,Afghanistan,1993,0.312
828,Afghanistan,1994,0.307


In [543]:
hdi1['Country'].unique()[:5] #some countries have white spaces in the name!!!
hdi1['Country'] = hdi1['Country'].str.replace(' ', '')#replace that

In [545]:
#merge to see which country entries do not match
mergedStuff = pd.merge(hdi1, alpha3, on=['Country'], how='inner')
mergedStuff.head()
merged_countries = mergedStuff['Country'].unique()

In [546]:
#find other countries:
countries_in_both_mask= hdi1['Country'].isin(merged_countries)
diff_countries = hdi1[~countries_in_both_mask]['Country'].unique()
#the last 17 entries are not countries!
diff_countries = diff_countries[:-17]

In [547]:
#Countries that have a different string in hdi
ISO_country = [
    'Antigua and Barbuda', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 
    'Brunei Darussalam', 'Burkina Faso', 'Cabo Verde', 'Central African Republic',  
    'Congo, Democratic Republic of the', 'Costa Rica', "Côte d'Ivoire", 
    'Dominican Republic', 'El Salvador', 'Equatorial Guinea',
    'Eswatini', 'Hong Kong', 'Iran (Islamic Republic of)', 
    "Korea (Democratic People's Republic of)", 'Korea, Republic of',
    "Lao People's Democratic Republic",  'Marshall Islands',
    'Micronesia (Federated States of)', 'Moldova, Republic of', 'New Zealand', 
    'North Macedonia', 'Palestine, State of', 'Papua New Guinea', 'Russian Federation', 
    'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'Sao Tome and Principe', 'Saudi Arabia', 'Sierra Leone', 'Solomon Islands', 
    'South Africa', 'South Sudan', 'Sri Lanka', 'Syrian Arab Republic', 
    'Tanzania, United Republic of', 'Trinidad and Tobago', 'United Arab Emirates',
    'United Kingdom of Great Britain and Northern Ireland','United States of America',
    'Venezuela (Bolivarian Republic of)', 'Viet Nam']

In [548]:
#Replace non-matching countries with entries from the alpha3 df
di = dict(zip(diff_countries,ISO_country))
hdi1['Country'] = hdi1['Country'].map(di).fillna(hdi1['Country'])
hdi1.head()

Unnamed: 0,Country,Year,HDI
0,Afghanistan,1990,0.302
207,Afghanistan,1991,0.307
414,Afghanistan,1992,0.316
621,Afghanistan,1993,0.312
828,Afghanistan,1994,0.307


In [552]:
#Lets add ISO code
#Lets add country code
hdi = hdi1.merge(alpha3, on = 'Country')#Merge by country such that now ISO is in the table
hdi = hdi.iloc[:,1:]#Deleta Country column
hdi.head()

Unnamed: 0,Year,HDI,ISO
0,1990,0.302,AFG
1,1991,0.307,AFG
2,1992,0.316,AFG
3,1993,0.312,AFG
4,1994,0.307,AFG


In [553]:
hdi['ISO'].nunique()#not that many countries

190

In [554]:
#turn years into integers
hdi['Year'] = hdi['Year'].apply(pd.to_numeric, errors = 'coerce')

In [555]:
Input2 = hdi.merge(Input1, how = 'inner', on = ['ISO', 'Year'])
Input2.head()

Unnamed: 0,Year,HDI,ISO,Total Affected,Total Deaths,Population,Refugees
0,1996,0.335,AFG,13230.0,130.0,18853437.0,2674234
1,1997,0.339,AFG,20830.0,229.0,19357126.0,2676675
2,1998,0.344,AFG,165836.0,7353.0,19737765.0,2667118
3,1999,0.348,AFG,113162.0,205.0,20170844.0,2601690
4,2000,0.35,AFG,2582268.0,634.0,20779953.0,3587327


In [556]:
Input2['ISO'].nunique()

187

## Temperature 

In [557]:
temp0 = pd.read_csv('Data/Temperature/Temp_noflags.csv',encoding = "ISO-8859-1")[:8398]#last entry that contains country info
#temperature change not absolute values
#exclude years before 1996
rem_cols = [c for c in temp0.columns if c[:3] == "Y19"][:-4]
temp0.drop(columns = rem_cols, inplace = True)
#Remove the Y in front of years
columns_old = temp0.columns[7:]
columns_new = [c[1:] for c in temp0.columns[7:]]
temp0.rename(columns = dict(zip(columns_old, columns_new)),
          inplace=True, errors='raise')
#lets remove some columns
rem_cols = [c for c in temp0.columns if (c == "Unit" or c=='Element Code' or c=='Area Code')  ]
temp0.drop(columns = rem_cols, inplace = True)
temp0.head()

Unnamed: 0,Area,Months Code,Months,Element,1996,1997,1998,1999,2000,2001,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,7001,January,Temperature change,-0.718,1.426,0.95,0.859,1.565,-0.603,...,3.601,1.179,-0.583,1.233,1.755,1.943,3.416,1.201,1.996,2.951
1,Afghanistan,7001,January,Standard Deviation,1.95,1.95,1.95,1.95,1.95,1.95,...,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95,1.95
2,Afghanistan,7002,February,Temperature change,0.87,0.043,-0.54,3.222,-0.901,0.707,...,1.212,0.321,-3.201,1.494,-3.187,2.699,2.251,-0.323,2.705,0.086
3,Afghanistan,7002,February,Standard Deviation,2.597,2.597,2.597,2.597,2.597,2.597,...,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597,2.597
4,Afghanistan,7003,March,Temperature change,-0.336,-0.005,-0.576,-0.217,-0.267,1.229,...,3.39,0.748,-0.527,2.246,-0.076,-0.497,2.296,0.834,4.418,0.234


In [558]:
temp0['Area'].nunique()

247

In [559]:
#let's take years as one column
temp1 = temp0.melt(id_vars=temp0.columns[:4], 
        var_name="Year", 
        value_name="Value")
temp1.head()

Unnamed: 0,Area,Months Code,Months,Element,Year,Value
0,Afghanistan,7001,January,Temperature change,1996,-0.718
1,Afghanistan,7001,January,Standard Deviation,1996,1.95
2,Afghanistan,7002,February,Temperature change,1996,0.87
3,Afghanistan,7002,February,Standard Deviation,1996,2.597
4,Afghanistan,7003,March,Temperature change,1996,-0.336


In [560]:
#the elment column entries should be columns
temp2 = temp1.set_index(['Area','Months Code','Months', 'Year', 'Element'])['Value'].unstack()
#move the multiindex one down
temp2 = temp2.reset_index().rename_axis(None).rename_axis(None, axis=1)
#rename area to country
temp2 = temp2.rename(columns={temp2.columns[0]: 'Country'})
#remove std column
rem_cols = [c for c in temp2.columns if c == "Standard Deviation"]
temp2.drop(columns = rem_cols, inplace = True)
temp2.head()

Unnamed: 0,Country,Months Code,Months,Year,Temperature change
0,Afghanistan,7001,January,1996,-0.718
1,Afghanistan,7001,January,1997,1.426
2,Afghanistan,7001,January,1998,0.95
3,Afghanistan,7001,January,1999,0.859
4,Afghanistan,7001,January,2000,1.565


In [561]:
temp3 = temp2.merge(alpha3, on = 'Country')

In [562]:
temp3.head()

Unnamed: 0,Country,Months Code,Months,Year,Temperature change,ISO
0,Afghanistan,7001,January,1996,-0.718,AFG
1,Afghanistan,7001,January,1997,1.426,AFG
2,Afghanistan,7001,January,1998,0.95,AFG
3,Afghanistan,7001,January,1999,0.859,AFG
4,Afghanistan,7001,January,2000,1.565,AFG


In [566]:
temp = temp3.groupby(['ISO', 'Year'])['Temperature change'].mean().reset_index()

In [568]:
temp.head()

Unnamed: 0,ISO,Year,Temperature change
0,ABW,1996,0.787118
1,ABW,1997,0.644059
2,ABW,1998,1.117824
3,ABW,1999,0.239231
4,ABW,2000,0.128294


In [570]:
temp['ISO'].nunique()

217

In [572]:
#turn years into integers
temp['Year'] = temp['Year'].apply(pd.to_numeric, errors = 'coerce')

In [573]:
Input3 = temp.merge(Input2, how = 'inner', on = ['ISO', 'Year'])
Input3.head()

Unnamed: 0,ISO,Year,Temperature change,HDI,Total Affected,Total Deaths,Population,Refugees
0,AFG,1996,0.000824,0.335,13230.0,130.0,18853437.0,2674234
1,AFG,1997,0.441353,0.339,20830.0,229.0,19357126.0,2676675
2,AFG,1998,0.818,0.344,165836.0,7353.0,19737765.0,2667118
3,AFG,1999,1.082647,0.348,113162.0,205.0,20170844.0,2601690
4,AFG,2000,0.982882,0.35,2582268.0,634.0,20779953.0,3587327


## Governance Indices 

In [583]:
wgi0 = pd.read_csv('Data/WGI/WGIData.csv')
#last column is unnamed and empty
wgi0.drop(columns = wgi0.columns[-1:], inplace = True)
#years 1997 and 2001 are missing, lets average over the following and preceding year
wgi0['1997'] = wgi0[['1996', '1998']].mean(axis = 1)
cols = wgi0.columns.tolist()
cols.insert(5, cols.pop(-1))# move it to the right position
wgi0 = wgi0[cols]
wgi0['2001'] = wgi0[['2000', '2002']].mean(axis = 1)
cols = wgi0.columns.tolist()
cols.insert(8, cols.pop(-1))# move it to the right position
wgi0 = wgi0[cols]
wgi0.drop(columns = wgi0.columns[[0,3]], inplace = True)
#wgi0 = wgi0.rename(columns={wgi0.columns[0]: 'ISO'})
wgi0.head()

Unnamed: 0,Country Code,Indicator Name,1996,1997,1998,2000,2001,2002,2003,2004,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,AFG,Control of Corruption: Estimate,-1.291705,-1.236276,-1.180848,-1.29538,-1.279373,-1.263366,-1.351042,-1.345281,...,-1.635723,-1.579179,-1.419888,-1.436761,-1.354784,-1.341994,-1.526352,-1.517361,-1.496834,-1.401076
1,AFG,Control of Corruption: Number of Sources,2.0,2.0,2.0,2.0,2.0,2.0,3.0,5.0,...,9.0,9.0,10.0,11.0,11.0,11.0,10.0,10.0,10.0,10.0
2,AFG,Control of Corruption: Percentile Rank,4.301075,7.047445,9.793815,5.076142,5.063324,5.050505,5.050505,5.853659,...,0.952381,0.947867,2.369668,1.895735,5.288462,6.25,3.365385,3.846154,4.326923,6.730769
3,AFG,"Control of Corruption: Percentile Rank, Lower ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.480769,1.442308,0.0,0.0,0.0,1.923077
4,AFG,"Control of Corruption: Percentile Rank, Upper ...",27.41936,29.43133,31.4433,29.44162,30.6299,31.81818,18.18182,14.14634,...,3.809524,5.687204,11.84834,9.952606,12.5,12.5,9.615385,9.615385,9.615385,12.01923


In [584]:
wgi0['1996'].isnull()
#there are nan values, replace with mean over years
#!this might be not the best approach, maybe just setting estimates to a constant and std very high is better
#if it can be included in training
wgi1 = wgi0.T.fillna(wgi0.mean(axis=1)).T

In [586]:
#let's take years as one column
wgi2 = wgi1.melt(id_vars=wgi1.columns[:2], 
        var_name="Year", 
        value_name="Value")
wgi2.head()

Unnamed: 0,Country Code,Indicator Name,Year,Value
0,AFG,Control of Corruption: Estimate,1996,-1.2917
1,AFG,Control of Corruption: Number of Sources,1996,2.0
2,AFG,Control of Corruption: Percentile Rank,1996,4.30107
3,AFG,"Control of Corruption: Percentile Rank, Lower ...",1996,0.0
4,AFG,"Control of Corruption: Percentile Rank, Upper ...",1996,27.4194


In [587]:
#the elment column entries should be columns
wgi3 = wgi2.set_index(['Country Code','Year','Indicator Name'])['Value'].unstack()
#move the multiindex one down
wgi3 = wgi3.reset_index().rename_axis(None).rename_axis(None, axis=1)
wgi3.head()

Unnamed: 0,Country Code,Year,Control of Corruption: Estimate,Control of Corruption: Number of Sources,Control of Corruption: Percentile Rank,"Control of Corruption: Percentile Rank, Lower Bound of 90% Confidence Interval","Control of Corruption: Percentile Rank, Upper Bound of 90% Confidence Interval",Control of Corruption: Standard Error,Government Effectiveness: Estimate,Government Effectiveness: Number of Sources,...,Rule of Law: Percentile Rank,"Rule of Law: Percentile Rank, Lower Bound of 90% Confidence Interval","Rule of Law: Percentile Rank, Upper Bound of 90% Confidence Interval",Rule of Law: Standard Error,Voice and Accountability: Estimate,Voice and Accountability: Number of Sources,Voice and Accountability: Percentile Rank,"Voice and Accountability: Percentile Rank, Lower Bound of 90% Confidence Interval","Voice and Accountability: Percentile Rank, Upper Bound of 90% Confidence Interval",Voice and Accountability: Standard Error
0,ABW,1996,1.20168,1.6875,86.0092,73.9429,92.5533,0.327237,1.16018,1.6875,...,83.9676,72.7417,92.0574,0.277894,1.18391,1.3125,87.7275,68.2217,98.9847,0.312097
1,ABW,1997,1.20168,1.6875,86.0092,73.9429,92.5533,0.327237,1.16018,1.6875,...,83.9676,72.7417,92.0574,0.277894,1.18391,1.3125,87.7275,68.2217,98.9847,0.312097
2,ABW,1998,1.20168,1.6875,86.0092,73.9429,92.5533,0.327237,1.16018,1.6875,...,83.9676,72.7417,92.0574,0.277894,1.18391,1.3125,87.7275,68.2217,98.9847,0.312097
3,ABW,2000,1.20168,1.6875,86.0092,73.9429,92.5533,0.327237,1.16018,1.6875,...,83.9676,72.7417,92.0574,0.277894,1.18391,1.3125,87.7275,68.2217,98.9847,0.312097
4,ABW,2001,1.20168,1.6875,86.0092,73.9429,92.5533,0.327237,1.16018,1.6875,...,83.9676,72.7417,92.0574,0.277894,1.18391,1.3125,87.7275,68.2217,98.9847,0.312097


In [597]:
wgi = wgi3.rename(columns={wgi.columns[0]: 'ISO', })

In [598]:
#lets drop some columns
#remove confidence intervals and number of sources
wgi.drop(columns = wgi.columns[[3,5,6, 9,11, 12,15, 17, 18,21, 23, 24,27, 29, 30,33, 35, 36]], inplace = True)

In [599]:
wgi.head()

Unnamed: 0,ISO,Year,Control of Corruption: Estimate,Control of Corruption: Percentile Rank,Control of Corruption: Standard Error,Government Effectiveness: Estimate,Government Effectiveness: Percentile Rank,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Estimate,Political Stability and Absence of Violence/Terrorism: Percentile Rank,Political Stability and Absence of Violence/Terrorism: Standard Error,Regulatory Quality: Estimate,Regulatory Quality: Percentile Rank,Regulatory Quality: Standard Error,Rule of Law: Estimate,Rule of Law: Percentile Rank,Rule of Law: Standard Error,Voice and Accountability: Estimate,Voice and Accountability: Percentile Rank,Voice and Accountability: Standard Error
0,ABW,1996,1.20168,86.0092,0.327237,1.16018,83.5523,0.35466,1.25661,93.1901,0.298642,1.12133,83.0238,0.339189,1.15985,83.9676,0.277894,1.18391,87.7275,0.312097
1,ABW,1997,1.20168,86.0092,0.327237,1.16018,83.5523,0.35466,1.25661,93.1901,0.298642,1.12133,83.0238,0.339189,1.15985,83.9676,0.277894,1.18391,87.7275,0.312097
2,ABW,1998,1.20168,86.0092,0.327237,1.16018,83.5523,0.35466,1.25661,93.1901,0.298642,1.12133,83.0238,0.339189,1.15985,83.9676,0.277894,1.18391,87.7275,0.312097
3,ABW,2000,1.20168,86.0092,0.327237,1.16018,83.5523,0.35466,1.25661,93.1901,0.298642,1.12133,83.0238,0.339189,1.15985,83.9676,0.277894,1.18391,87.7275,0.312097
4,ABW,2001,1.20168,86.0092,0.327237,1.16018,83.5523,0.35466,1.25661,93.1901,0.298642,1.12133,83.0238,0.339189,1.15985,83.9676,0.277894,1.18391,87.7275,0.312097


In [600]:
wgi['ISO'].nunique()

214

In [602]:
#turn years into integers
wgi['Year'] = wgi['Year'].apply(pd.to_numeric, errors = 'coerce')
Input = wgi.merge(Input3, how = 'inner', on = ['ISO', 'Year'])
Input.head()

Unnamed: 0,ISO,Year,Control of Corruption: Estimate,Control of Corruption: Percentile Rank,Control of Corruption: Standard Error,Government Effectiveness: Estimate,Government Effectiveness: Percentile Rank,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Estimate,Political Stability and Absence of Violence/Terrorism: Percentile Rank,...,Rule of Law: Standard Error,Voice and Accountability: Estimate,Voice and Accountability: Percentile Rank,Voice and Accountability: Standard Error,Temperature change,HDI,Total Affected,Total Deaths,Population,Refugees
0,AFG,1996,-1.2917,4.30107,0.340507,-2.17517,0.0,0.187618,-2.41404,2.12766,...,0.350509,-1.90854,1.0,0.261457,0.000824,0.335,13230.0,130.0,18853437.0,2674234
1,AFG,1997,-1.23628,7.04744,0.335343,-2.15555,0.0,0.248948,-2.41852,1.32979,...,0.34104,-1.97392,0.748756,0.258773,0.441353,0.339,20830.0,229.0,19357126.0,2676675
2,AFG,1998,-1.18085,9.79381,0.330179,-2.13594,0.0,0.310279,-2.423,0.531915,...,0.331571,-2.0393,0.497512,0.25609,0.818,0.344,165836.0,7353.0,19737765.0,2667118
3,AFG,2000,-1.29538,5.07614,0.356294,-2.23165,0.0,0.343066,-2.44019,0.5291,...,0.294017,-2.03142,0.995025,0.254043,0.982882,0.35,2582268.0,634.0,20779953.0,3587327
4,AFG,2001,-1.27937,5.06332,0.358104,-1.89981,1.02041,0.302807,-2.23779,1.0582,...,0.296712,-1.73242,5.22388,0.221748,1.383588,0.353,204695.0,507.0,21606988.0,3809763


In [605]:
Input['ISO'].nunique()

179

# Preprocess

Since country names may vary, we try to convert it to alpha3 codes and use them to concatenate the tables.
Those that will not be assigned an alpha3 code automatically we have to insert manually.