In [472]:
import pandas as pd
import seaborn as sns
import dtale
from scipy import stats

In [473]:
Input0 = pd.read_csv('Data/Input.csv')
Input0.head(n=2)

Unnamed: 0,A3,Year,Control of Corruption: Estimate,Control of Corruption: Standard Error,Government Effectiveness: Estimate,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Estimate,Political Stability and Absence of Violence/Terrorism: Standard Error,Regulatory Quality: Estimate,Regulatory Quality: Standard Error,Rule of Law: Estimate,Rule of Law: Standard Error,Voice and Accountability: Estimate,Voice and Accountability: Standard Error,Temperature change,HDI,Population,Refugees
0,ABW,1996,,,,,,,,,,,,,0.875,,,
1,ABW,1997,,,,,,,,,,,,,0.551,,,


In [474]:
Input0.isnull().sum()

A3                                                                          0
Year                                                                        0
Control of Corruption: Estimate                                          1022
Control of Corruption: Standard Error                                    1022
Government Effectiveness: Estimate                                       1041
Government Effectiveness: Standard Error                                 1041
Political Stability and Absence of Violence/Terrorism: Estimate          1043
Political Stability and Absence of Violence/Terrorism: Standard Error    1043
Regulatory Quality: Estimate                                             1041
Regulatory Quality: Standard Error                                       1041
Rule of Law: Estimate                                                     949
Rule of Law: Standard Error                                               949
Voice and Accountability: Estimate                              

# Separate test set

In [475]:
Test_country_list = ['IRQ', 'SYR', 'LBY', 'LBN','JOR', 'SAU', 'TUN', 'THA', 'UKR']
Test_mask = Input0['A3'].isin(Test_country_list)
Test = Input0[Test_mask]
Train_val = Input0[~Test_mask]
#we are ony interested in the last period 2016-2020
Test = Test[(Test['Year']>2015) & (Test['Year']<2021)]
Test.head(2)

Unnamed: 0,A3,Year,Control of Corruption: Estimate,Control of Corruption: Standard Error,Government Effectiveness: Estimate,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Estimate,Political Stability and Absence of Violence/Terrorism: Standard Error,Regulatory Quality: Estimate,Regulatory Quality: Standard Error,Rule of Law: Estimate,Rule of Law: Standard Error,Voice and Accountability: Estimate,Voice and Accountability: Standard Error,Temperature change,HDI,Population,Refugees
2204,IRQ,2016,-1.386965,0.161193,-1.26753,0.208636,-2.313588,0.216167,-1.125068,0.209853,-1.625595,0.183479,-1.015599,0.131402,1.621,0.656,36610632.0,316056.0
2205,IRQ,2017,-1.372151,0.145303,-1.263284,0.212206,-2.306718,0.224583,-1.203296,0.202883,-1.637448,0.172265,-1.046867,0.132079,1.574,0.667,37552781.0,362536.0


# Remove/Replace nans, separate validation set

In [476]:
#Let us first create a lsit for the 5 time periods:
Periods = [list(range(1996,2001)), list(range(2001,2006)),
         list(range(2006,2011)), list(range(2011,2016)),
         list(range(2016,2021))]
Predict_year = [2000,2005,2010,2015,2020]

## Remove whole period for a country if target year is missing 

In [477]:
#lets remove the whole period for a country, if refugee number is not given for the end of the period(prediction year)
#first we create a list and separate the periods
Input_periods0 = []
for i in range(5):
    Input_periods0.append(Train_val[Train_val['Year'].isin(Periods[i])])

In [478]:
Input_periods1 = []
for i in range(5):
    period = Input_periods0[i]
    predict_mask = period['Year']== Predict_year[i]
    predict_years = period[predict_mask]
    missing_pred_mask = predict_years['Refugees'].isnull()
    missing_countries_list = predict_years[missing_pred_mask]['A3']
    final_mask = period['A3'].isin(missing_countries_list)
    period_clean = period[~final_mask]
    Input_periods1.append(period_clean)

## Remove nans if nan count above threshold

There are 16 features and the period has a length of 4 (without target)\
-this means half the entries is 32, let's set this as a threshold\
    if the overall nan count for a country exceeds 32 let's exclude this country\
-else we use the mean over the present entries, if one feature has no entries at all, lets replace it by the overall mean
of this period

In [479]:
Input_periods2 = []
for i in range(5):
    # except ref num no other features are needed in the target year so before counting nans, lets exclude it
    target_mask = Input_periods1[i]['Year']==Predict_year[i]
    Input_no_target = Input_periods1[i][~target_mask]
    nan_count_df = Input_no_target.drop('A3', 1).isna().groupby(Input_no_target.A3, sort=False).sum().reset_index()
    nan_count_total = nan_count_df.sum(axis = 1)
    
    nan_count_mask = nan_count_total<33#total nan countr threshold
    include_countries = nan_count_df[nan_count_mask]#we include only this countries, so now let's apply the mask
    include_countries_mask = Input_periods1[i].A3.isin(include_countries['A3'])
    Input_periods2.append(Input_periods1[i][include_countries_mask])

## Remove country if a column has only nans for a country in a period

In [480]:
Input_periods3 = []
for i in range(5):
    #again lets first exclude targets
    target_mask = Input_periods2[i]['Year']==Predict_year[i]
    Input_no_target = Input_periods2[i][~target_mask]
    nan_count_df = Input_no_target.drop('A3', 1).isna().groupby(Input_no_target.A3, sort=False).sum().reset_index()
    
    nan_count_mask = nan_count_df.iloc[:,2:]==4#mask where all 4 values are nan
    delete_rows_mask = nan_count_mask.any(axis=1)
    delete_rows = nan_count_df[delete_rows_mask]
    exclude_countries = delete_rows.A3.unique()
    exclude_countries_mask = Input_periods2[i].A3.isin(exclude_countries)
    Input_periods3.append(Input_periods2[i][~exclude_countries_mask])

## Here we should separate the validation set

Since the validation set should resemble real world data, we separate it before replacing any nan values

In [None]:
for i in range(5):
    #again lets first exclude targets
    target_mask = Input_periods3[i]['Year']==Predict_year[i]
    Input_no_target = Input_periods3[i][~target_mask]
    Input_target = Input_periods3[i][target_mask]
    #Lets separate countries that have nan entries
    

## Replace all remaining nan values with the mean over the country (within one period)

In [481]:
Input_periods4 = []
for i in range(5):
    #again lets first exclude targets
    target_mask = Input_periods3[i]['Year']==Predict_year[i]
    Input_no_target = Input_periods3[i][~target_mask]
    Input_target = Input_periods3[i][target_mask]
    #select feature columns
    feat_columns = Input_no_target.columns[2:]
    #Replace missing  values with mean
    Input_no_target[feat_columns] = Input_no_target[feat_columns].fillna(
        Input_no_target.groupby('A3')[feat_columns].transform('mean'))
    Input = pd.concat([Input_no_target, Input_target])
    Input_periods4.append(Input)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

# Data augmentation

We place the uncertainties in an extra dataframe, in case we can pass them to the loss function,
when no uncertainty is given we set it to 0. 

In [168]:
std_columns = [c for c in Input0.columns if c[-5:]=='Error']
columns = ['A3', 'Year']+std_columns
Sig0 = Input0[columns]
Sig0.head(n=3)

Unnamed: 0,A3,Year,Control of Corruption: Standard Error,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Standard Error,Regulatory Quality: Standard Error,Rule of Law: Standard Error,Voice and Accountability: Standard Error
0,AFG,1996,0.340507,0.187618,0.475315,0.38636,0.350509,0.261457
1,AFG,1997,0.335343,0.248948,0.455319,0.418879,0.34104,0.258773
2,AFG,1998,0.330179,0.310279,0.435324,0.451397,0.331571,0.25609


In [169]:
#Now lets set the sigmas of last 5 columns to zero
columns = Input0.columns[-5:]
column_names = [c+' std' for c in columns]
c = dict(zip(column_names, [0,0,0,0,0]))
Sig = Sig0.assign(**c)
Sig.head(n=2)

Unnamed: 0,A3,Year,Control of Corruption: Standard Error,Government Effectiveness: Standard Error,Political Stability and Absence of Violence/Terrorism: Standard Error,Regulatory Quality: Standard Error,Rule of Law: Standard Error,Voice and Accountability: Standard Error,Total Affected std,Total Deaths std,Population std,Refugees std,Country std
0,AFG,1996,0.340507,0.187618,0.475315,0.38636,0.350509,0.261457,0,0,0,0,0
1,AFG,1997,0.335343,0.248948,0.455319,0.418879,0.34104,0.258773,0,0,0,0,0


In [170]:
#Drop the uncertainty columns from Input
Input = Input0.drop(columns = Input0.columns[3:14:2])
Input.head(2)

Unnamed: 0,A3,Year,Control of Corruption: Estimate,Government Effectiveness: Estimate,Political Stability and Absence of Violence/Terrorism: Estimate,Regulatory Quality: Estimate,Rule of Law: Estimate,Voice and Accountability: Estimate,Temperature change,HDI,Total Affected,Total Deaths,Population,Refugees,Country
0,AFG,1996,-1.291705,-2.175167,-2.414042,-2.09033,-1.788075,-1.90854,0.000824,0.335,13230.0,130.0,18853437.0,2674234,Afghanistan
1,AFG,1997,-1.236276,-2.155555,-2.418519,-2.088793,-1.762608,-1.97392,0.441353,0.339,20830.0,229.0,19357126.0,2676675,Afghanistan


# Lets reshape input, target and uncertainties 
    [n_samples, timesteps, features]
    samples is always one country for a 5 year period

# Data scaling 