# Imports

In [32]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from helper_metrics import count_missing_district, count_missing_district_total
import matplotlib.pyplot as plt
from helper_metrics import impute_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Load data

In [33]:
df = pd.read_csv("data/semiyearly_chosen_columns.csv").iloc[:,1:]
df

Unnamed: 0,date,district,total population,Under-Five Population,GAM,MAM,SAM,GAM Prevalence,SAM Prevalence,phase3plus_perc_x,...,Total alarms,n_conflict_total,Average of centy,Average of centx,prevalence_6lag,next_prevalence,month,increase,increase_numeric,district_encoded
0,2017-07-01,Adan Yabaal,65262.96000,13052.59200,4819.01697,3733.04131,1085.97565,0.36920,0.08320,0.18000,...,2.16667,,3.54944,46.54467,,0.35100,7,False,-0.01820,0
1,2017-07-01,Lughaye,70268.22000,14053.64400,5334.76326,4220.30929,1114.45397,0.37960,0.07930,0.36000,...,2.66667,1.00000,10.64738,43.57812,,0.16900,7,False,-0.21060,58
2,2017-07-01,Buuhoodle,71317.71000,14263.54200,4858.16241,3652.89311,1205.26930,0.34060,0.08450,0.37000,...,2.33333,2.50000,8.46016,46.66129,,0.20280,7,False,-0.13780,23
3,2017-07-01,Luuq,100476.76500,20095.35300,8673.15435,7366.95641,1306.19795,0.43160,0.06500,0.21000,...,7.83333,1.50000,3.79293,42.69760,,0.39260,7,False,-0.03900,59
4,2017-07-01,Burtinle,112734.27000,22546.85400,10200.19675,8500.16396,1700.03279,0.45240,0.07540,0.22000,...,3.66667,,7.80220,48.39912,,0.37960,7,False,-0.07280,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,2021-07-01,Jariiban,,32671.60000,10890.00000,,1430.00000,0.33332,0.04377,0.19000,...,2.16667,,7.16378,48.99860,0.34857,,7,,,50
658,2021-07-01,Caluula,,16168.60000,5560.00000,,870.00000,0.34388,0.05381,0.16000,...,3.16667,1.00000,11.66822,50.79402,0.35925,,7,,,28
659,2021-07-01,Qoryooley,,25309.00087,11420.00000,,2160.00000,0.45122,0.08535,0.08000,...,6.16667,3.50000,1.93456,44.44943,0.45939,,7,,,65
660,2021-07-01,Baki,,11187.80000,3470.00000,,640.00000,0.31016,0.05721,0.37000,...,0.66667,1.00000,10.28566,43.73210,0.22769,,7,,,7


## Evaluate different imputation methods for PRICE OF WATER

In [34]:
impute_score(df,features='Price of water', method='mean', scale='2-100')

RMSE for MEAN imputation in Price of water: 11.96 
SCALE: 2-100


In [35]:
impute_score(df,features='Price of water', method='median', scale='2-100')

RMSE for MEDIAN imputation in Price of water: 11.774 
SCALE: 2-100


In [36]:
impute_score(df,features='Price of water', method='knn', scale='2-100')

RMSE for KNN imputation in Price of water: 10.942 
SCALE: 2-100


In [37]:
impute_score(df,features='Price of water', method='mice', scale='2-100')

RMSE for MICE imputation in Price of water: 11.293 
SCALE: 2-100


## Evaluate different imputation methods for NDVI

In [38]:
impute_score(df,features='ndvi_score', method='mean', scale='0 - 0.61')

RMSE for MEAN imputation in ndvi_score: 0.118 
SCALE: 0 - 0.61


In [39]:
impute_score(df,features='ndvi_score', method='median', scale='0 - 0.61')

RMSE for MEDIAN imputation in ndvi_score: 0.118 
SCALE: 0 - 0.61


In [40]:
impute_score(df,features='ndvi_score', method='knn', scale='0 - 0.61')

RMSE for KNN imputation in ndvi_score: 0.053 
SCALE: 0 - 0.61


In [41]:
impute_score(df,features='ndvi_score', method='mice', scale='0 - 0.61')

RMSE for MICE imputation in ndvi_score: 0.05 
SCALE: 0 - 0.61


## Evaluate different imputation methods for IPC

In [42]:
impute_score(df,features='phase3plus_perc_x', method='mean', scale='0 - 0.58')

RMSE for MEAN imputation in phase3plus_perc_x: 0.07 
SCALE: 0 - 0.58


In [43]:
impute_score(df,features='phase3plus_perc_x', method='median', scale='0 - 0.58')

RMSE for MEDIAN imputation in phase3plus_perc_x: 0.071 
SCALE: 0 - 0.58


In [44]:
impute_score(df,features='phase3plus_perc_x', method='knn', scale='0 - 0.58')

RMSE for KNN imputation in phase3plus_perc_x: 0.054 
SCALE: 0 - 0.58


In [45]:
impute_score(df,features='phase3plus_perc_x', method='mice', scale='0 - 0.58')

RMSE for MICE imputation in phase3plus_perc_x: 0.058 
SCALE: 0 - 0.58


## Evaluate imputation for CONFLICTS

In [46]:
impute_score(df,features='n_conflict_total', method='mean', scale='1-8')

RMSE for MEAN imputation in n_conflict_total: 4.838 
SCALE: 1-8


In [47]:
impute_score(df,features='n_conflict_total', method='median', scale='1-8')

RMSE for MEDIAN imputation in n_conflict_total: 5.015 
SCALE: 1-8


In [48]:
impute_score(df,features='n_conflict_total', method='knn', scale='1-8')

RMSE for KNN imputation in n_conflict_total: 3.592 
SCALE: 1-8


In [49]:
impute_score(df,features='n_conflict_total', method='mice', scale='1-8')

RMSE for MICE imputation in n_conflict_total: 4.293 
SCALE: 1-8


## Evaluate SPLINE interpolation for CONFLICTS

In [50]:
# create list to store district numbers which have no missing values
nonempty_conflict_districts = []
for i in range(74):
    # retrieve name
    name = df.district[i]
    # create the data
    data = df[df.district==name]['n_conflict_total']
    # check if there are any missing values and append to the list if none
    if data.isna().sum().sum()==0:
        nonempty_conflict_districts.append(i)

In [51]:
rmse_scores = []
for i in nonempty_conflict_districts:
    name = df.district[i]
    df_test = df[df.district==name][['n_conflict_total']]

    # Scale the dataframe
    scaler = MinMaxScaler()
    df_test_scaled = pd.DataFrame(scaler.fit_transform(df_test), columns = df_test.columns)

    # Setting Feature column
    features = ['n_conflict_total']

    # Set seed for reproducibility
    np.random.seed(18)

    #  Inserting NaN values into Experiment Group
    for col in df_test_scaled[features]:
        # 20% of the data will be removed (frac = 0.2)
        # Rows may be selected more that once (replace = true) - only useful if you have more than one column in features
        df_test_scaled.loc[df_test_scaled.sample(frac=0.2, replace=True).index, col] = np.nan


    # Creating a list of indices
    nan_cols = df_test_scaled[features]
    nan_cols = nan_cols[nan_cols.isna().any(axis = 1)]
    null_idx = list(nan_cols.index)


    # Creating Answer key to compare future results against
    answer_key = df_test.iloc[null_idx]


    # Interpolate and fill any missing values with a reasonable estimate
    df_imputed = df_test_scaled.interpolate("spline", order=3).bfill()

    # Get the imputed values
    test = df_imputed.iloc[null_idx]

    # Resetting indexes of test and answer_key for iteration
    test = test.reset_index()
    test.drop(['index'], axis=1, inplace=True)
    answer_key = answer_key.reset_index()
    answer_key.drop(['index'], axis=1, inplace=True)

    # Calculate results
    results = pd.DataFrame((round((answer_key - test), 3)))

    # calculate RMSE
    squared_terms = []
    for col in results[features]:
        for i in range(len(results)):
            if results[col][i] != 0.00 or results[col][i] != -0.00:
                error = results[col][i]
                squared_error = error ** 2
                squared_terms.append(squared_error)

    num_nan = df_test_scaled.isna().sum().sum()
    sum_sqr_err = sum(squared_terms)
    mse = sum_sqr_err / num_nan
    rmse = np.round(np.sqrt(mse),3)
    rmse_scores.append(rmse)
    #return pd.DataFrame({"RMSE": rmse, "SCALE": scale},index=[0])
    print(f"RMSE for SPLINE imputation in {name} is: {rmse}")

RMSE for SPLINE imputation in Buuhoodle is: 1.048
RMSE for SPLINE imputation in Luuq is: 1.323
RMSE for SPLINE imputation in Marka is: 15.814
RMSE for SPLINE imputation in Bulo Burto is: 3.272
RMSE for SPLINE imputation in Qansax Dheere is: 1.283
RMSE for SPLINE imputation in Bossaso is: 4.401
RMSE for SPLINE imputation in Qoryooley is: 3.547
RMSE for SPLINE imputation in Belet Xaawo is: 1.323
RMSE for SPLINE imputation in Belet Weyne is: 6.683
RMSE for SPLINE imputation in Baraawe is: 2.908
RMSE for SPLINE imputation in Baraawe is: 2.908
RMSE for SPLINE imputation in Tayeeglow is: 1.658
RMSE for SPLINE imputation in Waajid is: 1.608
RMSE for SPLINE imputation in Banadir is: 57.706
RMSE for SPLINE imputation in Buur Hakaba is: 2.823
RMSE for SPLINE imputation in Wanla Weyn is: 2.957
RMSE for SPLINE imputation in Laas Caanood is: 6.23
RMSE for SPLINE imputation in Doolow is: 0.776
RMSE for SPLINE imputation in Garbahaarey is: 2.169
RMSE for SPLINE imputation in Diinsoor is: 1.811
RMSE f

In [52]:
print(f"Average RMSE for spline interpolation for conflicts: {round(np.average(rmse_scores),3)}")

Average RMSE for spline interpolation for conflicts: 5.173


# Test district-wise KNN imputation

In [56]:
rmse_scores_knn = []
for i in nonempty_conflict_districts:
    name = df.district[i]
    df_test = df[df.district==name][['n_conflict_total']]

    # Scale the dataframe
    scaler = MinMaxScaler()
    df_test_scaled = pd.DataFrame(scaler.fit_transform(df_test), columns = df_test.columns)

    # Setting Feature column
    features = ['n_conflict_total']

    # Set seed for reproducibility
    np.random.seed(18)

    #  Inserting NaN values into Experiment Group
    for col in df_test_scaled[features]:
        # 20% of the data will be removed (frac = 0.2)
        # Rows may be selected more that once (replace = true) - only useful if you have more than one column in features
        df_test_scaled.loc[df_test_scaled.sample(frac=0.2, replace=True).index, col] = np.nan


    # Creating a list of indices
    nan_cols = df_test_scaled[features]
    nan_cols = nan_cols[nan_cols.isna().any(axis = 1)]
    null_idx = list(nan_cols.index)


    # Creating Answer key to compare future results against
    answer_key = df_test.iloc[null_idx]


    # Interpolate and fill any missing values with a reasonable estimate
    # Impute
    imputer = KNNImputer(n_neighbors=5)
    df_test_imputed = pd.DataFrame(imputer.fit_transform(df_test_scaled), columns=df_test_scaled.columns)

    # Invert scaling
    inverse_df_test_imputed = pd.DataFrame(scaler.inverse_transform(df_test_imputed), columns=df_test_imputed.columns)

    # Subset data to match that of our answer key
    test = inverse_df_test_imputed.iloc[null_idx]


    # Resetting indexes of test and answer_key for iteration
    test = test.reset_index()
    test.drop(['index'], axis=1, inplace=True)
    answer_key = answer_key.reset_index()
    answer_key.drop(['index'], axis=1, inplace=True)

    # Calculate results
    results = pd.DataFrame((round((answer_key - test), 3)))

    # calculate RMSE
    squared_terms = []
    for col in results[features]:
        for i in range(len(results)):
            if results[col][i] != 0.00 or results[col][i] != -0.00:
                error = results[col][i]
                squared_error = error ** 2
                squared_terms.append(squared_error)

    num_nan = df_test_scaled.isna().sum().sum()
    sum_sqr_err = sum(squared_terms)
    mse = sum_sqr_err / num_nan
    rmse = np.round(np.sqrt(mse),3)
    rmse_scores_knn.append(rmse)
    #return pd.DataFrame({"RMSE": rmse, "SCALE": scale},index=[0])
    print(f"RMSE for local KNN imputation in {name} is: {rmse}")

RMSE for local KNN imputation in Buuhoodle is: 0.392
RMSE for local KNN imputation in Luuq is: 0.34
RMSE for local KNN imputation in Marka is: 5.102
RMSE for local KNN imputation in Bulo Burto is: 0.501
RMSE for local KNN imputation in Qansax Dheere is: 0.482
RMSE for local KNN imputation in Bossaso is: 4.237
RMSE for local KNN imputation in Qoryooley is: 1.199
RMSE for local KNN imputation in Belet Xaawo is: 0.681
RMSE for local KNN imputation in Belet Weyne is: 1.666
RMSE for local KNN imputation in Baraawe is: 1.233
RMSE for local KNN imputation in Baraawe is: 1.233
RMSE for local KNN imputation in Tayeeglow is: 0.251
RMSE for local KNN imputation in Waajid is: 0.476
RMSE for local KNN imputation in Banadir is: 12.205
RMSE for local KNN imputation in Buur Hakaba is: 0.885
RMSE for local KNN imputation in Wanla Weyn is: 0.673
RMSE for local KNN imputation in Laas Caanood is: 4.022
RMSE for local KNN imputation in Doolow is: 0.404
RMSE for local KNN imputation in Garbahaarey is: 0.93


In [57]:
print(f"Average RMSE for district-wise KNN imputation for conflicts: {round(np.average(rmse_scores_knn),3)}")

Average RMSE for district-wise KNN imputation for conflicts: 1.729
