# Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from helper_metrics import count_missing_district, count_missing_district_total
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings("ignore")

# Load data

In [2]:
# Read the semi-yearly dataset, then discard the first column by position.
df = pd.read_csv("data/semiyearly_chosen_columns.csv")
df = df.iloc[:, 1:]

# Perform Imputations

In [3]:
def impute_values(df, random_state=0):
    """Fill missing values in the feature table and return a cleaned copy.

    Steps, in order:

    1. ``n_conflict_total`` is imputed separately for every district with a
       5-nearest-neighbour imputer fitted on that district's rows only.
    2. ``ndvi_score`` and ``phase3plus_perc_x`` are imputed with the same KNN
       imputer fitted on all non-object / non-category columns.
    3. ``Price of water`` is imputed with an ``IterativeImputer`` (MICE)
       fitted on the same columns.
    4. The imputed columns are renamed (``ndvi``, ``ipc``, ``price_of_water``,
       ``conflicts``) and ``MAM`` plus the two centroid columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``district``, ``increase``,
        ``Average of centx``, ``Average of centy``, ``Price of water``,
        ``n_conflict_total``, ``ndvi_score``, ``phase3plus_perc_x`` and
        ``MAM``. The frame is not modified; all work happens on a copy.
    random_state : int, RandomState instance or None, default=0
        Seed forwarded to ``IterativeImputer``. With ``n_nearest_features``
        set, the imputer samples features randomly, so a fixed seed is
        needed for reproducible output. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so re-running the calling cell always starts from the
    # same, unmodified input.
    df = df.copy()

    knn_imputer = KNNImputer(n_neighbors=5)

    # Columns used for the district-wise conflict imputation.
    conflict_features = df.drop(
        ["date", "increase", "Average of centx", "Price of water", "Average of centy"],
        axis=1,
    )

    # Iterate over the actual district names instead of looking up the first
    # N row labels, which silently skips districts and fails on a
    # non-default index.
    for name in df["district"].dropna().unique():
        district_data = conflict_features[conflict_features["district"] == name]
        district_data = district_data.drop("district", axis=1)

        # KNNImputer drops columns that are entirely NaN, which would break
        # the column alignment below; remove them up front instead.
        district_data = district_data.dropna(axis=1, how="all")
        if "n_conflict_total" not in district_data.columns:
            # No observed conflict value in this district to impute from.
            continue

        missing = district_data.index[district_data["n_conflict_total"].isna()]
        if missing.empty:
            continue

        imputed = pd.DataFrame(
            knn_imputer.fit_transform(district_data),
            columns=district_data.columns,
            index=district_data.index,
        )
        df.loc[missing, "n_conflict_total"] = (
            imputed.loc[missing, "n_conflict_total"].to_numpy()
        )

    # All remaining imputations are fitted on every non-object column.
    # NOTE(review): this presumably includes target-like columns (e.g. a
    # "next_prevalence" column, if present) as predictors -- confirm that
    # this is intended and not a source of leakage.
    numeric = df.select_dtypes(exclude=["object", "category"])

    # Keep the original index so the assignments below align row by row
    # even when ``df`` does not carry a default RangeIndex.
    knn_filled = pd.DataFrame(
        knn_imputer.fit_transform(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )

    # MICE imputation
    mice_filled = pd.DataFrame(
        IterativeImputer(
            n_nearest_features=5, max_iter=100, random_state=random_state
        ).fit_transform(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )

    # Replace the original columns with their imputed versions.
    df["ndvi_score"] = knn_filled["ndvi_score"]
    df["phase3plus_perc_x"] = knn_filled["phase3plus_perc_x"]
    df["Price of water"] = mice_filled["Price of water"]

    # Rename the imputed features and drop unwanted ones.
    df = df.rename(
        columns={
            "ndvi_score": "ndvi",
            "phase3plus_perc_x": "ipc",
            "Price of water": "price_of_water",
            "n_conflict_total": "conflicts",
        }
    )
    df = df.drop(["MAM", "Average of centx", "Average of centy"], axis=1)
    return df



In [4]:
# Run the imputation pipeline on the loaded frame.
df_imputed = impute_values(df=df)

In [5]:
# Final dataset: only the rows where next_prevalence is present
df_imputed[df_imputed["next_prevalence"].notna()]

Unnamed: 0,date,district,total population,Under-Five Population,GAM,SAM,GAM Prevalence,SAM Prevalence,ipc,rainfall,ndvi,price_of_water,Total alarms,conflicts,prevalence_6lag,next_prevalence,month,increase,increase_numeric,district_encoded
0,2017-07-01,Adan Yabaal,65262.96000,13052.59200,4819.01697,1085.97565,0.36920,0.08320,0.18000,19.15667,0.21500,15.00000,2.16667,2.15000,,0.35100,7,False,-0.01820,0
1,2017-07-01,Luuq,100476.76500,20095.35300,8673.15435,1306.19795,0.43160,0.06500,0.21000,24.24000,0.19333,15.00000,7.83333,1.50000,,0.39260,7,False,-0.03900,59
2,2017-07-01,Buur Hakaba,165968.46000,33193.69200,11909.89669,3711.05477,0.35880,0.11180,0.35000,34.94500,0.34833,52.50000,6.16667,4.66667,,0.28860,7,False,-0.07020,24
3,2017-07-01,Marka,282222.76500,56444.55300,20839.32897,4696.18681,0.36920,0.08320,0.17000,23.56500,0.29833,36.50000,11.83333,11.83333,,0.35100,7,False,-0.01820,60
4,2017-07-01,Buuhoodle,71317.71000,14263.54200,4858.16241,1205.26930,0.34060,0.08450,0.37000,19.54000,0.21333,15.85171,2.33333,2.50000,,0.20280,7,False,-0.13780,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,2021-01-01,Qardho,64346.00000,12870.00000,4555.00000,690.00000,0.35392,0.05361,0.23000,8.12000,0.14833,15.95630,0.16667,1.20000,0.39980,0.42264,1,True,0.06871,64
580,2021-01-01,Caluula,51766.00000,10355.00000,3720.00000,595.00000,0.35925,0.05746,0.16000,18.48000,0.23167,31.66667,3.66667,1.00000,0.47291,0.34388,1,False,-0.01537,28
581,2021-01-01,Cadale,61428.00000,12285.00000,3390.00000,435.00000,0.27595,0.03541,0.10000,19.06500,0.39667,13.78488,3.50000,2.75000,0.29052,0.28736,1,True,0.01141,27
582,2021-01-01,Afmadow,164086.00000,32815.00000,9905.00000,1660.00000,0.30184,0.05059,0.13000,23.69500,0.39500,18.16667,3.33333,8.66667,0.27809,0.39162,1,True,0.08977,2


In [11]:
# Column names left after skipping the first two columns and removing the listed ones
df_imputed.iloc[:, 2:].drop(
    columns=[
        'next_prevalence',
        'GAM',
        'SAM',
        'increase',
        'increase_numeric',
        'Under-Five Population',
        'district_encoded',
    ]
).columns

Index(['total population', 'GAM Prevalence', 'SAM Prevalence', 'ipc',
       'rainfall', 'ndvi', 'price_of_water', 'Total alarms', 'conflicts',
       'prevalence_6lag', 'month'],
      dtype='object')

In [6]:
# Persist the imputed frame; the row index is written as the first (unnamed) column.
df_imputed.to_csv("imputed_data.csv", index=True)