# Imports

In [259]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from helper_metrics import count_missing_district, count_missing_district_total
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import warnings
# NOTE(review): this silences every warning for the whole kernel, which also
# hides scikit-learn ConvergenceWarning messages from the imputers used below
# and pandas deprecation notices. Consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Load data

In [260]:
# Read the semi-yearly dataset from disk, then discard the first column by
# position (everything from the second column onwards is kept).
df = pd.read_csv("data/semiyearly_chosen_columns.csv")
df = df.iloc[:, 1:]

# Perform Imputations

In [261]:
def impute_values(df, n_neighbors=5, random_state=0):
    """Impute missing values in the district dataset and tidy its columns.

    The pipeline runs in four stages:

    1. ``n_conflict_total`` is imputed per district with a KNN imputer fitted
       only on that district's rows (all columns except ``date``, ``increase``,
       ``Average of centx``, ``Price of water``, ``Average of centy`` and
       ``district`` are used as features).
    2. ``ndvi_score`` and ``phase3plus_perc_x`` are replaced by the output of a
       KNN imputer fitted on every non-object, non-category column.
    3. ``Price of water`` is replaced by the output of an ``IterativeImputer``
       (MICE) fitted on the same columns.
    4. Columns are renamed (``ndvi``, ``ipc``, ``price_of_water``,
       ``conflicts``) and ``MAM``, ``Average of centx`` and
       ``Average of centy`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; the function works on a copy.
    n_neighbors : int, default 5
        Number of neighbours used by both KNN imputations.
    random_state : int or None, default 0
        Seed passed to ``IterativeImputer``. With ``n_nearest_features`` set,
        the imputer picks features at random, so a fixed seed is needed for the
        notebook to reproduce its results on a fresh run. Pass ``None`` to get
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with imputed values, renamed columns and the unwanted
        columns removed. Districts whose ``n_conflict_total`` is missing in
        every row cannot be imputed district-wise and keep their NaN values.
    """
    # Work on a copy so the caller's raw frame is not silently altered.
    df = df.copy()

    knn_imputer = KNNImputer(n_neighbors=n_neighbors)

    # Feature subset used only for the district-wise conflict imputation.
    conflict_features = df.drop(
        ["date", "increase", "Average of centx", "Price of water", "Average of centy"],
        axis=1,
    )

    # Iterate over the actual district names instead of the first N row labels,
    # which only covers every district if the rows happen to be ordered that way.
    for name in conflict_features["district"].dropna().unique():
        district_rows = conflict_features[conflict_features["district"] == name]
        district_data = district_rows.drop("district", axis=1)

        # Row labels whose conflict count is missing in this district.
        missing_idx = district_data.index[district_data["n_conflict_total"].isna()]
        if missing_idx.empty:
            continue  # nothing to impute here

        # KNNImputer leaves out features that have no observed value at all,
        # which would make the output narrower than the column list. Drop such
        # columns up front so output and labels always line up.
        district_data = district_data.dropna(axis=1, how="all")
        if "n_conflict_total" not in district_data.columns:
            # The district has no observed conflict value to learn from.
            continue

        imputed = pd.DataFrame(
            knn_imputer.fit_transform(district_data),
            columns=district_data.columns,
            index=district_data.index,
        )

        # Write back only the previously missing entries.
        df.loc[missing_idx, "n_conflict_total"] = (
            imputed.loc[missing_idx, "n_conflict_total"].to_numpy()
        )

    # Numeric view (now including the imputed conflicts) for the remaining
    # imputations. Entirely empty columns are excluded for the same reason as
    # in the district loop.
    numeric = df.select_dtypes(exclude=["object", "category"]).dropna(axis=1, how="all")

    # KNN imputation for ndvi and ipc; keep the original index so the
    # assignment below aligns row by row even without a default RangeIndex.
    knn_filled = pd.DataFrame(
        knn_imputer.fit_transform(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )

    # MICE imputation for the water price, seeded for reproducibility.
    mice = IterativeImputer(
        n_nearest_features=5, max_iter=100, random_state=random_state
    )
    mice_filled = pd.DataFrame(
        mice.fit_transform(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )

    # Replace the columns with their imputed versions.
    df["ndvi_score"] = knn_filled["ndvi_score"]
    df["phase3plus_perc_x"] = knn_filled["phase3plus_perc_x"]
    df["Price of water"] = mice_filled["Price of water"]

    # Rename features and drop the unwanted ones.
    df = df.rename(
        columns={
            "ndvi_score": "ndvi",
            "phase3plus_perc_x": "ipc",
            "Price of water": "price_of_water",
            "n_conflict_total": "conflicts",
        }
    )
    df = df.drop(["MAM", "Average of centx", "Average of centy"], axis=1)
    return df



In [263]:
# Run the imputation pipeline on the loaded frame. The returned frame uses the
# renamed columns (ndvi, ipc, price_of_water, conflicts).
df_imputed = impute_values(df)

In [272]:
# Final dataset: keep only the rows where next_prevalence is present
df_imputed[df_imputed.next_prevalence.notna()]

Unnamed: 0,date,district,total population,Under-Five Population,GAM,SAM,GAM Prevalence,SAM Prevalence,ipc,rainfall,ndvi,price_of_water,Total alarms,conflicts,prevalence_6lag,next_prevalence,month,increase,increase_numeric,district_encoded
0,2017-07-01,Adan Yabaal,65262.96000,13052.59200,4819.01697,1085.97565,0.36920,0.08320,0.18000,19.15667,0.21500,15.00000,2.16667,2.15000,,0.35100,7,False,-0.01820,0
1,2017-07-01,Lughaye,70268.22000,14053.64400,5334.76326,1114.45397,0.37960,0.07930,0.36000,5.79500,0.09000,4.50000,2.66667,1.00000,,0.16900,7,False,-0.21060,58
2,2017-07-01,Buuhoodle,71317.71000,14263.54200,4858.16241,1205.26930,0.34060,0.08450,0.37000,19.54000,0.21333,16.17814,2.33333,2.50000,,0.20280,7,False,-0.13780,23
3,2017-07-01,Luuq,100476.76500,20095.35300,8673.15435,1306.19795,0.43160,0.06500,0.21000,24.24000,0.19333,15.00000,7.83333,1.50000,,0.39260,7,False,-0.03900,59
4,2017-07-01,Burtinle,112734.27000,22546.85400,10200.19675,1700.03279,0.45240,0.07540,0.22000,15.90333,0.18000,85.00000,3.66667,1.00000,,0.37960,7,False,-0.07280,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2021-01-01,Qandala,52515.00000,10505.00000,3730.00000,600.00000,0.35507,0.05712,0.14000,10.92333,0.22167,14.20528,0.16667,1.00000,0.43397,0.33217,1,False,-0.02290,62
586,2021-01-01,Galdogob,72580.00000,14515.00000,6560.00000,1330.00000,0.45195,0.09163,0.20000,11.78167,0.22667,17.64099,1.16667,1.50000,0.41416,0.45734,1,True,0.00540,41
587,2021-01-01,Qoryooley,207773.00000,41555.00000,19090.00000,3135.00000,0.45939,0.07544,0.08000,19.49667,0.51167,16.16667,3.83333,6.33333,0.48013,0.45122,1,False,-0.00817,65
588,2021-01-01,Diinsoor,139564.00000,27915.00000,13745.00000,2235.00000,0.49239,0.08006,0.13000,27.38500,0.43500,33.75000,5.00000,4.40000,0.44202,0.41964,1,False,-0.07275,37
