# Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from helper_metrics import count_missing_district, count_missing_district_total
import matplotlib.pyplot as plt
from helper_metrics import impute_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics import mean_squared_error
import warnings
import altair as alt
warnings.filterwarnings("ignore")

# Load data

In [2]:
# Feature columns whose missing entries are tracked in this notebook.
id_columns = ['Price of water', 'n_conflict_total', 'ndvi_score']

# Read the raw CSV, drop its first column, and keep only the rows in which
# next_prevalence is present.
df = pd.read_csv("data/semiyearly_chosen_columns.csv")
df = df.iloc[:, 1:]
df = df.loc[df['next_prevalence'].notna()]

# Row labels (of the filtered frame) where each tracked feature is missing.
pow_index = df.index[df['Price of water'].isna()]
conflict_index = df.index[df['n_conflict_total'].isna()]
ndvi_index = df.index[df['ndvi_score'].isna()]

In [3]:
# Load the imputed data set: drop the first CSV column and keep only the rows
# in which next_prevalence is present.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this relative path — confirm where imputed_data.csv actually lives.
df_imputed = pd.read_csv("./imputed_data.csv")
df_imputed = df_imputed.iloc[:, 1:]
df_imputed = df_imputed.loc[df_imputed['next_prevalence'].notna()]
# NOTE(review): this displays `df`, not the `df_imputed` frame loaded above —
# confirm which frame was meant to be shown.
df

FileNotFoundError: [Errno 2] No such file or directory: './imputed_data.csv'

In [None]:
def impute_dummy(df, feature, index):
    """Flag each row of one feature as originally observed or imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns 'date', 'district' and `feature`.
    feature : str
        Name of the feature column to extract.
    index : iterable of row labels
        Labels, matched against ``df.index``, of the rows flagged as imputed.

    Returns
    -------
    pandas.DataFrame
        A copy of the columns ['date', 'district', feature] with an extra
        'imputed' column holding the string 'imputed' for rows whose label is
        in `index` and 'actual' for every other row.
    """
    # Work on a copy so the caller's frame is never written to.
    data = df[['date', 'district', feature]].copy()
    # Match on the frame's own labels rather than on 0..len-1 positions, so a
    # filtered (non-contiguous) index is labelled correctly and no rows are
    # added for labels that do not exist.
    data['imputed'] = np.where(data.index.isin(index), 'imputed', 'actual')
    return data

In [None]:
# Tag every row of each imputed feature as 'imputed' or 'actual', using the
# missing-row labels collected from the raw data.
# NOTE(review): assumes df_imputed names these columns 'price_of_water', 'ndvi'
# and 'conflicts' (the raw frame uses 'Price of water', 'ndvi_score' and
# 'n_conflict_total') — confirm against the imputed CSV.
df_pow = impute_dummy(df_imputed, 'price_of_water', pow_index)
df_ndvi = impute_dummy(df_imputed, "ndvi", ndvi_index)
df_conflict = impute_dummy(df_imputed, "conflicts", conflict_index)
# The original misspelled name is kept as an alias so existing references work.
df_conlict = df_conflict

In [None]:
def dot_range(result, lower_bound, upper_bound, save_value=False, metric='accuracy'):
    """Build a layered Altair chart showing a score between two reference bounds.

    The chart has a single row made of: a gray tick at ``lower_bound``, a blue
    tick at ``upper_bound``, a faint rule joining the two, a coral dot at
    ``result``, and one text label per bound.

    Parameters
    ----------
    result : number
        Value drawn as the dot.
    lower_bound : number
        Position of the gray tick (stored in the 'random guessing' column).
    upper_bound : number
        Position of the blue tick (stored in the 'perfect accuracy' column).
    save_value : bool, optional
        Not used by the function; kept so existing calls stay valid.
    metric : str, optional
        Name of the plotted metric, used as the row category and as the x-axis
        title. Defaults to 'accuracy', the value that used to be hard-coded,
        so callers plotting another metric (e.g. an RMSE) can label it.

    Returns
    -------
    The layered chart obtained by adding the individual Altair charts together.
    """
    width = 200

    # One-row frame; every layer below reads its position from these columns.
    validations = pd.DataFrame(data={
        'metric': [metric],
        'result': [result],
        'perfect accuracy': [upper_bound],
        'random guessing': [lower_bound]
    })

    # Custom x domain: the two bounds padded by 0.1 on each side.
    domain_ = (lower_bound - 0.1, upper_bound + 0.1)

    # Dot marking the result.
    circles = alt.Chart(
        validations, height=50, width=width).mark_circle(
            size=105, color='#FF7F50', opacity=1).encode(
                x='result:Q', y='metric:N')

    # Blue tick at the upper bound; this layer carries the custom domain.
    ticks = alt.Chart(validations).mark_tick(
        size=30, color='#1070CA', thickness=3).encode(
            x=alt.X(
                'perfect accuracy:Q',
                scale=alt.Scale(domain=domain_),
                axis=alt.Axis(grid=False, title=None)),
            y='metric:N')

    # Gray tick at the lower bound; this layer carries the axis title and
    # hides the axis labels.
    bands = alt.Chart(
        validations, height=50, width=width).mark_tick(
            size=30, color='gray', opacity=1, thickness=3).encode(
                x=alt.X(
                    'random guessing:Q',
                    scale=alt.Scale(),
                    axis=alt.Axis(grid=False, title=metric, titleAnchor='middle', titleAlign='center', titleFontSize=16, tickOpacity=0.7, labels=False)),
                y=alt.Y('metric:N', axis=alt.Axis(title=None, tickOpacity=0, labels=False))
    )

    # Faint rule connecting the lower and upper bounds.
    lines = alt.Chart(validations).mark_rule(
        size=5, color='gray', opacity=0.2).encode(
            x='random guessing:Q', x2='perfect accuracy:Q', y='metric:N')

    # Lower-bound value, lifted 20px above its tick.
    text = bands.mark_text(
        align='center',
        baseline='middle',
        dy=-20,
        color="grey"
    ).encode(
        text='random guessing:Q'
    )

    # Upper-bound value. It is anchored at the lower-bound position (it derives
    # from `bands`) and pushed right by a fixed 135px.
    # NOTE(review): the 135px offset looks hand-tuned for the 200px width and
    # will not track the upper tick for every domain — confirm.
    text2 = bands.mark_text(
        align='center',
        baseline='middle',
        dy=-20,
        dx=135,
        color="blue"
    ).encode(
        text='perfect accuracy:Q'
    )

    # Layer order is kept exactly as before: `bands` first, then the rest.
    chart = bands + text2 + lines + circles + ticks + text

    return chart



In [None]:
# RMSE for conflicts (KNN), drawn between the bounds 0 and 68.
# Note: dot_range titles the axis 'accuracy' even though this value is an RMSE.
dot_range(result=0.806, lower_bound=0, upper_bound=68)

In [None]:
# RMSE for conflicts (spline), drawn between the bounds 0 and 68.
dot_range(result=2.3, lower_bound=0, upper_bound=68)

In [None]:
# RMSE for IPC (KNN), drawn between the bounds 0 and 0.58.
dot_range(result=0.054, lower_bound=0, upper_bound=0.58)

In [None]:
# RMSE for NDVI (KNN), drawn between the bounds 0 and 0.61.
# NOTE(review): 0.61 is presumably the maximum ndvi_score — confirm.
dot_range(result=0.053, lower_bound=0, upper_bound=0.61)

In [None]:
# RMSE for price of water (MICE), drawn between the bounds 0 and 100.
dot_range(result=11.296, lower_bound=0, upper_bound=100)

In [None]:
# Largest ndvi_score value present in the frame.
df['ndvi_score'].max()

In [None]:
# Hand-entered missing-value counts per feature, drawn as a bar chart.
d = pd.DataFrame({
    "Features": ["Conflict", "IPC", "NDVI", "Price of Water"],
    "Missing Values": [77, 2, 7, 287],
})
sns.barplot(data=d, x="Features", y="Missing Values")

# Number of missing values per feature

In [None]:
# Re-read the raw CSV and drop its first column. No next_prevalence filter is
# applied here, and the name `df` is rebound to this unfiltered frame.
df = pd.read_csv("data/semiyearly_chosen_columns.csv")
df = df.iloc[:, 1:]

In [None]:
# Total number of cells in the frame (rows x columns).
df.size

In [None]:
# Keep the non-object, non-category columns and remove the columns listed
# below; then show how many cells the resulting frame has.
predictors = (
    df.select_dtypes(exclude=["object", "category"])
      .drop(columns=[
          'next_prevalence',
          'increase_numeric',
          'GAM',
          'SAM',
          'MAM',
          'Under-Five Population',
      ])
)
predictors.size

In [None]:
# Percentage of predictor cells that are missing. The denominator comes from
# the frame itself rather than a hard-coded cell count (previously 9184), so
# it stays correct if the data or the column selection changes.
predictors.isna().sum().sum() / predictors.size * 100

In [None]:
# Count missing values per column, largest first.
missing = (
    df.isna()
      .sum()
      .reset_index()
      .rename(columns={"index": "Feature", 0: "Number of Missing Values"})
      .sort_values("Number of Missing Values", ascending=False)
)
# Keep only the columns that have at least one missing value and renumber rows.
missing = missing[missing['Number of Missing Values'] > 0].reset_index(drop=True)

# Replace raw column names with display labels, addressed by row number after
# the sort above.
# NOTE(review): the labels are tied to rows 2-9, so they silently end up on the
# wrong feature if the ordering of the missing counts changes — confirm.
for position, label in {
    2: 'Conflicts',
    3: 'Numeric Increase in Prevalence',
    4: 'Boolean Increase in Prevalence',
    5: 'Prevalence Estimate',
    6: 'Lagged Prevalence',
    7: 'Population',
    8: 'NDVI',
    9: 'IPC',
}.items():
    missing.loc[position, 'Feature'] = label

# Not referenced anywhere in this cell.
my_dpi = 96

# Bar chart without grid lines and with the axes box switched off.
ax = sns.barplot(data=missing, x="Feature", y="Number of Missing Values", palette="bwr")
ax.grid(False)
ax.spines['left'].set_visible(True)
plt.box(False)
plt.xticks(rotation=90)

# Write the figure to disk.
plt.savefig('Images/missing.png', dpi=300, bbox_inches='tight')

In [None]:
# hsabab