# Imports

In [104]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from helper_metrics import count_missing_district, count_missing_district_total
import matplotlib.pyplot as plt
from sklearn.experimental    import enable_iterative_imputer
from sklearn.impute          import IterativeImputer
from sklearn.experimental    import enable_hist_gradient_boosting
from sklearn.ensemble        import HistGradientBoostingRegressor

from sklearn.model_selection import KFold
from sklearn.metrics         import mean_absolute_error, accuracy_score
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Load data

In [105]:
df = pd.read_csv("data/semiyearly_chosen_columns.csv").iloc[:,1:]
df

Unnamed: 0,date,district,total population,Under-Five Population,GAM,MAM,SAM,GAM Prevalence,SAM Prevalence,phase3plus_perc_x,...,Total alarms,n_conflict_total,Average of centy,Average of centx,prevalence_6lag,next_prevalence,month,increase,increase_numeric,district_encoded
0,2017-07-01,Adan Yabaal,65262.96000,13052.59200,4819.01697,3733.04131,1085.97565,0.36920,0.08320,0.18000,...,2.16667,,3.54944,46.54467,,0.35100,7,False,-0.01820,0
1,2017-07-01,Lughaye,70268.22000,14053.64400,5334.76326,4220.30929,1114.45397,0.37960,0.07930,0.36000,...,2.66667,1.00000,10.64738,43.57812,,0.16900,7,False,-0.21060,58
2,2017-07-01,Buuhoodle,71317.71000,14263.54200,4858.16241,3652.89311,1205.26930,0.34060,0.08450,0.37000,...,2.33333,2.50000,8.46016,46.66129,,0.20280,7,False,-0.13780,23
3,2017-07-01,Luuq,100476.76500,20095.35300,8673.15435,7366.95641,1306.19795,0.43160,0.06500,0.21000,...,7.83333,1.50000,3.79293,42.69760,,0.39260,7,False,-0.03900,59
4,2017-07-01,Burtinle,112734.27000,22546.85400,10200.19675,8500.16396,1700.03279,0.45240,0.07540,0.22000,...,3.66667,,7.80220,48.39912,,0.37960,7,False,-0.07280,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,2021-07-01,Jariiban,,32671.60000,10890.00000,,1430.00000,0.33332,0.04377,0.19000,...,2.16667,,7.16378,48.99860,0.34857,,7,,,50
658,2021-07-01,Caluula,,16168.60000,5560.00000,,870.00000,0.34388,0.05381,0.16000,...,3.16667,1.00000,11.66822,50.79402,0.35925,,7,,,28
659,2021-07-01,Qoryooley,,25309.00087,11420.00000,,2160.00000,0.45122,0.08535,0.08000,...,6.16667,3.50000,1.93456,44.44943,0.45939,,7,,,65
660,2021-07-01,Baki,,11187.80000,3470.00000,,640.00000,0.31016,0.05721,0.37000,...,0.66667,1.00000,10.28566,43.73210,0.22769,,7,,,7


# Create train and test sets

In [106]:
y = df.next_prevalence.dropna()
X = df.select_dtypes(exclude=["object", "category"]).iloc[:len(y)].drop("next_prevalence", axis=1)

# Subsets

In [107]:
# Function that returns every possible subset (except the empty set) of the input list l
def subsets(l: object) -> object:
    subset_list = []
    for i in range(len(l) + 1):
        for j in range(i):
            subset_list.append(l[j: i])
    return subset_list

# Cross Validation Training

In [108]:
73*5

365

In [109]:
# Define search space for number of trees in random forest and depth of trees
num_trees_min = 31
num_trees_max = 64

depth_min = 2
depth_max = 7

parameter_scores = []

for num_trees in tqdm(range(num_trees_min, num_trees_max)):

    for depth in range(depth_min, depth_max):

        # Investigate every subset of explanatory variables
        for features in subsets(X.columns):
            # First CV split. The 99 refers to the first 3 observations for the 33 districts in the data.
            Xtrain = X[:219][features].copy().values
            ytrain = y[:219]
            Xtest = X[219:292][features].copy().values
            ytest = y[219:292]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = HistGradientBoostingRegressor(max_leaf_nodes=num_trees, max_depth=depth, random_state=0)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE1 = mean_absolute_error(ytest, predictions)

            # Second CV split. The 132 refers to the first 4 observations for the 33 districts in the data.
            Xtrain = X[:292][features].copy().values
            ytrain = y[:292]
            Xtest = X[292:365][features].copy().values
            ytest = y[292:365]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = HistGradientBoostingRegressor(max_leaf_nodes=num_trees, max_depth=depth, random_state=0)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE2 = mean_absolute_error(ytest, predictions)

            # Calculate the mean MAE over the two folds
            mean_MAE = (MAE1 + MAE2) / 2

            # Store the mean MAE together with the used hyperparameters in list
            parameter_scores.append((mean_MAE, num_trees, depth, features))

# Sort the models based on score and retrieve the hyperparameters of the best model
parameter_scores.sort(key=lambda x: x[0])
best_model_score = parameter_scores[0][0]
best_model_trees = parameter_scores[0][1]
best_model_depth = parameter_scores[0][2]
best_model_columns = list(parameter_scores[0][3])

'''------------SECTION FINAL EVALUATION--------------'''
y = df['next_prevalence'].values
X = df[best_model_columns].values

# If there is only one explanatory variable, the values need to be reshaped for the model
if len(best_model_columns) == 1:
    X = X.reshape(-1, 1)

# Peform evaluation on full data
Xtrain = X[:365]
ytrain = y[:365]
Xtest = X[365:]
ytest = y[365:]

clf = HistGradientBoostingRegressor(max_leaf_nodes=best_model_trees, max_depth=best_model_depth, random_state=0, verbose=1)
clf.fit(Xtrain, ytrain)
predictions = clf.predict(Xtest)

100%|██████████| 33/33 [1:49:25<00:00, 198.96s/it]

Binning 0.000 GB of training data: 0.006 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 12 leaves, max depth = 6, in 0.004s
[2/100] 1 tree, 13 leaves, max depth = 6, in 0.004s
[3/100] 1 tree, 13 leaves, max depth = 6, in 0.005s
[4/100] 1 tree, 13 leaves, max depth = 6, in 0.004s
[5/100] 1 tree, 13 leaves, max depth = 6, in 0.005s
[6/100] 1 tree, 15 leaves, max depth = 6, in 0.006s
[7/100] 1 tree, 12 leaves, max depth = 6, in 0.005s
[8/100] 1 tree, 15 leaves, max depth = 5, in 0.004s
[9/100] 1 tree, 14 leaves, max depth = 6, in 0.004s
[10/100] 1 tree, 13 leaves, max depth = 6, in 0.004s
[11/100] 1 tree, 14 leaves, max depth = 6, in 0.006s
[12/100] 1 tree, 13 leaves, max depth = 6, in 0.004s
[13/100] 1 tree, 14 leaves, max depth = 6, in 0.005s
[14/100] 1 tree, 12 leaves, max depth = 6, in 0.005s
[15/100] 1 tree, 9 leaves, max depth = 6, in 0.004s
[16/100] 1 tree, 8 leaves, max depth = 6, in 0.004s
[17/100] 1 tree, 13 leaves, max depth = 6, in 0.006s
[18/100] 1 tree, 11 leaves, max de




1 tree, 13 leaves, max depth = 6, in 0.007s
[35/100] 1 tree, 14 leaves, max depth = 6, in 0.005s
[36/100] 1 tree, 8 leaves, max depth = 6, in 0.003s
[37/100] 1 tree, 9 leaves, max depth = 6, in 0.004s
[38/100] 1 tree, 9 leaves, max depth = 6, in 0.004s
[39/100] 1 tree, 14 leaves, max depth = 6, in 0.005s
[40/100] 1 tree, 7 leaves, max depth = 6, in 0.003s
[41/100] 1 tree, 9 leaves, max depth = 6, in 0.003s
[42/100] 1 tree, 13 leaves, max depth = 6, in 0.004s
[43/100] 1 tree, 8 leaves, max depth = 6, in 0.004s
[44/100] 1 tree, 15 leaves, max depth = 6, in 0.004s
[45/100] 1 tree, 7 leaves, max depth = 6, in 0.003s
[46/100] 1 tree, 12 leaves, max depth = 6, in 0.006s
[47/100] 1 tree, 10 leaves, max depth = 6, in 0.005s
[48/100] 1 tree, 12 leaves, max depth = 6, in 0.004s
[49/100] 1 tree, 8 leaves, max depth = 6, in 0.004s
[50/100] 1 tree, 13 leaves, max depth = 6, in 0.005s
[51/100] 1 tree, 11 leaves, max depth = 6, in 0.004s
[52/100] 1 tree, 8 leaves, max depth = 6, in 0.004s
[53/100] 1 

In [112]:
# Calculate MAE
y_true = pd.Series(ytest[:-73]).drop(69)
y_pred = pd.Series(predictions[:-73]).drop(69)
#MAE = mean_absolute_error(ytest, predictions)
MAE = mean_absolute_error(y_true, y_pred)

# Generate boolean values for increase or decrease in prevalence. 0 if next prevalence is smaller than current prevalence, 1 otherwise.
increase = [0 if x < y else 1 for x in df.iloc[365:]['next_prevalence'] for y in df.iloc[365:]['GAM Prevalence']]
predicted_increase = [0 if x < y else 1 for x in predictions for y in df.iloc[365:]['GAM Prevalence']]

len(increase), len(predicted_increase)

(88209, 88209)

In [113]:
# Calculate accuracy of predicted boolean increase/decrease
acc = accuracy_score(increase, predicted_increase)

# Print model parameters
print('no. of trees: ' + str(best_model_trees) + '\nmax_depth: ' + str(best_model_depth) + '\ncolumns: ' + str(
    best_model_columns))

# Print model scores
print(MAE, acc)

no. of trees: 31
max_depth: 6
columns: ['total population', 'Under-Five Population', 'GAM', 'MAM', 'SAM', 'GAM Prevalence', 'SAM Prevalence', 'phase3plus_perc_x', 'rainfall', 'ndvi_score', 'Price of water', 'Total alarms', 'n_conflict_total', 'Average of centy', 'Average of centx', 'prevalence_6lag', 'month', 'increase_numeric']
0.021789393055805115 0.8224217483476742
