## Imports

In [81]:
"""------------SECTION IMPORTS---------------------"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")
import tqdm
'''------------SECTION USER VARIABLES--------------'''
# Define the path to your datafolder below
your_datapath = '../data/'

# Define search space for number of trees in random forest and depth of trees
num_trees_min = 64
num_trees_max = 128

depth_min = 2
depth_max = 7

# Useful functions

In [44]:
# Function that returns every possible subset (except the empty set) of the input list l
def subsets(l: object) -> object:
    subset_list = []
    for i in range(len(l) + 1):
        for j in range(i):
            subset_list.append(l[j: i])
    return subset_list

# Import data
NaN data needs to be dropped for RF to work
1. I drop the columns which have NaN values which is all the new data
2. I create a new dataframe which has no columns containing NaN values
3. Copy the `next_prevalence` column to this dataframe and once again drop rows containing NaN

In [95]:
df = pd.read_csv("semiyearly_data.csv")
df2 = df.dropna(axis=1)
df2["next_prevalence"]= df["next_prevalence"]
df2 = df2.dropna(axis=0)

In [96]:
df2.isna().sum()

Unnamed: 0               0
Unnamed: 0.2             0
Unnamed: 0.1_x           0
Unnamed: 0_x             0
date                     0
district_x               0
Under-Five Population    0
GAM                      0
SAM                      0
GAM Prevalence           0
SAM Prevalence           0
SAM/GAM ratio            0
district                 0
month                    0
district_encoded         0
next_prevalence          0
dtype: int64

In [97]:
# Inspect the dtypes of each column
df2.dtypes

Unnamed: 0                 int64
Unnamed: 0.2               int64
Unnamed: 0.1_x             int64
Unnamed: 0_x               int64
date                      object
district_x                object
Under-Five Population    float64
GAM                      float64
SAM                      float64
GAM Prevalence           float64
SAM Prevalence           float64
SAM/GAM ratio            float64
district                  object
month                      int64
district_encoded           int64
next_prevalence          float64
dtype: object

In [98]:
df = df2.copy()

In [99]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 0.2,Unnamed: 0.1_x,Unnamed: 0_x,date,district_x,Under-Five Population,GAM,SAM,GAM Prevalence,SAM Prevalence,SAM/GAM ratio,district,month,district_encoded,next_prevalence
0,0,605,605,605,2017-07-01,Adan Yabaal,13052.59200,4819.01697,1085.97565,0.36920,0.08320,0.22535,Adan Yabaal,7,0,0.35100
1,1,656,656,656,2017-07-01,Lughaye,14053.64400,5334.76326,1114.45397,0.37960,0.07930,0.20890,Lughaye,7,58,0.16900
2,2,624,624,624,2017-07-01,Buuhoodle,14263.54200,4858.16241,1205.26930,0.34060,0.08450,0.24809,Buuhoodle,7,23,0.20280
3,3,657,657,657,2017-07-01,Luuq,20095.35300,8673.15435,1306.19795,0.43160,0.06500,0.15060,Luuq,7,59,0.39260
4,4,623,623,623,2017-07-01,Burtinle,22546.85400,10200.19675,1700.03279,0.45240,0.07540,0.16667,Burtinle,7,22,0.37960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,599,96,96,96,2021-01-01,Qandala,10505.00000,3730.00000,600.00000,0.35507,0.05712,0.16086,Qandala,1,62,0.33217
600,600,102,102,102,2021-01-01,Galdogob,14515.00000,6560.00000,1330.00000,0.45195,0.09163,0.20274,Galdogob,1,41,0.45734
601,601,122,122,122,2021-01-01,Qoryooley,41555.00000,19090.00000,3135.00000,0.45939,0.07544,0.16422,Qoryooley,1,65,0.45122
602,602,127,127,127,2021-01-01,Diinsoor,27915.00000,13745.00000,2235.00000,0.49239,0.08006,0.16260,Diinsoor,1,37,0.41964


## RF-CV

In [100]:
'''------------SECTION RANDOM FOREST CROSS VALIDATION--------------'''
# WARNING: this process can take some time, since there are a lot of hyperparameters to investigate. The search space can be manually reduced to speed up the process.

# Create empty list to store model scores
parameter_scores = []

# Define target and explanatory variables
X = df.select_dtypes(exclude=["category","object"])
y = df['next_prevalence'].values

In [101]:
X.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0.2,Unnamed: 0.1_x,Unnamed: 0_x,Under-Five Population,GAM,SAM,GAM Prevalence,SAM Prevalence,SAM/GAM ratio,month,district_encoded,next_prevalence
0,0,605,605,605,13052.592,4819.01697,1085.97565,0.3692,0.0832,0.22535,7,0,0.351
1,1,656,656,656,14053.644,5334.76326,1114.45397,0.3796,0.0793,0.2089,7,58,0.169
2,2,624,624,624,14263.542,4858.16241,1205.2693,0.3406,0.0845,0.24809,7,23,0.2028
3,3,657,657,657,20095.353,8673.15435,1306.19795,0.4316,0.065,0.1506,7,59,0.3926
4,4,623,623,623,22546.854,10200.19675,1700.03279,0.4524,0.0754,0.16667,7,22,0.3796


In [102]:
y

array([0.351     , 0.169     , 0.2028    , 0.3926    , 0.3796    ,
       0.351     , 0.2028    , 0.3926    , 0.3432    , 0.3926    ,
       0.2886    , 0.3432    , 0.3432    , 0.169     , 0.351     ,
       0.1976    , 0.4862    , 0.3926    , 0.3926    , 0.3926    ,
       0.2028    , 0.2912    , 0.351     , 0.3692    , 0.4862    ,
       0.3432    , 0.4862    , 0.2912    , 0.2886    , 0.351     ,
       0.2912    , 0.351     , 0.3796    , 0.4004    , 0.3926    ,
       0.3926    , 0.2886    , 0.3796    , 0.4212    , 0.1976    ,
       0.3367    , 0.1976    , 0.3926    , 0.4004    , 0.4212    ,
       0.3432    , 0.4212    , 0.3926    , 0.4862    , 0.3926    ,
       0.3367    , 0.4004    , 0.2912    , 0.3926    , 0.3432    ,
       0.351     , 0.351     , 0.3926    , 0.4212    , 0.4212    ,
       0.351     , 0.2028    , 0.4004    , 0.169     , 0.169     ,
       0.3367    , 0.3926    , 0.4862    , 0.351     , 0.4004    ,
       0.3926    , 0.3926    , 0.2886    , 0.2912    , 0.39714

In [None]:


for num_trees in range(num_trees_min, num_trees_max):

    for depth in range(depth_min, depth_max):

        # Investigate every subset of explanatory variables
        for features in subsets(X.columns):
            # First CV split. The 99 refers to the first 3 observations for the 33 districts in the data.
            Xtrain = X[:99][features].copy().values
            ytrain = y[:99]
            Xtest = X[99:132][features].copy().values
            ytest = y[99:132]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0, n_jobs=-1)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE1 = mean_absolute_error(ytest, predictions)

            # Second CV split. The 132 refers to the first 4 observations for the 33 districts in the data.
            Xtrain = X[:132][features].copy().values
            ytrain = y[:132]
            Xtest = X[132:165][features].copy().values
            ytest = y[132:165]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0, n_jobs=-1)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE2 = mean_absolute_error(ytest, predictions)

            # Calculate the mean MAE over the two folds
            mean_MAE = (MAE1 + MAE2) / 2

            # Store the mean MAE together with the used hyperparameters in list
            parameter_scores.append((mean_MAE, num_trees, depth, features))

# Sort the models based on score and retrieve the hyperparameters of the best model
parameter_scores.sort(key=lambda x: x[0])
best_model_score = parameter_scores[0][0]
best_model_trees = parameter_scores[0][1]
best_model_depth = parameter_scores[0][2]
best_model_columns = list(parameter_scores[0][3])

'''------------SECTION FINAL EVALUATION--------------'''
X = df[best_model_columns].values
y = df['next_prevalence'].values

# If there is only one explanatory variable, the values need to be reshaped for the model
if len(best_model_columns) == 1:
    X = X.reshape(-1, 1)

# Peform evaluation on full data
Xtrain = X[:300]
ytrain = y[:300]
Xtest = X[300:]
ytest = y[300:]

clf = RandomForestRegressor(n_estimators=best_model_trees, max_depth=best_model_depth, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)
predictions = clf.predict(Xtest)



## Fix `prevalence` on line 5 in the next code block

In [None]:
# Calculate MAE
MAE = mean_absolute_error(ytest, predictions)

# Generate boolean values for increase or decrease in prevalence. 0 if next prevalence is smaller than current prevalence, 1 otherwise.
increase = [0 if x < y else 1 for x in df.iloc[165:]['next_prevalence'] for y in df.iloc[165:]['prevalence']]
predicted_increase = [0 if x < y else 1 for x in predictions for y in df.iloc[165:]['prevalence']]

# Calculate accuracy of predicted boolean increase/decrease
acc = accuracy_score(increase, predicted_increase)

# Print model parameters
print('no. of trees: ' + str(best_model_trees) + '\nmax_depth: ' + str(best_model_depth) + '\ncolumns: ' + str(
    best_model_columns))

# Print model scores
print(MAE, acc)

# Save model

In [None]:
filename = 'baseline_semiyearly_model.joblib'
joblib.dump(clf, filename)

# Metrics

In [None]:
from helper_metrics import make_confusion_matrix, roc_curve_gen, calculate_results

In [None]:
make_confusion_matrix(increase, predicted_increase,figsize=(6,6),classes=["Increase", "Decrease"])

In [None]:
roc_curve_gen(increase, predicted_increase)

In [None]:
calculate_results(increase, predicted_increase, average="weighted")