## Imports

In [48]:
"""------------SECTION IMPORTS---------------------"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import tqdm
'''------------SECTION USER VARIABLES--------------'''
# Define the path to your datafolder below
your_datapath = '../data/'

# Define search space for number of trees in random forest and depth of trees
num_trees_min = 64
num_trees_max = 128

depth_min = 2
depth_max = 7

# Useful functions

In [49]:
# Function that returns every possible subset (except the empty set) of the input list l
def subsets(l: object) -> object:
    subset_list = []
    for i in range(len(l) + 1):
        for j in range(i):
            subset_list.append(l[j: i])
    return subset_list

# Import data

In [50]:
df = pd.read_csv("monthly_data.csv")
df2 = df.dropna(axis=1)
df2["next_prevalence"]= df["next_prevalence"]
df2 = df2.dropna(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["next_prevalence"]= df["next_prevalence"]


In [51]:
df = df2.copy()

In [52]:
df.isna().sum()

Unnamed: 0                 0
Unnamed: 0.3_x             0
date                       0
Unnamed: 0.2_x             0
Unnamed: 0.1_x             0
Unnamed: 0_x               0
district_x                 0
Under-Five Population_x    0
GAM_x                      0
SAM_x                      0
GAM Prevalence_x           0
SAM Prevalence_x           0
SAM/GAM ratio_x            0
Unnamed: 0.3_y             0
Unnamed: 0.2_y             0
Unnamed: 0.1_y             0
Unnamed: 0_y               0
district_y                 0
Under-Five Population_y    0
GAM_y                      0
SAM_y                      0
GAM Prevalence_y           0
SAM Prevalence_y           0
SAM/GAM ratio_y            0
Unnamed: 0.3               0
Unnamed: 0.2               0
Unnamed: 0.1_x.1           0
Unnamed: 0_x.1             0
district_x.1               0
Under-Five Population      0
GAM                        0
SAM                        0
GAM Prevalence             0
SAM Prevalence             0
SAM/GAM ratio 

In [53]:
df

Unnamed: 0.1,Unnamed: 0,Unnamed: 0.3_x,date,Unnamed: 0.2_x,Unnamed: 0.1_x,Unnamed: 0_x,district_x,Under-Five Population_x,GAM_x,SAM_x,...,Average of centx,Unnamed: 0.1_x.2,Unnamed: 0_x.2,district_x.2,rainfall,Total alarms,month,district,district_encoded,next_prevalence
0,0,0,2017-07-01,603,603,603,Cabudwaaq,14666.157000,7626.401640,1754.072377,...,46.12500,3955,3955,Cabudwaaq,3.30,4,7,Cabudwaaq,22,0.520000
1,1,2316,2017-07-01,661,661,661,Qandala,15164.613000,6150.767033,1025.127839,...,50.08869,3944,3944,Qandala,0.86,1,7,Qandala,58,0.405600
2,2,2910,2017-07-01,641,641,641,Garbahaarey,18274.995000,7887.487842,1187.874675,...,42.35913,3983,3983,Garbahaarey,6.70,2,7,Garbahaarey,39,0.431600
3,3,3612,2017-07-01,659,659,659,Banadir,415454.789000,161739.176773,37542.259279,...,45.42547,3966,3966,Banadir,40.37,14,7,Banadir,8,0.389306
4,4,1026,2017-07-01,637,637,637,Doolow,10046.740000,4376.618584,677.305460,...,42.14080,3987,3987,Doolow,0.28,1,7,Doolow,35,0.435626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3966,3966,372,2026-01-01,10,10,10,Baraawe,10954.968728,5100.000000,1060.000000,...,43.52054,28,28,Baraawe,2.57,6,1,Baraawe,10,0.465542
3967,3967,373,2026-02-01,10,10,10,Baraawe,10954.968728,5100.000000,1060.000000,...,43.52054,28,28,Baraawe,2.57,6,2,Baraawe,10,0.465542
3968,3968,374,2026-03-01,10,10,10,Baraawe,10954.968728,5100.000000,1060.000000,...,43.52054,28,28,Baraawe,2.57,6,3,Baraawe,10,0.465542
3969,3969,375,2026-04-01,10,10,10,Baraawe,10954.968728,5100.000000,1060.000000,...,43.52054,28,28,Baraawe,2.57,6,4,Baraawe,10,0.465542


## RF-CV

In [54]:
'''------------SECTION RANDOM FOREST CROSS VALIDATION--------------'''
# WARNING: this process can take some time, since there are a lot of hyperparameters to investigate. The search space can be manually reduced to speed up the process.

# Create empty list to store model scores
parameter_scores = []

# Define target and explanatory variables
X = df.select_dtypes(exclude=["category","object"])
y = df['next_prevalence'].values

In [55]:
X.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0.3_x,Unnamed: 0.2_x,Unnamed: 0.1_x,Unnamed: 0_x,Under-Five Population_x,GAM_x,SAM_x,GAM Prevalence_x,SAM Prevalence_x,...,Unnamed: 0_y.1,Average of centy,Average of centx,Unnamed: 0.1_x.2,Unnamed: 0_x.2,rainfall,Total alarms,month,district_encoded,next_prevalence
0,0,0,603,603,603,14666.157,7626.40164,1754.072377,0.52,0.1196,...,1095,5.826085,46.125,3955,3955,3.3,4,7,22,0.52
1,1,2316,661,661,661,15164.613,6150.767033,1025.127839,0.4056,0.0676,...,2859,11.06016,50.08869,3944,3944,0.86,1,7,58,0.4056
2,2,2910,641,641,641,18274.995,7887.487842,1187.874675,0.4316,0.065,...,1928,3.205999,42.35913,3983,3983,6.7,2,7,39,0.4316
3,3,3612,659,659,659,415454.789,161739.176773,37542.259279,0.389306,0.090364,...,360,2.110552,45.42547,3966,3966,40.37,14,7,8,0.389306
4,4,1026,637,637,637,10046.74,4376.618584,677.30546,0.435626,0.067415,...,1732,4.002533,42.1408,3987,3987,0.28,1,7,35,0.435626


In [56]:
y

array([0.52      , 0.4056    , 0.4316    , ..., 0.46554218, 0.46554218,
       0.46554218])

In [None]:


for num_trees in range(num_trees_min, num_trees_max):

    for depth in range(depth_min, depth_max):

        # Investigate every subset of explanatory variables
        for features in subsets(X.columns):
            # First CV split. The 99 refers to the first 3 observations for the 33 districts in the data.
            Xtrain = X[:99][features].copy().values
            ytrain = y[:99]
            Xtest = X[99:132][features].copy().values
            ytest = y[99:132]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0, n_jobs=-1)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE1 = mean_absolute_error(ytest, predictions)

            # Second CV split. The 132 refers to the first 4 observations for the 33 districts in the data.
            Xtrain = X[:132][features].copy().values
            ytrain = y[:132]
            Xtest = X[132:165][features].copy().values
            ytest = y[132:165]

            # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0, n_jobs=-1)

            # Fit to the training data
            clf.fit(Xtrain, ytrain)

            # Make a prediction on the test data
            predictions = clf.predict(Xtest)

            # Calculate mean absolute error
            MAE2 = mean_absolute_error(ytest, predictions)

            # Calculate the mean MAE over the two folds
            mean_MAE = (MAE1 + MAE2) / 2

            # Store the mean MAE together with the used hyperparameters in list
            parameter_scores.append((mean_MAE, num_trees, depth, features))

# Sort the models based on score and retrieve the hyperparameters of the best model
parameter_scores.sort(key=lambda x: x[0])
best_model_score = parameter_scores[0][0]
best_model_trees = parameter_scores[0][1]
best_model_depth = parameter_scores[0][2]
best_model_columns = list(parameter_scores[0][3])

'''------------SECTION FINAL EVALUATION--------------'''
X = df[best_model_columns].values
y = df['next_prevalence'].values

# If there is only one explanatory variable, the values need to be reshaped for the model
if len(best_model_columns) == 1:
    X = X.reshape(-1, 1)

# Peform evaluation on full data
Xtrain = X[:165]
ytrain = y[:165]
Xtest = X[165:]
ytest = y[165:]

clf = RandomForestRegressor(n_estimators=best_model_trees, max_depth=best_model_depth, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)
predictions = clf.predict(Xtest)

# Calculate MAE
MAE = mean_absolute_error(ytest, predictions)

# Generate boolean values for increase or decrease in prevalence. 0 if next prevalence is smaller than current prevalence, 1 otherwise.
increase = [0 if x < y else 1 for x in df.iloc[165:]['next_prevalence'] for y in df.iloc[165:]['prevalence']]
predicted_increase = [0 if x < y else 1 for x in predictions for y in df.iloc[165:]['prevalence']]

# Calculate accuracy of predicted boolean increase/decrease
acc = accuracy_score(increase, predicted_increase)

# Print model parameters
print('no. of trees: ' + str(best_model_trees) + '\nmax_depth: ' + str(best_model_depth) + '\ncolumns: ' + str(
    best_model_columns))

# Print model scores
print(MAE, acc)

# Save model

In [None]:
filename = 'baseline_monthly_model.joblib'
joblib.dump(clf, filename)

# Metrics

In [None]:
from helper_metrics import make_confusion_matrix, roc_curve_gen, calculate_results

In [None]:
from helper_metrics import make_confusion_matrix

In [None]:
make_confusion_matrix(increase, predicted_increase,figsize=(6,6),classes=["Increase", "Decrease"],cmap=plt.cm.Greens)

In [None]:
roc_curve_gen(increase, predicted_increase)

In [None]:
calculate_results(increase, predicted_increase, average="weighted")