## Imports

In [None]:
"""------------SECTION IMPORTS---------------------"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

'''------------SECTION USER VARIABLES--------------'''
# Path to your data folder.
# NOTE(review): the data-loading cell reads "data/semiyearly_data.csv" directly
# and never uses this variable -- confirm which location is intended.
your_datapath = '../data/'

# Hyperparameter search space for the random forest (number of trees and tree
# depth). Both pairs are passed to range(), so the *_max values are exclusive:
# the search covers 64..127 trees and depths 2..6.
num_trees_min, num_trees_max = 64, 128
depth_min, depth_max = 2, 7

# Useful functions

In [None]:
def subsets(l) -> list:
    """Return every non-empty contiguous slice of ``l``.

    Despite the name, this does NOT build the full power set: only runs of
    adjacent elements ``l[start:stop]`` with ``start < stop`` are produced.
    An input of length n therefore yields n * (n + 1) / 2 slices instead of
    the 2**n - 1 non-empty subsets.

    Slices are ordered by increasing ``stop`` and, for equal ``stop``, by
    increasing ``start``.

    Args:
        l: Any object that supports ``len()`` and slicing (list, tuple, str,
            pandas Index, ...).

    Returns:
        A list of slices of ``l``; each element has whatever type slicing
        ``l`` produces. An empty input gives an empty list.
    """
    return [l[start:stop] for stop in range(1, len(l) + 1) for start in range(stop)]

# Import data
NaN values need to be removed for the random forest (RF) to work:
1. Drop every column that contains NaN values; this removes all of the new data.
2. This gives a new dataframe in which no column contains NaN values.
3. Copy the `next_prevalence` column into this dataframe and then drop the rows that contain NaN.

In [None]:
# Load the raw semi-yearly data
df = pd.read_csv("data/semiyearly_data.csv")

# Build the cleaned frame in one chain:
#   1. keep only the columns that are completely free of NaN,
#   2. (re-)attach the target column, which step 1 removes if it contains NaN,
#   3. drop the rows that still contain NaN (only the target can at this point).
df2 = (
    df.dropna(axis=1)
    .assign(next_prevalence=df["next_prevalence"])
    .dropna(axis=0)
)

In [None]:
df2.isna().sum()

In [None]:
# Inspect the dtypes of each column
df2.dtypes

In [None]:
# Continue with an independent copy of the cleaned frame. This rebinds df, so
# from here on df no longer refers to the raw CSV contents.
df = df2.copy()

In [None]:
df

## RF-CV

In [None]:
'''------------SECTION RANDOM FOREST CROSS VALIDATION--------------'''
# WARNING: this process can take some time, since there are a lot of hyperparameters to investigate. The search space can be manually reduced to speed up the process.

# Create empty list to store model scores
parameter_scores = []

# Define target and explanatory variables.
# Only non-categorical / non-object columns are usable as features. The target
# column is numeric too, so it has to be removed explicitly; otherwise the
# feature search can pick `next_prevalence` itself as a predictor (target
# leakage) and report a meaninglessly low error.
X = df.select_dtypes(exclude=["category", "object"]).drop(columns=["next_prevalence"], errors="ignore")
y = df['next_prevalence'].values

In [None]:
X.head()

In [None]:
y

In [None]:


# Expanding-window CV folds as (train_end, test_end) row offsets into X / y.
# Each fold trains on rows [0, train_end) and tests on rows [train_end, test_end).
# The 99 and 132 refer to the first 3 and first 4 observations for the
# 33 districts in the data.
cv_folds = ((99, 132), (132, 165))

# The candidate feature sets do not depend on the hyperparameters, so they are
# enumerated once instead of once per (num_trees, depth) combination.
feature_subsets = subsets(X.columns)

# NOTE: range() excludes its upper bound, so num_trees_max and depth_max
# themselves are not tried.
for num_trees in range(num_trees_min, num_trees_max):

    for depth in range(depth_min, depth_max):

        # Investigate every candidate set of explanatory variables
        # noinspection PyTypeChecker
        for features in tqdm(feature_subsets):
            fold_maes = []

            for train_end, test_end in cv_folds:
                Xtrain = X[:train_end][features].values
                ytrain = y[:train_end]
                Xtest = X[train_end:test_end][features].values
                ytest = y[train_end:test_end]

                # Create a RandomForestRegressor with the selected hyperparameters and random state 0.
                clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0, n_jobs=-1)

                # Fit to the training data
                clf.fit(Xtrain, ytrain)

                # Make a prediction on the test data
                predictions = clf.predict(Xtest)

                # Calculate mean absolute error for this fold
                fold_maes.append(mean_absolute_error(ytest, predictions))

            # Calculate the mean MAE over the folds
            mean_MAE = sum(fold_maes) / len(fold_maes)

            # Store the mean MAE together with the used hyperparameters in list
            parameter_scores.append((mean_MAE, num_trees, depth, features))

# Sort the models based on score (lowest mean MAE first; the sort is stable, so
# ties keep their search order) and retrieve the hyperparameters of the best model
parameter_scores.sort(key=lambda x: x[0])
best_model_score, best_model_trees, best_model_depth, best_features = parameter_scores[0]
best_model_columns = list(best_features)

'''------------SECTION FINAL EVALUATION--------------'''
# Rebuild the design matrix from the winning feature set. Selecting with a
# list of columns always yields a 2-D array, even for a single feature, so no
# reshape is required before fitting.
# NOTE: this rebinds X from a DataFrame to a NumPy array.
X = df[best_model_columns].values
y = df['next_prevalence'].values

# Perform the final evaluation: fit on the first `train_size` rows and test on
# all remaining rows.
train_size = 300
Xtrain, ytrain = X[:train_size], y[:train_size]
Xtest, ytest = X[train_size:], y[train_size:]

clf = RandomForestRegressor(n_estimators=best_model_trees, max_depth=best_model_depth, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)
predictions = clf.predict(Xtest)

# Number of held-out rows that were predicted
len(predictions)

## Fix `prevalence` on line 5 in the next code block

In [None]:
# Calculate MAE
MAE = mean_absolute_error(ytest, predictions)

# Current prevalence of the held-out rows only. The predictions cover the tail
# of df, so the slice is aligned on their length rather than on a repeated
# hard-coded split index.
current_prevalence = df['GAM Prevalence'].values[len(df) - len(predictions):]

# Generate boolean values for increase or decrease in prevalence. 0 if next prevalence is smaller than current prevalence, 1 otherwise.
# The comparison must be row by row: zip() pairs each (actual or predicted)
# next prevalence with the current prevalence of the same row. Two nested
# `for` clauses would instead compare every value with every row and produce
# lists of mismatched length.
increase = [0 if nxt < cur else 1 for nxt, cur in zip(ytest, current_prevalence)]
predicted_increase = [0 if pred < cur else 1 for pred, cur in zip(predictions, current_prevalence)]


In [None]:
# Accuracy of the predicted increase/decrease labels against the actual ones
acc = accuracy_score(increase, predicted_increase)

# Report the selected hyperparameters and feature set
print(f'no. of trees: {best_model_trees}\nmax_depth: {best_model_depth}\ncolumns: {best_model_columns}')

# Report the model scores: MAE first, then direction accuracy
print(MAE, acc)

# Save model

In [None]:
# Persist the fitted final model (clf) to disk with joblib. The path is
# relative, so the file is written to the current working directory.
filename = 'baseline_semiyearly_model.joblib'
joblib.dump(clf, filename)

# Metrics

In [None]:
from helper_metrics import make_confusion_matrix, roc_curve_gen, calculate_results

In [None]:
# Confusion matrix of actual vs. predicted increase/decrease labels.
# NOTE(review): the labels are encoded as 0 = decrease, 1 = increase. If
# make_confusion_matrix maps classes[i] to label i, this list is in the wrong
# order and should be ["Decrease", "Increase"] -- confirm against helper_metrics.
make_confusion_matrix(increase, predicted_increase,figsize=(6,6),classes=["Increase", "Decrease"])

In [None]:
# NOTE(review): predicted_increase holds hard 0/1 labels, not scores or
# probabilities. Presumably roc_curve_gen expects (y_true, y_score); if so the
# resulting curve has only a single operating point -- confirm against
# helper_metrics.
roc_curve_gen(increase, predicted_increase)

In [None]:
# Summary metrics for the increase/decrease labels. average="weighted" is
# forwarded to the helper -- presumably the scikit-learn averaging mode;
# confirm in helper_metrics.
calculate_results(increase, predicted_increase, average="weighted")