# Imports

In [None]:
"""------------SECTION IMPORTS---------------------"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib
import warnings
from tqdm import tqdm
import matplotlib.pyplot as plt
import shap
import datacompy
# Silence every warning category for the remainder of the session
warnings.filterwarnings("ignore")

# Hyperparameter search space for the random forest:
# (lower bound, upper bound) for the number of trees ...
num_trees_min, num_trees_max = 64, 128

# ... and for the maximum depth of each tree
depth_min, depth_max = 2, 7

# Load data

In [None]:
# Read the preprocessed dataset and discard the "Unnamed: 0" column
# (presumably a leftover index column from an earlier to_csv call — confirm)
df = pd.read_csv("data/no_missings_sy.csv").drop(columns="Unnamed: 0")

# Preview the first rows
df.head()

In [14]:
'''------------SECTION RANDOM FOREST CROSS VALIDATION--------------'''
# WARNING: exploring the full hyperparameter search space can take a long time;
# shrink the search space manually to speed things up.
# NOTE(review): the search loop itself is not part of this cell.

# Collector for per-configuration model scores
parameter_scores = []

# Explanatory variables: every non-categorical, non-object column except the
# target and the columns listed alongside it
excluded_columns = ["increase", "prevalence", "next_prevalence"]
non_text_columns = df.select_dtypes(exclude=["category", "object"])
X = non_text_columns.drop(columns=excluded_columns)

# Target variable as a plain array
y = df["next_prevalence"].values

# Explore differences in districts in baseline vs preprocessed data

In [15]:
baseline_data = pd.read_csv("data/baseline_data.csv")

# Sorted unique district identifiers of each dataset
baseline_districts = np.sort(baseline_data["district"].unique())
preprocessed_districts = np.sort(df["district"].unique())

# Districts that occur in the preprocessed data but not in the baseline data
new_districts = [
    district
    for district in preprocessed_districts
    if district not in baseline_districts
]

In [16]:
# Number of districts found only in the preprocessed data
len(new_districts)

36

In [17]:
import plotly.express as px
import geopandas as gpd

In [18]:
# Load a shapefile so that district borders can be shown on a map.
# NOTE(review): this comment used to say "London Boroughs", but the file is
# named 'SOM.shp' — confirm which region the shapefile actually covers.
LB_file = 'SOM.shp' # Replace this with local filepath/name
gdf = gpd.read_file(LB_file) # Read file into a geodataframe

# Plot only the boundaries of the geometries in the geodataframe

gdf.boundary.plot(figsize=(15,15));

ImportError: the 'read_file' function requires the 'fiona' package, but it is not installed or does not import correctly.
Importing fiona resulted in: DLL load failed while importing ogrext: The specified module could not be found.

# Load model

In [None]:
# Load the previously saved model from disk.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load model files from a trusted source.
loaded_model_sy = joblib.load("baseline_semiyearly_model.joblib")

In [None]:
# Display the loaded estimator (its repr shows the configured hyperparameters)
loaded_model_sy

# Evaluate model

In [None]:
# Row index at which the data is cut: rows before it are used for training,
# rows from it onwards for testing.
# NOTE(review): 345 is hard-coded and depends on the row order and size of
# the loaded dataset — confirm it still matches the data.
train_split = 345

In [None]:
# Positional hold-out split: the first `train_split` rows are used for
# training, everything after them for testing
Xtrain, Xtest = X[:train_split], X[train_split:]
ytrain, ytest = y[:train_split], y[train_split:]

In [None]:
# Fit the loaded estimator on the training rows, then predict the held-out rows.
# NOTE(review): fit() retrains the loaded model on Xtrain/ytrain, so whatever
# it had learned before being saved is not what gets evaluated below —
# presumably only its hyperparameters are meant to be reused; confirm.
loaded_model_sy.fit(Xtrain, ytrain)
predictions = loaded_model_sy.predict(Xtest)

In [None]:
# Horizontal bar chart of the fitted model's feature importances,
# one bar per column of X
fig, ax = plt.subplots(figsize=(10,6))
ax.barh(X.columns, loaded_model_sy.feature_importances_)
ax.set_title("Feature Importances on Preprocessed Data", size=14)
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
ax.legend(["Baseline Model"]);

In [None]:
# Explain the fitted tree model with SHAP: build a tree explainer, compute
# SHAP values for the test rows, and draw the summary plot for them.
explainer = shap.TreeExplainer(loaded_model_sy)
shap_values = explainer.shap_values(Xtest)
shap.summary_plot(shap_values, Xtest, plot_size=(10,7))

# Visualise predictions
Each data point on the graphs below is a single district, so these graphs are not informative.

In [None]:
# Sanity check: the number of true test values and of predictions should match
len(ytest), len(predictions)

In [None]:
from helper_metrics import plot_time_series

In [None]:
from datetime import datetime

# Convert every "YYYY-MM-DD" string in the date column into a datetime object
DATE_FORMAT = "%Y-%m-%d"
timesteps = [datetime.strptime(raw_date, DATE_FORMAT) for raw_date in df["date"]]

In [None]:
# Plot the true test targets against the model predictions on a shared axis.
plt.figure(figsize=(20,7))
plt.title("Prevalence estimates from all districts for semiyearly data from 2020-7 to 2021-1",size=16)
# Derive the x positions from the data instead of hard-coding 138, so the
# plot keeps working when the dataset or train_split changes.
plot_time_series(timesteps=np.arange(len(ytest)),values=ytest, label="Prevalence")
plot_time_series(timesteps=np.arange(len(predictions)),values=predictions, label="Predicted Prevalence")
# Eight ticks at 0, 20, ..., 140, labelled with the period of the rows.
# NOTE(review): the labels assume the test rows are ordered by date and that
# there are roughly 140 of them — confirm if the data changes.
plt.xticks(np.arange(0,160,step=20),labels=["2020-7", "2020-7", "2020-7", "2020-7", "2021-1","2021-1", "2021-1","2021-1"]);

# Naive Forecast

In [None]:
# Naive baseline: "predict" each test value with the value of the preceding
# test row, i.e. compare ytest[1:] against ytest[:-1].
# NOTE(review): if consecutive rows are different districts rather than
# consecutive time steps, this shift compares neighbouring districts; the
# `prevalence` column may be the intended persistence baseline for
# `next_prevalence` — confirm.
naive_forecast = ytest[:-1]
plt.figure(figsize=(20,7))
plt.title("Naive prevalence estimates from all districts for semiyearly data from 2020-7 to 2021-1",size=16)
# Derive the x positions from the data instead of hard-coding 137, so the
# plot keeps working when the size of the test set changes.
plot_time_series(timesteps=np.arange(len(ytest) - 1),values=ytest[1:], label="Prevalence")
plot_time_series(timesteps=np.arange(len(naive_forecast)),values=naive_forecast, label="Naive Forecast of Prevalence")
# Eight ticks at 0, 20, ..., 140, labelled with the period of the rows.
# NOTE(review): the labels assume the test rows are ordered by date — confirm.
plt.xticks(np.arange(0,160,step=20),labels=["2020-7", "2020-7", "2020-7", "2020-7", "2021-1","2021-1", "2021-1","2021-1"]);

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Mean absolute error of the naive forecast against the shifted test targets
mean_absolute_error(ytest[1:], naive_forecast)

In [None]:
# Mean squared error of the naive forecast against the shifted test targets
mean_squared_error(ytest[1:], naive_forecast)

In [None]:
# Root mean squared error: square root of the naive forecast's MSE
naive_mse = mean_squared_error(ytest[1:], naive_forecast)
np.sqrt(naive_mse)