In [9]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error, r2_score #, root_mean_squared_error

In [2]:
# Read the modelling table from disk and show the dtype pandas inferred
# for each column.
DATA_PATH = "data/rf_data.csv"
df = pd.read_csv(DATA_PATH)
df.dtypes

country                       object
admin1                        object
week_start                    object
fatalities                   float64
population                   float64
count_battles                float64
count_protests               float64
count_riots                  float64
count_explosions             float64
count_civ_violence           float64
fatalities_t+26              float64
fatalities_lag_2             float64
count_battles_lag_2          float64
count_protests_lag_2         float64
count_riots_lag_2            float64
count_explosions_lag_2       float64
count_civ_violence_lag_2     float64
fatalities_lag_4             float64
count_battles_lag_4          float64
count_protests_lag_4         float64
count_riots_lag_4            float64
count_explosions_lag_4       float64
count_civ_violence_lag_4     float64
fatalities_lag_12            float64
count_battles_lag_12         float64
count_protests_lag_12        float64
count_riots_lag_12           float64
c

In [3]:
# Parse the week_start strings (YYYY-MM-DD) into datetime values.
df = df.assign(week_start=pd.to_datetime(df['week_start'], format='%Y-%m-%d'))

# Downcast every float64 column to float32.
float64_cols = df.select_dtypes(include='float64').columns
df = df.astype({col_name: 'float32' for col_name in float64_cols})

In [4]:
# Replace every missing value with 0 before encoding.
df = df.fillna(0)

# One-hot encode the two categorical columns: country and admin1.
dummies_df = pd.get_dummies(df, columns=['country', 'admin1'])

# Most recent week present in the data.
last_date = dummies_df['week_start'].max()

# Cut-off date: six calendar months before the most recent week.
six_months_ago = last_date - relativedelta(months=6)

# Chronological split on the cut-off:
#   train -> rows strictly before the cut-off
#   test  -> rows on or after the cut-off
is_before_cutoff = dummies_df['week_start'] < six_months_ago
is_from_cutoff = dummies_df['week_start'] >= six_months_ago
train = dummies_df.loc[is_before_cutoff]
test = dummies_df.loc[is_from_cutoff]

train.head()

Unnamed: 0,week_start,fatalities,population,count_battles,count_protests,count_riots,count_explosions,count_civ_violence,fatalities_t+26,fatalities_lag_2,...,admin1_Zinder,admin1_Zlatibor,admin1_Zlin,admin1_Zombo,admin1_Zonguldak,admin1_Zou,admin1_Zug,admin1_Zuid-Holland,admin1_Zulia,admin1_Zurich
0,2016-01-04,0.0,14689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-11,0.0,14689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-18,0.0,14689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-25,0.0,14689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2016-02-01,0.0,14689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Percentage of rows that have a population of 0. Open question: why is the
# population 0? (some values are missing)
# NOTE(review): 4306 is a hard-coded row count, presumably taken from an
# earlier manual check -- TODO confirm it still matches the loaded data.
zero_population_rows = 4306
zero_population_rows / len(df) * 100

0.2832775462085807

In [None]:
# Baseline Random Forest predicting the current week's `fatalities`.
#
# Columns that must NOT be fed to the model as predictors:
#   - week_start      : timestamp, only used for the chronological split
#   - fatalities      : the target itself
#   - fatalities_t+26 : NOTE(review): judging by its name this is the
#                       fatality count 26 steps ahead of the row, i.e.
#                       information from the future. Keeping it as a feature
#                       leaks look-ahead data into training, and any values
#                       that were missing were replaced by 0 upstream
#                       (fillna), so the feature would not mean the same
#                       thing in train and test. Confirm against the
#                       data-prep step.
non_feature_cols = ['week_start', 'fatalities', 'fatalities_t+26']

# Fatalities training data
X_train = train.drop(columns=non_feature_cols)
y_train = train['fatalities']

# Fit the Random Forest model (fixed random_state so reruns are repeatable)
regr = RandomForestRegressor(max_depth=10, random_state=9) # <-- hyperparameter tuning "if needed"
regr.fit(X_train, y_train)

# Fatalities testing data (same excluded columns as for training, so the
# feature matrices line up column for column)
X_test = test.drop(columns=non_feature_cols)
y_test = test['fatalities']

# Predict the most recent 6 months
y_pred_fatalities = regr.predict(X_test)

# Calculate MAE and R^2 (RMSE is not computed in this cell)
mae_fatalities = round(mean_absolute_error(y_test, y_pred_fatalities),2)
r_sq_fatalities = round(r2_score(y_test, y_pred_fatalities),2)

print(f'The MAE fatalities for the baseline Random Forest is {mae_fatalities}, and the R^2  is {r_sq_fatalities}')