In [1]:
# Import libraries
import os
import time

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load in data
# The data directory can be overridden with the ACLED_DATA_DIR environment variable so
# the notebook is runnable on machines other than the one it was written on. When the
# variable is unset, the original location is used and `filepath` is unchanged.
data_dir = os.environ.get("ACLED_DATA_DIR", "C:/Users/benja/Documents/ACLED/processed_data")
# Strip a trailing separator from the override so the joined path has exactly one.
filepath = f"{data_dir.rstrip('/').rstrip(chr(92))}/rf_data.csv"
df = pd.read_csv(filepath)

In [3]:
# Parse the week_start strings (year-month-day) into datetimes
df['week_start'] = pd.to_datetime(df['week_start'], format='%Y-%m-%d')

# Store the two region identifier columns with the categorical dtype
for region_col in ('country', 'admin1'):
    df[region_col] = df[region_col].astype('category')

# Downcast every double-precision column to single precision
float64_cols = df.select_dtypes(include='float64').columns
df[float64_cols] = df[float64_cols].astype(np.float32)

In [4]:
# Replace missing values with 0, touching numeric columns only
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(value=0)

# One-hot encode the country and admin1 identifiers
dummies_df = pd.get_dummies(data=df, columns=['country', 'admin1'])

# Chronological hold-out: weeks within six months of the latest week form the test
# set, everything earlier is used for training
last_date = dummies_df['week_start'].max()
six_months_ago = last_date - relativedelta(months=6)
in_train_window = dummies_df['week_start'] < six_months_ago
in_test_window = dummies_df['week_start'] >= six_months_ago
train = dummies_df.loc[in_train_window]
test = dummies_df.loc[in_test_window]

In [5]:
### Shared setup for the random-forest runs

# Columns that are never used as model inputs
drop_cols = ['week_start']

# Outcome columns to predict: fatalities plus the per-event-type counts
target_vars = ['fatalities'] + [
    f'count_{event_type}'
    for event_type in ('battles', 'protests', 'riots', 'explosions', 'civ_violence')
]

# Accumulators: `results` collects prediction frames, `metrics` collects the
# evaluation rows for every target variable
results, metrics = [], []

In [6]:
### RF loop for all features of interest
# Fits one random forest per entry of `target_vars` on `train`, evaluates it on `test`,
# and writes the stacked predictions and the metric table to CSV in the working directory.

# Start from empty accumulators so that re-running this cell does not append a second
# copy of every target's predictions and metrics to lists left over from an earlier run.
results = []
metrics = []

# The feature matrices are identical for every target (all target columns are excluded
# from the inputs), so build them once instead of re-copying the frames each iteration.
non_feature_cols = drop_cols + target_vars
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

for target in target_vars:
    # perf_counter is monotonic, so the elapsed time is unaffected by system clock changes
    start = time.perf_counter()

    y_train = train[target]
    y_test = test[target]

    # Train RF
    # n_jobs=-1 uses all CPU cores; random_state is fixed for reproducible forests
    model = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=9, n_jobs=-1)
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Save predictions in long format: one row per test observation and target
    # NOTE(review): only week_start is kept as an identifier, so the saved rows do not
    # record which country/admin1 they belong to -- confirm that is intended.
    test_result = test[['week_start']].copy()
    test_result['true'] = y_test.values
    test_result['predicted'] = y_pred
    test_result['target'] = target
    results.append(test_result)

    metrics.append({
        'target_variable': target,
        'MAE': round(mae, 4),
        'RMSE': round(rmse, 4),
        'R2': round(r2, 4)
    })

    print(f"Trained and evaluated RF for {target} in {round(time.perf_counter() - start, 2)} seconds")

# Save
all_preds = pd.concat(results, axis=0)
all_preds.to_csv("rf_predictions_last_6mo.csv", index=False)

metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("rf_metrics_last_6mo.csv", index=False)

print("Saved all RF predictions and metrics")

Trained and evaluated RF for fatalities in 2895.23 seconds
Trained and evaluated RF for count_battles in 2868.17 seconds
Trained and evaluated RF for count_protests in 2814.78 seconds
Trained and evaluated RF for count_riots in 2971.94 seconds
Trained and evaluated RF for count_explosions in 2998.32 seconds
Trained and evaluated RF for count_civ_violence in 2826.41 seconds
Saved all RF predictions and metrics
