# Random Forest

## Import Data

In [12]:
import numpy as np
import pandas as pd

train_df = pd.read_parquet("../data/train_non_lin_preprocessed.parquet")
print(train_df.shape)

X_train = train_df.drop(columns = ['target'])
y_train = train_df['target']

test_df = pd.read_parquet("../data/test_non_lin_preprocessed.parquet")
print(test_df.shape)

X_test = test_df.drop(columns = ['target'])
y_test = test_df['target']

(1076248, 80)
(269062, 80)


## Modelling

### Imbalanced Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
import matplotlib.pyplot as plt

target_col = "target"

X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

X_test = test_df.drop(columns=[target_col])
y_test = test_df[target_col]

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

print("Test ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Test ROC AUC: 0.7207610799635567
              precision    recall  f1-score   support

           0       0.81      0.99      0.89    215350
           1       0.58      0.06      0.12     53712

    accuracy                           0.80    269062
   macro avg       0.69      0.53      0.50    269062
weighted avg       0.76      0.80      0.74    269062



In [16]:
def plot_feature_importances(rf):

    importances = rf.feature_importances_
    feature_names = X_train.columns

    # Sort by importance
    sorted_idx = importances.argsort()[::-1]

    plt.figure(figsize=(10,5))
    plt.bar(range(len(importances)), importances[sorted_idx])
    plt.xticks(range(len(importances)), feature_names[sorted_idx], rotation=90)
    plt.title("Feature Importance")
    plt.show()

plot_feature_importances(rf_model)

NameError: name 'plt' is not defined

### Balanced Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import precision_recall_curve, auc

In [14]:
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

print("Test ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Test ROC AUC: 0.7181692997191498
              precision    recall  f1-score   support

           0       0.87      0.72      0.79    215350
           1       0.34      0.59      0.43     53712

    accuracy                           0.69    269062
   macro avg       0.61      0.65      0.61    269062
weighted avg       0.77      0.69      0.72    269062



In [17]:
plot_feature_importances(rf_model)

NameError: name 'plt' is not defined

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)

PR AUC: 0.3848263435995586


Note that this gives a similar ROC AUC, but is much better on the positive class.

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, auc

In [19]:
# rf_model = RandomForestClassifier(
#     n_estimators=25,
#     random_state=42,
#     n_jobs=-1,
#     class_weight='balanced'  # Handle imbalance
# )

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# param_grid = {
#     'max_depth': [10, 20],  # None = expand fully
#     'max_features': ['sqrt', 'log2'], # features per split
#     'min_samples_split': [5, 10]   # control overfitting
# }

# grid_search = GridSearchCV(
#     estimator=rf_model,
#     param_grid=param_grid,
#     scoring='roc_auc',  # optimize for AUC
#     cv=cv,
#     n_jobs=-1,
#     verbose=2
# )
# grid_search.fit(X_train, y_train)

# print("Best Parameters:", grid_search.best_params_)
# print("Best ROC AUC:", grid_search.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 5}
Best ROC AUC: 0.7165833109563081


In [None]:
# best_rf = grid_search.best_estimator_
# y_proba = best_rf.predict_proba(X_test)[:, 1]
# print("Test ROC AUC:", roc_auc_score(y_test, y_proba))

Test ROC AUC: 0.716935230809707


In [21]:
# plot_feature_importances(best_rf)

In [24]:
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)

PR AUC: 0.38296485791741053
