# Random Forest

## 1. Setup & Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance

# Fix NumPy's global RNG state up front so later stochastic steps repeat
np.random.seed(42)

# Everything this notebook writes goes under <parent of cwd>/results/05_random_forest
BASE_DIR = os.path.split(os.getcwd())[0]   # parent directory
RESULTS_DIR = os.path.join(BASE_DIR, 'results', '05_random_forest')
os.makedirs(RESULTS_DIR, exist_ok=True)    # no error if the folder is already there
print("‚úÖ Results will be saved to: {}".format(RESULTS_DIR))

‚úÖ Results will be saved to: C:\Users\jumia\Downloads\BackTap\results\05_random_forest


## 2. Load Features & Dataset

In [8]:
def load_statistical_features(features_path='backtapbench_standard/features/statistical_features.csv'):
    """
    Load the statistical features extracted in Notebook 2/3.

    Parameters
    ----------
    features_path : str
        Location of the features CSV. The path is tried as given first; if it
        is relative and does not exist there, the same relative path is tried
        under the parent of the current working directory (the directory the
        notebook uses as its base for results).

    Returns
    -------
    X : pandas.DataFrame
        All columns except the identifier columns and the target column.
    y : pandas.Series
        The target column: 'label' when present, otherwise the last column
        that remains after the identifier columns have been removed.

    Raises
    ------
    FileNotFoundError
        If the CSV cannot be found in any of the locations tried.
    """
    resolved_path = features_path
    if not os.path.exists(resolved_path) and not os.path.isabs(features_path):
        # Fallback for notebooks started from a sub-folder of the project.
        parent_candidate = os.path.join(os.path.dirname(os.getcwd()), features_path)
        if os.path.exists(parent_candidate):
            resolved_path = parent_candidate

    if not os.path.exists(resolved_path):
        raise FileNotFoundError(f"Features file not found: {features_path}")

    df = pd.read_csv(resolved_path)
    print(f"‚úÖ Features loaded: {df.shape[0]} samples, {df.shape[1]} columns")

    # Drop identifier columns (not features) before deciding what the target is,
    # so that an identifier sitting in the last position is never mistaken for it.
    cols_to_drop = ['participant', 'grid_position', 'segment_id']
    available_cols = [c for c in cols_to_drop if c in df.columns]
    candidates = df.drop(columns=available_cols)

    if 'label' in candidates.columns:
        y = candidates['label']
        X = candidates.drop(columns=['label'])
    else:
        # No explicit 'label' column: assume the last remaining column is the target.
        y = candidates.iloc[:, -1]
        X = candidates.iloc[:, :-1]

    print(f"   Feature matrix shape: {X.shape}")
    print(f"   Number of features: {X.shape[1]}")
    print(f"   Target classes: {y.unique()}")

    return X, y

# Load the feature dataset from the loader's default CSV path.
# X_features (feature matrix) and y_features (labels) are the inputs of the
# label-encoding / train-test split step that follows.
X_features, y_features = load_statistical_features()

FileNotFoundError: Features file not found: backtapbench_standard/features/statistical_features.csv

## 3. Train/Test Split

In [11]:
# Turn the class labels into integer codes; `le` is kept so the codes can be
# mapped back to the original class names later on.
le = LabelEncoder()
le.fit(y_features)
y_encoded = le.transform(y_features)

# Stratified hold-out split: 20% of the samples go to the test set, seed 42
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

# Summary of the resulting partition
print(
    "\nüìä Data split:",
    f"   Training samples: {X_train.shape[0]}",
    f"   Test samples: {X_test.shape[0]}",
    f"   Feature dimension: {X_train.shape[1]}",
    sep="\n",
)

NameError: name 'y_features' is not defined

## 4. Baseline Random Forest (Default Parameters)

In [14]:
# Quick reference point before any tuning: a 100-tree forest with otherwise
# default settings. The timer covers construction plus fitting.
start_time = time.time()

rf_baseline = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

train_time = time.time() - start_time

# Score the baseline on the held-out test set
y_pred_baseline = rf_baseline.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred_baseline)

print(
    "\nüöÄ Baseline Random Forest (default params)",
    f"   Training time: {train_time:.2f} seconds",
    f"   Test accuracy: {accuracy_baseline*100:.2f}%",
    sep="\n",
)

NameError: name 'X_train' is not defined

## 5. Hyperparameter Tuning with GridSearchCV

In [None]:
# Search space (kept small for reasonable runtime): 2 * 3 * 2 * 2 * 2 = 48 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search, scored by 5-fold cross-validated accuracy on the training set
print("üîç Starting hyperparameter tuning...")
grid_start = time.time()

rf_grid = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf_grid,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

grid_time = time.time() - grid_start

# Report the winning configuration and its mean cross-validation score
print(
    f"\n‚úÖ Grid search completed in {grid_time:.2f} seconds",
    f"   Best parameters: {grid_search.best_params_}",
    f"   Best cross-validation accuracy: {grid_search.best_score_*100:.2f}%",
    sep="\n",
)

## 6. Evaluate Optimized Random Forest

In [None]:
# Best model from grid search
best_rf = grid_search.best_estimator_

# Predictions on test set
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nüéØ Optimized Random Forest Performance")
print(f"   Test accuracy: {accuracy*100:.2f}%")

# Build the report once and reuse it for both the display and the saved file.
# - Class names are cast to str: classification_report measures the length of
#   each name, which fails for non-string labels (e.g. integer classes).
# - `labels` pins the report rows to every encoded class (0..n_classes-1), so
#   a class missing from the test fold cannot make the number of rows
#   disagree with the number of target names.
class_names = [str(name) for name in le.classes_]
report_text = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
)

# Classification report
print("\nüìã Classification Report:")
print(report_text)

# Save classification report as text (explicit encoding so non-ASCII class
# names do not depend on the platform default)
with open(os.path.join(RESULTS_DIR, 'classification_report.txt'), 'w', encoding='utf-8') as f:
    f.write(report_text)

## 7. Confusion Matrix & Per-Class Accuracy