# Final Random Forest Tuned Fast Model

This notebook trains the selected Random Forest Tuned Fast model on the synthetic dataset — fitting on an 80% training split and evaluating on the held-out 20% — and saves all the artifacts needed for deployment.  
The focus is on having a ready-to-use model for the Streamlit app, along with the fitted preprocessing information.


In [None]:
# final_model_rf_tuned_fast_complete.py

# --- 0) Mount Google Drive ---
# Requires the Google Colab runtime. The dataset path and every artifact path
# used later in this script live under /content/drive, so the mount has to
# happen before anything is read or written.
from google.colab import drive
drive.mount('/content/drive')

# --- 1) Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# --- 2) Safe MAPE function ---
def mean_absolute_percentage_error_safe(y_true, y_pred, *, eps=0.0):
    """Return the MAPE, in percent, skipping entries whose true value is ~zero.

    Entries with ``-eps <= y_true <= eps`` are left out of the average, since
    dividing by a zero or near-zero true value makes the percentage error
    explode and dominate the mean. With the default ``eps=0.0`` only exact
    zeros are skipped, which matches the historical behaviour of this helper.

    Args:
        y_true: Array-like of ground-truth values (any shape).
        y_pred: Array-like of predictions, same shape as ``y_true``.
        eps: Non-negative magnitude threshold below which (inclusive) a true
            value is treated as zero and excluded.

    Returns:
        The mean absolute percentage error over the remaining entries, or
        ``np.nan`` when no entry remains (including empty input).

    Raises:
        ValueError: If ``eps`` is negative.
    """
    if eps < 0:
        raise ValueError("eps must be non-negative")

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Negating the "inside the band" test (instead of testing |y| > eps) keeps
    # NaN targets in the selection, so a NaN still propagates to the result
    # rather than being silently dropped.
    keep = ~((y_true >= -eps) & (y_true <= eps))
    if not keep.any():
        return np.nan

    kept_true = y_true[keep]
    return np.mean(np.abs((kept_true - y_pred[keep]) / kept_true)) * 100

# --- 3) Load dataset ---
# The CSV is read from the mounted Google Drive.
csv_path = "/content/drive/MyDrive/PORTFOLIO/Proyectos/App_Predicción_Cargas_Sesion/synthetic_full_dataset.csv"
df = pd.read_csv(csv_path)

# --- 4) Define features and targets ---
# Inputs handed to the one-hot encoder in the preprocessing step.
categorical_features = ["TaskType", "Team"]

# Inputs handed to the scaler in the preprocessing step.
numerical_features = [
    "Length (m)",
    "Width (m)",
    "Duration (min)",
    "Players_Team1",
    "Players_Team2",
    "Jokers",
    "Goalkeepers",
    "Total_Players",
    "Density (m2/player)",
]

# Targets, grouped by the prefix of their column name. The concatenation
# order below is the column order of `y` and therefore of the predictions.
_distance_targets = [
    "Distance - Distance (m)",
    "Distance - Abs HSR (m)",
    "Distance - HSR Rel (m/min)",
    "Distance - Abs HSR (% of distance)",
    "Distance - HIA",
    "Distance - HIBD (m)",
]
_acceleration_targets = [
    "Accelerations - High Intensity Acc Abs (count)",
    "Accelerations - High Intensity Acc Abs (m)",
    "Accelerations - High Intensity Dec Abs (count)",
    "Accelerations - High Intensity Dec Abs (m)",
    "Accelerations - Max Acceleration (m/s²)",
    "Accelerations - Max Deceleration (m/s²)",
]
_speed_zone_targets = [
    "Speed Zones (m) [0.0, 6.0]",
    "Speed Zones (m) [6.0, 12.0]",
    "Speed Zones (m) [12.0, 18.0]",
    "Speed Zones (m) [18.0, 21.0]",
    "Speed Zones (m) [21.0, 24.0]",
    "Speed Zones (m) [24.0, 50.0]",
]
output_columns = _distance_targets + _acceleration_targets + _speed_zone_targets

# Columns removed before modelling (neither inputs nor targets).
drop_cols = ["Date", "DayLabel", "Player", "iso_week", "weekday"]
df = df.drop(drop_cols, axis="columns")

# Every remaining non-target column, in CSV order.
# NOTE(review): this list is derived from the CSV, whereas the preprocessing
# step selects only numerical_features + categorical_features — confirm both
# describe the same set of columns.
_target_names = set(output_columns)
feature_columns = [name for name in df.columns if name not in _target_names]

# --- 5) Split data ---
X, y = df[feature_columns], df[output_columns]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 6) Preprocessing pipeline ---
# Numeric inputs are standardised; categorical inputs are one-hot encoded with
# handle_unknown='ignore' so categories not seen during fitting do not raise.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only; the test split reuses the fitted transformer.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

# Save preprocessor & columns
# Each artifact is written next to the dataset on Google Drive.
for artifact, target_path in (
    (preprocessor, '/content/drive/MyDrive/PORTFOLIO/Proyectos/App_Predicción_Cargas_Sesion/preprocessor.pkl'),
    (feature_columns, '/content/drive/MyDrive/PORTFOLIO/Proyectos/App_Predicción_Cargas_Sesion/feature_columns.pkl'),
    (output_columns, '/content/drive/MyDrive/PORTFOLIO/Proyectos/App_Predicción_Cargas_Sesion/output_columns.pkl'),
):
    joblib.dump(artifact, target_path)

# --- 7) Train final Random Forest Tuned Fast model ---
# Hyper-parameters of the selected configuration; random_state is fixed so the
# fit is reproducible and n_jobs=-1 lets scikit-learn use all available cores.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,
    "n_jobs": -1,
}

# NOTE(review): the model that gets persisted is fit on the training split
# (X_train_prepared / y_train), not on every row of the dataset.
final_rf_model = RandomForestRegressor(**rf_params).fit(X_train_prepared, y_train)

# Persist the fitted model next to the other artifacts on Google Drive.
joblib.dump(
    final_rf_model,
    '/content/drive/MyDrive/PORTFOLIO/Proyectos/App_Predicción_Cargas_Sesion/final_rf_tuned_fast_model.pkl',
)

# --- 8) Evaluate model globally ---
def _regression_scores(truth, predicted):
    """Return (MAE, RMSE, R2, safe MAPE) for the given truths and predictions."""
    return (
        mean_absolute_error(truth, predicted),
        np.sqrt(mean_squared_error(truth, predicted)),
        r2_score(truth, predicted),
        mean_absolute_percentage_error_safe(truth, predicted),
    )


def _print_scores(mae_value, rmse_value, r2_value, mape_value):
    """Print the four scores, one per line, with four decimals."""
    print(f"MAE: {mae_value:.4f}")
    print(f"RMSE: {rmse_value:.4f}")
    print(f"R2: {r2_value:.4f}")
    print(f"MAPE (safe): {mape_value:.4f}%")


y_pred = final_rf_model.predict(X_test_prepared)

# All targets scored together: the scikit-learn metrics aggregate across the
# target columns, and the safe MAPE is pooled over every non-zero entry.
# NOTE(review): the recorded run shows a pooled MAPE in the tens of millions
# of percent, which suggests some targets have true values very close to
# zero — the per-target figures below are the more informative ones.
mae, rmse, r2, mape = _regression_scores(y_test, y_pred)

print("\n✅ Global Evaluation:")
_print_scores(mae, rmse, r2, mape)

# --- 9) Evaluate model per target variable ---
print("\n✅ Evaluation per target variable:")
for i, target in enumerate(output_columns):
    # Column i of the predictions corresponds to column i of y_test.
    y_true_col = y_test.iloc[:, i]
    y_pred_col = y_pred[:, i]

    mae_col, rmse_col, r2_col, mape_col = _regression_scores(y_true_col, y_pred_col)

    print(f"\nTarget: {target}")
    _print_scores(mae_col, rmse_col, r2_col, mape_col)


Mounted at /content/drive

✅ Global Evaluation:
MAE: 21.2028
RMSE: 115.1592
R2: 0.8426
MAPE (safe): 74270331.8740%

✅ Evaluation per target variable:

Target: Distance - Distance (m)
MAE: 113.9137
RMSE: 289.5744
R2: 0.9501
MAPE (safe): 12.7894%

Target: Distance - Abs HSR (m)
MAE: 76.8803
RMSE: 245.4283
R2: 0.9395
MAPE (safe): 14.1515%

Target: Distance - HSR Rel (m/min)
MAE: 3.4521
RMSE: 6.1226
R2: 0.9473
MAPE (safe): 13.6593%

Target: Distance - Abs HSR (% of distance)
MAE: 5.3339
RMSE: 7.5976
R2: 0.8916
MAPE (safe): 13.3002%

Target: Distance - HIA
MAE: 0.7925
RMSE: 1.2565
R2: 0.9294
MAPE (safe): 15.6730%

Target: Distance - HIBD (m)
MAE: 0.6460
RMSE: 1.0588
R2: 0.9292
MAPE (safe): 18.9080%

Target: Accelerations - High Intensity Acc Abs (count)
MAE: 0.7972
RMSE: 1.2570
R2: 0.9292
MAPE (safe): 15.8435%

Target: Accelerations - High Intensity Acc Abs (m)
MAE: 0.6344
RMSE: 1.0287
R2: 0.9312
MAPE (safe): 18.1905%

Target: Accelerations - High Intensity Dec Abs (count)
MAE: 0.7924
RMSE:

## Saving Model and Preprocessing Artifacts

The following files are saved as `.pkl` for later use in the Streamlit app:

- `feature_columns.pkl`: list of input feature names used to train the model.
- `output_columns.pkl`: list of output target variables predicted by the model.
- `preprocessor.pkl`: fitted `ColumnTransformer` that applies scaling to numerical features and one-hot encoding to categorical features.
- `final_rf_tuned_fast_model.pkl`: trained Random Forest model.

Loading these artifacts lets the Streamlit app apply exactly the same fitted preprocessing to new user inputs as was used during training, so that the predictions it generates are consistent with the model evaluated above.

