<a href="https://colab.research.google.com/github/MatP-DS/MasterThesis/blob/main/rf_train_test3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Colab-specific: mount Google Drive at /content/drive so the cells below can
# read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Load the flattened feature matrix ("X") and the labels ("y") from the
# preprocessed-data folder.
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open; the `with` blocks close it as soon as the array has been read.
# The array returned by indexing the archive is already an independent
# in-memory array, so no extra .copy() is needed.
DATA_DIR = "/content/drive/MyDrive/MasterThesis/02_preprocessed_data"

with np.load(f"{DATA_DIR}/X_flattened_final.npz") as features_npz:
    flat_data = features_npz["X"]
with np.load(f"{DATA_DIR}/y_labels_final.npz") as labels_npz:
    labels = labels_npz["y"]

# Fail early with a readable message if the two archives are out of sync.
assert len(flat_data) == len(labels), (
    f"feature/label count mismatch: {len(flat_data)} vs {len(labels)}"
)

# Train/test split with stratification: 20% of the samples are held out and
# the class proportions of `labels` are preserved in both parts. The fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    flat_data, labels, test_size=0.2, stratify=labels, random_state=42
)


In [None]:
# Model training (with timing).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time() it is not affected by system clock
# adjustments during the (long) fit.
start_time = time.perf_counter()
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, class_weight="balanced")
rf_model.fit(X_train, y_train)
train_duration = time.perf_counter() - start_time  # seconds spent in fit() only

# Predict on the held-out test set (not included in the timing above).
y_pred = rf_model.predict(X_test)


In [None]:
# Metrics: overall accuracy plus F1 / recall / precision for the positive
# class (label 1, reported as "loss"), and the training time measured earlier.
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "F1 (loss)": f1_score(y_test, y_pred, pos_label=1),
    "Recall (loss)": recall_score(y_test, y_pred, pos_label=1),
    "Precision (loss)": precision_score(y_test, y_pred, pos_label=1),
    "Training Time (s)": train_duration
}

# Confusion Matrix (rows = true class, columns = predicted class).
# labels=[0, 1] pins the row/column order and guarantees a 2x2 result, so the
# hard-coded index/column names below always line up with the data, even if
# one class happens to be absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["No Loss (0)", "Loss (1)"], columns=["Predicted 0", "Predicted 1"])

# One-row frame so the metrics can be displayed and saved as a table.
metrics_df = pd.DataFrame([metrics])
metrics_df, cm_df


(   Accuracy  F1 (loss)  Recall (loss)  Precision (loss)  Training Time (s)
 0   0.99962   0.869565       0.818713          0.927152         666.894618,
              Predicted 0  Predicted 1
 No Loss (0)       110228           11
 Loss (1)              31          140)

In [None]:
# Report how much system memory is currently in use, in decimal gigabytes.
import psutil

used_gb = psutil.virtual_memory().used / 1e9
print(f"Memory used: {used_gb:.2f} GB")

Memory used: 3.43 GB


In [None]:
def _json_fallback(value):
    """Convert a value the ``json`` encoder cannot handle into one it can.

    Objects exposing a callable ``tolist()`` (NumPy scalars and arrays do) are
    converted through it into native Python numbers/lists; anything else is
    stored as its ``str()`` representation.
    """
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(value)


def save_model_results_extended(model_name, model, metrics_df, cm_df=None, folder="/content/drive/MyDrive/MasterThesis/04_results/metrics"):
    """
    Save model evaluation results including metrics, hyperparameters, and confusion matrix.

    Everything that can fail without touching the disk (input validation,
    hyperparameter extraction, JSON serialisation) is done first, so a bad
    input never leaves a half-written set of result files behind.

    Parameters:
    - model_name: str, name of the model (e.g., "random_forest"); used as the file name prefix
    - model: trained model object with `.get_params()` or `.__dict__`
    - metrics_df: pd.DataFrame with one row of evaluation metrics; the whole
      frame goes to the CSV, only the first row goes into the JSON log
    - cm_df: pd.DataFrame with confusion matrix (optional)
    - folder: target folder path (created if it does not exist)

    Returns:
    - dict with keys "csv", "json" and "confusion_matrix" holding the written
      file paths; "confusion_matrix" is None when `cm_df` is not given

    Raises:
    - IndexError: if `metrics_df` has no rows (raised before any file is written)
    """
    import os
    import json
    from datetime import datetime

    # Validate first. IndexError is the exception type an empty frame has
    # always produced here; it is now raised up-front with a readable message.
    metric_rows = metrics_df.to_dict(orient="records")
    if not metric_rows:
        raise IndexError("metrics_df must contain at least one row of metrics")

    # Extract model parameters
    try:
        params = model.get_params()
    except AttributeError:
        # Not a scikit-learn style estimator: fall back to instance attributes.
        params = model.__dict__

    # Timestamp for versioned output
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_filename = f"{model_name}_metrics_{timestamp}"

    # Combine everything into one JSON log
    log_dict = {
        "model_name": model_name,
        "timestamp": timestamp,
        "metrics": metric_rows[0],
        # Hyperparameter values are stringified so arbitrary objects can be logged.
        "hyperparameters": {k: str(v) for k, v in params.items()}
    }
    # Serialise in memory before opening the file: a serialisation error can
    # then never leave a truncated JSON file on disk. The fallback handles
    # metric values that are not JSON-native.
    log_json = json.dumps(log_dict, indent=4, default=_json_fallback)

    os.makedirs(folder, exist_ok=True)

    # Save metrics CSV and JSON
    metrics_path_csv = os.path.join(folder, f"{base_filename}.csv")
    metrics_df.to_csv(metrics_path_csv, index=False)

    metrics_path_json = os.path.join(folder, f"{base_filename}.json")
    with open(metrics_path_json, 'w', encoding="utf-8") as f:
        f.write(log_json)

    # Save confusion matrix if provided
    cm_path_csv = None
    if cm_df is not None:
        cm_path_csv = os.path.join(folder, f"{base_filename}_confusion_matrix.csv")
        cm_df.to_csv(cm_path_csv)  # index kept: it carries the true-class labels

    return {
        "csv": metrics_path_csv,
        "json": metrics_path_json,
        "confusion_matrix": cm_path_csv
    }

# Persist the random-forest run; the returned dict of written file paths is
# the cell's displayed output.
save_model_results_extended(
    model_name="random_forest",
    model=rf_model,
    metrics_df=metrics_df,
    cm_df=cm_df,
)


{'csv': '/content/drive/MyDrive/MasterThesis/04_results/metrics/random_forest_metrics_20250623_235129.csv',
 'json': '/content/drive/MyDrive/MasterThesis/04_results/metrics/random_forest_metrics_20250623_235129.json',
 'confusion_matrix': '/content/drive/MyDrive/MasterThesis/04_results/metrics/random_forest_metrics_20250623_235129_confusion_matrix.csv'}