<a href="https://colab.research.google.com/github/MatP-DS/MasterThesis/blob/main/XGBoost_train_test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports (re-run this cell first after a kernel reset)
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Load from Drive

from google.colab import drive
drive.mount('/content/drive')

# np.load on an .npz archive returns an NpzFile that keeps the underlying zip
# file open. Reading the array inside a `with` block closes the archive as soon
# as the array has been pulled into memory instead of leaking the file handle.
with np.load("/content/drive/MyDrive/MasterThesis/02_preprocessed_data/X_flattened_final.npz") as features_npz:
    X = features_npz["X"]
with np.load("/content/drive/MyDrive/MasterThesis/02_preprocessed_data/y_labels_final.npz") as labels_npz:
    y = labels_npz["y"]



Mounted at /content/drive


In [None]:
# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Ratio of class-0 to class-1 labels in the training split; passed to the
# classifier as scale_pos_weight to compensate for the class imbalance.
class_counts = Counter(y_train)
scale = class_counts[0] / class_counts[1]


In [None]:
# Define and train XGBoost model
model = xgb.XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=scale,  # class-0 / class-1 ratio computed from the training split
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    n_jobs=-1,
    verbosity=0,
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
    # may be ignored there; confirm against the installed version before removing.
    use_label_encoder=False,
    random_state=42
)

# Time the fit with perf_counter: it is monotonic and high-resolution, so the
# measured duration cannot be distorted by system clock adjustments the way a
# difference of time.time() readings can.
start = time.perf_counter()
model.fit(X_train, y_train)
train_time = time.perf_counter() - start


In [None]:
# Predict on the held-out split and collect the evaluation metrics
y_pred = model.predict(X_test)

# Scorers that are reported for the positive class (label 1)
positive_class_scorers = {
    "F1 (loss)": f1_score,
    "Recall (loss)": recall_score,
    "Precision (loss)": precision_score,
}

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    **{
        metric_name: scorer(y_test, y_pred, pos_label=1)
        for metric_name, scorer in positive_class_scorers.items()
    },
    "Training Time (s)": train_time,
}


In [None]:
# Confusion matrix
# Pin the label order to [0, 1] so the matrix is always 2x2 and its rows/columns
# line up with the hard-coded names below, even if the test split or the
# predictions happen to contain only one of the two classes.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["No Loss (0)", "Loss (1)"], columns=["Predicted 0", "Predicted 1"])

# Single-row frame holding the metrics computed in the evaluation cell
metrics_df = pd.DataFrame([metrics])

metrics_df, cm_df


(   Accuracy  F1 (loss)  Recall (loss)  Precision (loss)  Training Time (s)
 0  0.999647   0.891967        0.94152          0.847368         200.007808,
              Predicted 0  Predicted 1
 No Loss (0)       110210           29
 Loss (1)              10          161)

In [None]:
# Save metrics_df from your evaluation
# Create the target folder first so the export does not fail on a Drive where
# the results directory has not been created yet.
metrics_dir = '/content/drive/MyDrive/MasterThesis/04_results/metrics'
os.makedirs(metrics_dir, exist_ok=True)

metrics_df.to_csv(os.path.join(metrics_dir, 'xgboost_metrics.csv'), index=False)


# Confusion matrix (optional)
cm_df.to_csv(os.path.join(metrics_dir, 'xgboost_confusion_matrix.csv'))


In [None]:
def save_model_results_extended(model_name, model, metrics_df, cm_df=None, folder="/content/drive/MyDrive/MasterThesis/04_results/metrics"):
    """
    Persist one evaluation run: a metrics CSV, a JSON log, and optionally a confusion-matrix CSV.

    Every file is written into `folder` (created if missing) and shares the stem
    "{model_name}_metrics_{YYYYmmdd_HHMMSS}".

    NOTE: a later cell in this notebook defines a function with the same name;
    after running all cells, that later definition is the one in effect.

    Parameters:
    - model_name: str, used as file-name prefix and stored in the JSON log
    - model: trained model object; `.get_params()` is used when available,
      otherwise the instance `__dict__`
    - metrics_df: pd.DataFrame whose first row holds the evaluation metrics
    - cm_df: pd.DataFrame with the confusion matrix (optional)
    - folder: target folder path

    Returns:
    - dict with keys "csv", "json" and "confusion_matrix"; the last one is
      None when no confusion matrix was supplied
    """
    import os
    import json
    from datetime import datetime

    os.makedirs(folder, exist_ok=True)

    # One timestamp per call keeps all files of a run grouped under the same stem
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = os.path.join(folder, f"{model_name}_metrics_{timestamp}")

    written = {
        "csv": f"{stem}.csv",
        "json": f"{stem}.json",
        "confusion_matrix": None,
    }

    # Metrics table as CSV
    metrics_df.to_csv(written["csv"], index=False)

    # Hyperparameters: estimator API first, raw attributes as fallback
    # (e.g. native booster objects without get_params)
    try:
        params = model.get_params()
    except AttributeError:
        params = model.__dict__

    first_row = metrics_df.to_dict(orient="records")[0]

    # Values are stringified so that arbitrary parameter objects stay JSON-serialisable
    hyperparameters = {}
    for key, value in params.items():
        hyperparameters[key] = str(value)

    payload = dict(
        model_name=model_name,
        timestamp=timestamp,
        metrics=first_row,
        hyperparameters=hyperparameters,
    )

    # Full metadata log as JSON
    with open(written["json"], 'w') as f:
        json.dump(payload, f, indent=4)

    # Confusion matrix goes into its own CSV when provided
    if cm_df is not None:
        written["confusion_matrix"] = f"{stem}_confusion_matrix.csv"
        cm_df.to_csv(written["confusion_matrix"])

    return written


In [None]:
def save_model_results_extended(model_name, model, metrics_df, cm_df=None,
                              folder="/content/drive/MyDrive/MasterThesis/04_results/metrics"):
    """
    Write a timestamped JSON log plus CSV copies of one model evaluation.

    NOTE: this cell redefines `save_model_results_extended` from the preceding
    cell and therefore replaces it; the returned keys differ ("json_path",
    "csv_path", "cm_path").

    The JSON log contains the model name, timestamp, the five evaluation metrics,
    the confusion matrix (columns + data), a best-effort snapshot of CPU/RAM/GPU
    usage taken at the moment of saving, and the stringified hyperparameters.

    Parameters:
    - model_name: str, used as file-name prefix and stored under "model"
    - model: model object; `.get_params()` is used when available, otherwise the
      instance `__dict__` (same fallback as the earlier definition)
    - metrics_df: pd.DataFrame whose first row provides the columns "Accuracy",
      "F1 (loss)", "Recall (loss)", "Precision (loss)" and "Training Time (s)"
    - cm_df: pd.DataFrame with the confusion matrix (optional)
    - folder: target folder path, created if missing

    Returns:
    - dict with "json_path", "csv_path" and "cm_path" (None without cm_df)
    """
    import os
    import json
    from datetime import datetime

    os.makedirs(folder, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Convert confusion matrix if provided
    cm_dict = None
    if cm_df is not None:
        cm_dict = {
            "columns": cm_df.columns.tolist(),
            "data": cm_df.values.tolist()
        }

    # System metrics are auxiliary: a missing monitoring package must not
    # prevent the evaluation results from being saved.
    cpu_usage = None
    ram_used = None
    try:
        import psutil
    except ImportError:
        psutil = None
    if psutil is not None:
        # Without an interval, the first cpu_percent() call returns a meaningless
        # 0.0; sampling over a short window yields a real reading.
        cpu_usage = psutil.cpu_percent(interval=0.1)
        ram_used = psutil.virtual_memory().used / (1024**3)  # GB

    gpu_usage = None
    try:
        import GPUtil
        gpus = GPUtil.getGPUs()  # query once instead of twice
        if gpus:
            gpu_usage = gpus[0].load
    except Exception:
        # GPUtil not installed or the GPU query failed: keep None, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
        gpu_usage = None

    # Hyperparameters: estimator API first, raw attributes as fallback
    try:
        params = model.get_params()
    except AttributeError:
        params = getattr(model, "__dict__", {})

    # Create the comprehensive log dictionary
    log_dict = {
        "model": model_name,
        "timestamp": timestamp,
        "metrics": {
            "accuracy": float(metrics_df["Accuracy"].iloc[0]),
            "f1_loss": float(metrics_df["F1 (loss)"].iloc[0]),
            "recall_loss": float(metrics_df["Recall (loss)"].iloc[0]),
            "precision_loss": float(metrics_df["Precision (loss)"].iloc[0]),
            "training_time_s": float(metrics_df["Training Time (s)"].iloc[0])
        },
        "confusion_matrix": cm_dict,
        "system_metrics": {
            "cpu_usage_percent": cpu_usage,
            "ram_used_gb": ram_used,
            "gpu_usage": gpu_usage
        },
        # Stringify values so arbitrary parameter objects stay JSON-serialisable
        "hyperparameters": {k: str(v) for k, v in params.items()}
    }

    # Save JSON log
    json_path = os.path.join(folder, f"{model_name}_metrics_{timestamp}.json")
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump(log_dict, f, indent=4)

    # CSV copy of the metrics table
    csv_path = os.path.join(folder, f"{model_name}_metrics_{timestamp}.csv")
    metrics_df.to_csv(csv_path, index=False)

    # CSV copy of the confusion matrix, when provided
    cm_path = None
    if cm_df is not None:
        cm_path = os.path.join(folder, f"{model_name}_confusion_matrix_{timestamp}.csv")
        cm_df.to_csv(cm_path)

    return {
        "json_path": json_path,
        "csv_path": csv_path,
        "cm_path": cm_path
    }

In [None]:
# Log the XGBoost run through the helper and keep the returned file paths
results = save_model_results_extended(
    model_name="xgboost",
    model=model,
    metrics_df=metrics_df,
    cm_df=cm_df,
)
