<a href="https://colab.research.google.com/github/MatP-DS/MasterThesis/blob/main/lightgbm_train_test4ds2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from lightgbm import LGBMClassifier, early_stopping
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, f1_score, recall_score,
                            precision_score, precision_recall_curve,
                            average_precision_score, confusion_matrix)
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
from google.colab import drive
import psutil
import time
import json
import os

# Install required packages for Colab
!pip install lightgbm scikit-learn psutil

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load data: feature matrix `X` and label vector `y` from the preprocessed .npz archives.
# np.load on an .npz file returns an NpzFile that keeps the archive open, so each
# archive is opened in a `with` block and closed as soon as its array has been read.
DATA_DIR = "/content/drive/MyDrive/MasterThesis/02_preprocessed_data"

with np.load(os.path.join(DATA_DIR, "X_flattened_final.npz")) as features_archive:
    X = features_archive["X"]
with np.load(os.path.join(DATA_DIR, "y_labels_final.npz")) as labels_archive:
    y = labels_archive["y"]


In [None]:

# Split data with stratification on `y`; 20% is held out as the test set and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Compute balanced class weight: negatives (label 0) per positive (label 1) in the
# training split. Counts are converted to plain ints so `pos_weight` is a Python float.
n_negative = int(np.count_nonzero(y_train == 0))
n_positive = int(np.count_nonzero(y_train == 1))
if n_positive == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError(
        "y_train contains no positive samples (label 1); cannot compute pos_weight"
    )
pos_weight = n_negative / n_positive

# Enhanced parameter grid: candidate values per LightGBM hyperparameter
# (4 * 4 * 3 * 3 * 3 * 2 * 3 * 3 = 7,776 combinations in total).
# NOTE(review): the search loop is not visible in this cell — presumably it samples
# a subset of these combinations; confirm.
param_grid = {
    'num_leaves': [15, 31, 63, 127],
    'min_data_in_leaf': [20, 50, 100, 200],
    'max_depth': [5, 7, -1],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
    'min_split_gain': [0, 0.1],
    'feature_fraction': [0.8, 0.9, 1.0],
    'bagging_fraction': [0.8, 0.9, 1.0]
}

# 17 evenly spaced values from 0.1 to 0.9 (step 0.05).
# NOTE(review): presumably candidate decision thresholds; their use is not shown here.
thresholds = np.linspace(0.1, 0.9, 17)

def find_optimal_threshold(y_true, y_prob):
    """Pick the decision threshold with the highest F2 score on the PR curve.

    Args:
        y_true: Ground-truth labels, forwarded to ``precision_recall_curve``.
        y_prob: Predicted scores for the positive class, forwarded likewise.

    Returns:
        Tuple ``(threshold, precision, recall)`` read at the index where the
        F2 score computed from the curve is largest.
    """
    pr_precision, pr_recall, pr_thresholds = precision_recall_curve(y_true, y_prob)

    # F-beta with beta = 2: (1 + 4) * P * R / (4 * P + R). The 1e-9 term keeps the
    # division defined where precision and recall are both zero.
    numerator = 5 * pr_precision * pr_recall
    denominator = 4 * pr_precision + pr_recall + 1e-9
    best = np.argmax(numerator / denominator)

    return pr_thresholds[best], pr_precision[best], pr_recall[best]

# Colab-compatible system monitoring
def get_system_metrics():
    """Take a snapshot of CPU, RAM and (when available) GPU utilisation.

    Returns:
        dict with three keys:
            'cpu_usage'   -- value returned by ``psutil.cpu_percent()``.
            'ram_used_gb' -- ``psutil.virtual_memory().used`` divided by 1024**3.
            'gpu_usage'   -- ``load`` of the first GPU reported by GPUtil, or
                             None when GPUtil is missing, reports no GPU, or fails.
    """
    metrics = {
        'cpu_usage': psutil.cpu_percent(),
        'ram_used_gb': psutil.virtual_memory().used / (1024**3),
        'gpu_usage': None
    }

    # Try to get GPU info if available. GPUtil is an optional dependency, hence the
    # function-scope import.
    try:
        import GPUtil
        gpus = GPUtil.getGPUs()
        if gpus:
            metrics['gpu_usage'] = gpus[0].load
    except Exception:
        # Best effort: any GPU probing failure leaves 'gpu_usage' as None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        pass

    return metrics

# Enhanced results logging
def save_full_results(model_name, model, params, metrics, cm, pr_curve,
                     system_metrics, training_time, folder):
    """Assemble the log record for a trained model (earlier draft of this helper).

    As written in this cell the function creates `folder`, builds the
    confusion-matrix DataFrame, the `log_dict` record and the `base_name` file
    stem, but writes no file and returns None.

    NOTE(review): the cell looks truncated here, and a later cell defines
    `save_full_results` again; once that cell has run, it replaces this version.
    NOTE(review): `metrics`, `params`, `training_time` and the system values are
    stored in `log_dict` unconverted (only the pr_curve arrays go through
    `.tolist()`); the TypeError about int64 in the cell output presumably comes
    from dumping such a record — confirm.

    Args:
        model_name: Stored under "model" and used as the prefix of `base_name`.
        model: Not used in the code shown here.
        params: Stored under "hyperparameters".
        metrics: Stored under "performance" -> "metrics".
        cm: Data for a DataFrame with two row labels and two column labels.
        pr_curve: Sequence whose items [0] (precision), [1] (recall) and,
            if present, [2] (thresholds) provide `.tolist()`.
        system_metrics: Mapping with keys 'cpu_usage', 'ram_used_gb', 'gpu_usage'.
        training_time: Stored under "system" -> "training_time_s".
        folder: Directory that is created if it does not exist.
    """
    # Second-resolution timestamp; two calls within the same second with the same
    # model_name yield the same base_name.
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs(folder, exist_ok=True)

    # Confusion matrix DataFrame
    cm_df = pd.DataFrame(cm,
                        index=["No Loss (0)", "Loss (1)"],
                        columns=["Predicted 0", "Predicted 1"])

    # Full log dictionary
    log_dict = {
        "model": model_name,
        "timestamp": timestamp,
        "performance": {
            "metrics": metrics,
            "confusion_matrix": cm_df.to_dict(),
            "pr_curve": {
                "precision": pr_curve[0].tolist(),
                "recall": pr_curve[1].tolist(),
                # thresholds are optional: None when pr_curve has only two entries
                "thresholds": pr_curve[2].tolist() if len(pr_curve) > 2 else None
            }
        },
        "system": {
            "training_time_s": training_time,
            "cpu_usage_percent": system_metrics['cpu_usage'],
            "ram_used_gb": system_metrics['ram_used_gb'],
            "gpu_usage": system_metrics['gpu_usage']
        },
        "hyperparameters": params
    }

    # Save all artifacts
    # (only the common file stem is computed here; no file is written in this version)
    base_name = f"{model_name}_{timestamp}"


Parameter Search: 100%|██████████| 20/20 [22:26<00:00, 67.31s/it]


TypeError: Object of type int64 is not JSON serializable

In [None]:
def _to_serializable(obj):
    """Recursively convert NumPy scalars and arrays to JSON-serializable Python objects.

    Dicts, lists and tuples are walked recursively (tuples become lists); NumPy
    scalars used as dict keys are converted too. Anything else is returned unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.generic):
        # Remaining NumPy scalar kinds (e.g. np.str_) map to their Python equivalent.
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): _to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [_to_serializable(item) for item in obj]
    return obj


def save_full_results(model_name, model, params, metrics, cm, pr_curve,
                     system_metrics, training_time, folder):
    """Write the artifacts of one trained model to `folder` and return their paths.

    Four files sharing the stem ``{model_name}_{timestamp}`` are created:
    a JSON log, a one-row metrics CSV, the confusion matrix as CSV and a
    precision-recall plot as PNG.

    Args:
        model_name: Label stored in the log and used as the file-name prefix.
        model: Accepted for interface compatibility; not used or persisted here.
        params: Hyperparameters, stored under "hyperparameters".
        metrics: Metric values, stored under "performance" -> "metrics" and
            written as the single row of the metrics CSV.
        cm: 2x2 confusion matrix (rows: true class 0/1, columns: predicted 0/1).
        pr_curve: Sequence ``(precision, recall[, thresholds])``.
        system_metrics: Mapping with keys 'cpu_usage', 'ram_used_gb', 'gpu_usage'.
        training_time: Training duration, stored under "training_time_s".
        folder: Output directory; created if missing.

    Returns:
        dict with keys "json", "csv", "cm" and "plot" mapping to the written paths.

    Raises:
        TypeError: if the log still contains a value `json` cannot serialize.
            Nothing is written to the JSON file in that case.
    """
    # Second-resolution timestamp: two calls within the same second with the same
    # model_name overwrite each other's files.
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs(folder, exist_ok=True)

    # Confusion matrix DataFrame
    cm_df = pd.DataFrame(cm,
                         index=["No Loss (0)", "Loss (1)"],
                         columns=["Predicted 0", "Predicted 1"])

    # Convert all data to serializable formats
    serializable_metrics = _to_serializable(metrics)
    serializable_params = _to_serializable(params)
    serializable_system_metrics = _to_serializable(system_metrics)
    serializable_pr_curve = _to_serializable(pr_curve)

    # Full log dictionary
    log_dict = {
        "model": model_name,
        "timestamp": timestamp,
        "performance": {
            "metrics": serializable_metrics,
            # Converted as well, so NumPy integers from the matrix cannot leak through.
            "confusion_matrix": _to_serializable(cm_df.to_dict()),
            "pr_curve": {
                "precision": serializable_pr_curve[0],
                "recall": serializable_pr_curve[1],
                # thresholds are optional: None when pr_curve has only two entries
                "thresholds": serializable_pr_curve[2] if len(serializable_pr_curve) > 2 else None
            }
        },
        "system": {
            "training_time_s": _to_serializable(training_time),
            "cpu_usage_percent": serializable_system_metrics['cpu_usage'],
            "ram_used_gb": serializable_system_metrics['ram_used_gb'],
            "gpu_usage": serializable_system_metrics['gpu_usage']
        },
        "hyperparameters": serializable_params
    }

    # Save all artifacts
    base_name = f"{model_name}_{timestamp}"

    # 1. Save JSON log. Serialize to a string first so a serialization error
    #    cannot leave a truncated JSON file behind.
    json_path = os.path.join(folder, f"{base_name}.json")
    json_text = json.dumps(log_dict, indent=4)
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_text)

    # 2. Save metrics CSV
    metrics_df = pd.DataFrame([serializable_metrics])
    csv_path = os.path.join(folder, f"{base_name}_metrics.csv")
    metrics_df.to_csv(csv_path, index=False)

    # 3. Save confusion matrix
    cm_path = os.path.join(folder, f"{base_name}_cm.csv")
    cm_df.to_csv(cm_path)

    # 4. Save PR curve plot (recall on x, precision on y). The figure is closed
    #    even if drawing or saving fails, so no figure stays open in the kernel.
    plot_path = os.path.join(folder, f"{base_name}_pr_curve.png")
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(serializable_pr_curve[1], serializable_pr_curve[0], label='LightGBM')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.legend()
        ax.grid(True)
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    return {
        "json": json_path,
        "csv": csv_path,
        "cm": cm_path,
        "plot": plot_path
    }

In [None]:
# Save final results
# Calls save_full_results for the selected run and keeps its return value in `results`.
# NOTE(review): best_model, best_params, best_metrics, best_cm and best_pr_curve are
# not defined in any cell shown here — presumably they come from the parameter
# search; confirm this cell still works after "Restart & Run All".
# `best_metrics` is passed whole as `metrics` and also supplies the "system_metrics"
# and "training_time_s" arguments, so those two values are handed over twice.
results = save_full_results(
    model_name="lightgbm_optimized",
    model=best_model,
    params=best_params,
    metrics=best_metrics,
    cm=best_cm,
    pr_curve=best_pr_curve,
    system_metrics=best_metrics["system_metrics"],
    training_time=best_metrics["training_time_s"],
    folder="/content/drive/MyDrive/MasterThesis/04_results/metrics"
)