#  Notebook 02 - Classificazione binaria

Classificazione del rischio di insolvenza con modelli di Machine Learning.

Questo notebook utilizza il dataset pre-elaborato salvato in `application_train_cleaned.csv`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

import warnings
# Install a global "ignore" filter so no Python warning is printed from here on.
# NOTE(review): this blanket filter would also hide any convergence or deprecation
# warnings raised by the libraries used below — consider narrowing it by category.
warnings.filterwarnings('ignore')


In [None]:
# Load the pre-processed dataset and report its size
df = pd.read_csv("Dataset/application_train_cleaned.csv")
n_rows, n_cols = df.shape
print(f" Dataset caricato: {n_rows} righe, {n_cols} colonne")
df.head()


In [None]:
# Separate the label column from the feature matrix
y = df["TARGET"]
X = df.drop(columns="TARGET")

# Hold out 25% of the rows for testing; stratify on the label so both parts
# keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

print("✅ Split completato:")
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Reuse `df`, `X` and `y` built by the loading and split cells earlier in this
# notebook. Re-reading the same CSV here would only repeat the disk I/O and rebind
# those names to identical data, so no reload is performed in this cell.

# Ratio of negative to positive labels, passed to XGBoost as the weight of the
# positive class. It is computed on the training labels only so that the hold-out
# labels play no part in configuring the models. Assumes TARGET is coded 0/1.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Candidate models, keyed by display name. Every model is wrapped in a Pipeline so
# they can all be evaluated through the same interface; only Logistic Regression
# needs a scaling step. All four models apply a class-imbalance correction so that
# their scores are comparable.
pipelines = {
    "Random Forest": Pipeline([
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1))
    ]),
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
    ]),
    "LightGBM": Pipeline([
        # class_weight='balanced' keeps LightGBM consistent with the other models,
        # which all re-weight the classes
        ('classifier', LGBMClassifier(class_weight='balanced', random_state=42))
    ]),
    "XGBoost": Pipeline([
        ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',
                                    scale_pos_weight=neg_pos_ratio, n_jobs=-1))
    ]),
}

# Stratified 5-fold cross-validation of every candidate pipeline.
# The folds are drawn from the training split only (X_train / y_train): the rows
# held out by train_test_split must not take part in model comparison, otherwise
# any later evaluation on X_test would be optimistically biased.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, pipe in pipelines.items():
    print(f"Valutazione: {name}")
    scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring='f1_macro', n_jobs=-1)
    mean_f1, std_f1 = scores.mean(), scores.std()
    # One record per model; the frame is built once after the loop
    results.append({
        "Model": name,
        "F1 macro mean": mean_f1,
        "F1 macro std": std_f1
    })
    print(f"{name}: F1 macro (CV) = {mean_f1:.3f} ± {std_f1:.3f}")

# Summary table, best model first
df_results = pd.DataFrame(results)
print("\nCross-validation results:")
print(df_results.sort_values("F1 macro mean", ascending=False))

# Bar chart of the mean cross-validated F1 macro per model, with the standard
# deviation across folds drawn as error bars
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    df_results["Model"],
    df_results["F1 macro mean"],
    yerr=df_results["F1 macro std"],
    capsize=5,
    alpha=0.7,
    color='skyblue',
)
ax.set_title("F1 macro (mean ± std)", fontsize=14)
ax.set_ylabel("F1 macro", fontsize=12)
ax.set_ylim(0, 1)

# Write "mean±std" slightly above the top of each error bar
f1_means = df_results["F1 macro mean"]
f1_stds = df_results["F1 macro std"]
for idx in range(len(df_results)):
    mean, std = f1_means.iloc[idx], f1_stds.iloc[idx]
    ax.text(
        idx,
        mean + std + 0.02,
        f'{mean:.3f}±{std:.3f}',
        ha='center',
        va='bottom',
        fontsize=10,
    )

fig.tight_layout()
plt.show()



## 📌 Considerazioni iniziali

Abbiamo testato quattro modelli base:
- **Logistic Regression** con bilanciamento interno (`class_weight='balanced'`)
- **Random Forest** robusto e non sensibile a feature scaling
- **LightGBM** molto competitivo in letteratura
- **XGBoost** con bilanciamento delle classi tramite `scale_pos_weight`
