In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer

# -------------------------
# 1. Load data
# -------------------------
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# -------------------------
# 2. Prepare features and labels
# -------------------------
# Exclude non-predictor columns: the row identifier and the target itself
exclude_columns = ['ID', 'SeriousDlqin2yrs']
features = train_df.columns.difference(exclude_columns)

# Separate features and labels
X = train_df[features]
y = train_df['SeriousDlqin2yrs']
X_test = test_df[features]

# Split into training and validation sets BEFORE imputing, so that the
# imputation statistics are learned from training rows only. Fitting the
# imputer on the full data set would leak validation information into the
# training features and make the validation metrics optimistic.
# The split itself is unchanged: it depends only on the row count, `y` and
# `random_state`, not on the feature values.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values with the per-column means of the training split
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Keep `X` bound to a fully imputed array (as it was before this change) for
# any later code that still uses it; it now uses the training-split means.
X = imputer.transform(X)

# -------------------------
# 3. Tune LightGBM
# -------------------------
# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations; class
# weighting is held fixed at 'balanced' for every candidate.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    learning_rate=[0.01, 0.05, 0.1],
    class_weight=['balanced'],
)

# Seeded, shuffled 5-fold stratified CV so the search is repeatable
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgbm = LGBMClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_cv,
    n_jobs=-1,
    verbose=1,
)

print("\nOptimizing LightGBM...")
grid_search.fit(X_train, y_train)

# Best model found by the search, plus its parameters and mean CV score
best_lgbm = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best AUC (CV): {grid_search.best_score_:.4f}")

# -------------------------
# 4. Evaluate on the training and validation sets
# -------------------------
def _print_split_metrics(title, model, X, y):
    """Print ROC AUC, accuracy, confusion matrix and report for one split.

    Args:
        title: Label used in the section header (e.g. "Training").
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix of the split.
        y: True labels of the split.
    """
    y_pred = model.predict(X)
    # Second column of predict_proba is used as the score for ROC AUC
    y_prob = model.predict_proba(X)[:, 1]

    print(f"\n{title} Metrics:")
    print(f"ROC AUC: {roc_auc_score(y, y_prob):.4f}")
    print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))
    print("\nClassification Report:")
    print(classification_report(y, y_pred))


def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Print classification metrics for the training and validation splits.

    For each split this prints the ROC AUC (computed from the predicted
    probabilities), the accuracy, the confusion matrix and the
    classification report (computed from the hard predictions). Comparing
    the two sections shows how far training and validation performance
    diverge.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        None. Results are written to standard output only.
    """
    _print_split_metrics("Training", model, X_train, y_train)
    _print_split_metrics("Validation", model, X_val, y_val)

evaluate_model(best_lgbm, X_train, y_train, X_val, y_val)

# -------------------------
# 5. Generate predictions for the test set
# -------------------------
# Keep only the second probability column as the submitted score
test_predictions = best_lgbm.predict_proba(X_test)[:, 1]

# Build the submission file: the test IDs alongside the predicted scores
submission = test_df[['ID']].assign(SeriousDlqin2yrs=test_predictions)
submission.to_csv('submission_LightGBM.csv', index=False)
print("\nArchivo generado: submission_LightGBM.csv")




Optimizing LightGBM...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Number of positive: 5587, number of negative: 78413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 84000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best parameters: {'class_weight': 'balanced', 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200}
Best AUC (CV): 0.8627

Training Metrics:
ROC AUC: 0.8781
Accuracy: 0.8021
Confusion Matrix:
[[62907 15506]
 [ 1121  4466]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88     78413
           1       0.22      0.80      0.35      5587

    acc