In [3]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.impute import SimpleImputer

# -------------------------
# 1. Load the data
# -------------------------
# Both splits are read from CSV files under data/.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# -------------------------
# 2. Build the feature matrices and the label vector
# -------------------------
# Every training column except the ID column and the target is a predictor.
# Index.difference also sorts the remaining names, so both splits end up
# with the same, deterministic column order.
exclude_columns = ['ID', 'SeriousDlqin2yrs']
features = train_df.columns.difference(exclude_columns)

# Labels come from the training split only; the same feature selection is
# applied to the training and the test frame.
y_train = train_df['SeriousDlqin2yrs']
X_train = train_df[features]
X_test = test_df[features]

# -------------------------
# 3. Build one pipeline per candidate model
# -------------------------
# Every pipeline names its final step 'model' so the hyperparameter grids can
# address each estimator uniformly through the 'model__<param>' prefix.
pipelines = {
    'LogisticRegression': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # fill missing values with the column mean
        ('scaler', StandardScaler()),  # standardize features before the linear model
        ('model', LogisticRegression(max_iter=1000, random_state=42))
    ]),
    'RandomForest': Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # fill missing values with the column mean
        ('model', RandomForestClassifier(random_state=42, class_weight='balanced'))
    ]),
    # The two boosted pipelines have no imputer step: XGBoost and LightGBM
    # accept NaN inputs directly.
    'XGBoost': Pipeline([
        # `use_label_encoder` is intentionally not passed: it is a deprecated
        # argument that current XGBoost releases ignore, emitting a
        # "Parameters: { "use_label_encoder" } are not used." warning on fit.
        # NOTE(review): unlike RandomForest/LightGBM, this model gets no class
        # re-weighting (e.g. scale_pos_weight) - confirm that is intended.
        ('model', XGBClassifier(eval_metric='logloss', random_state=42))
    ]),
    'LightGBM': Pipeline([
        ('model', LGBMClassifier(random_state=42, class_weight='balanced'))
    ]),
}

# -------------------------
# 4. Hyperparameter grids
# -------------------------
# One grid per pipeline name. Keys use the 'model__' prefix so they reach the
# estimator stored in each pipeline's 'model' step.
param_grids = {
    'LogisticRegression': dict(model__C=[0.01, 0.1, 1]),
    'RandomForest': dict(
        model__n_estimators=[100, 200],
        model__max_depth=[10, 15],
    ),
    # Both gradient-boosting models search the same space; the comprehension
    # gives each of them its own independent dict.
    **{
        booster: dict(
            model__n_estimators=[100, 200],
            model__max_depth=[4, 6],
            model__learning_rate=[0.05, 0.1],
        )
        for booster in ('XGBoost', 'LightGBM')
    },
}

# -------------------------
# 5. Hyperparameter search with cross-validation
# -------------------------
# Use scikit-learn's built-in ROC AUC scorer by name. A hand-built
# make_scorer(roc_auc_score, needs_proba=True) relies on the `needs_proba`
# argument, which was deprecated in scikit-learn 1.4 and later removed; the
# string form works across releases and yields the same AUC ranking for
# binary classification.
scorer = 'roc_auc'
# Stratified folds keep the class ratio of y_train in every split; the fixed
# seed makes the fold assignment reproducible.
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps model name -> best pipeline found by the grid search.
best_models = {}
for model_name, pipeline in pipelines.items():
    print(f"\nOptimizing {model_name}...")
    # A pipeline without an entry in param_grids is searched with an empty
    # grid, i.e. evaluated once with its default parameters.
    grid_search = GridSearchCV(pipeline, param_grid=param_grids.get(model_name, {}),
                               scoring=scorer, cv=stratified_cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best AUC (CV): {grid_search.best_score_:.4f}")

# -------------------------
# 6. Generate predictions
# -------------------------
# One submission CSV is written per tuned model, into the working directory.
for model_name, model in best_models.items():
    # Column 1 of predict_proba is taken as the positive-class probability.
    predictions = model.predict_proba(X_test)[:, 1]

    # Pair each test-set ID with its predicted probability.
    submission = test_df[['ID']].assign(SeriousDlqin2yrs=predictions)

    filename = f'submission_{model_name}.csv'
    submission.to_csv(filename, index=False)
    print(f"Archivo generado: {filename}")





Optimizing LogisticRegression...
Best parameters for LogisticRegression: {'model__C': 0.1}
Best AUC (CV): 0.6985

Optimizing RandomForest...
Best parameters for RandomForest: {'model__max_depth': 10, 'model__n_estimators': 200}
Best AUC (CV): 0.8552

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__n_estimators': 200}
Best AUC (CV): 0.8632

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 6984, number of negative: 98016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002875 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 970
[LightGBM] [Info] Number of data points in the train set: 105000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best parameters for LightGBM: {'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__n_estimators': 200}
Best AUC (CV): 0.8635
Archivo generado: submission_LogisticRegression.csv
Archivo generado: submission_RandomForest.csv
Archivo generado: submission_XGBoost.csv
Archivo generad