# Competición Kaggle 2


### Sección 1: Importar librerias

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV


### Sección 2: Cargar los datos

In [2]:
# Cargar archivos (debes asegurarte de que estén en la ruta correcta)
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Verificar que se cargaron correctamente
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (105000, 12)
Test shape: (45000, 11)


In [3]:
train_df.head()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
0,9580,0.668999,58,2,0.449504,3425.0,9,1,1,1,1.0,0
1,39755,0.015922,71,0,6.0,,5,0,0,0,0.0,0
2,118799,0.183062,52,1,0.035593,5000.0,9,0,0,0,0.0,0
3,16489,0.162301,77,0,0.227886,2000.0,8,0,0,0,0.0,0
4,149857,0.404199,30,0,0.02601,5843.0,4,0,0,0,0.0,0


In [4]:
test_df.head()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,129460,1.0,21,0,8.0,,0,0,0,0,
1,134018,0.009878,38,0,0.229978,10500.0,10,0,1,0,1.0
2,86523,0.276836,70,0,1914.0,,23,0,1,0,0.0
3,138466,0.045413,75,0,452.0,,4,0,0,0,0.0
4,143905,0.0,82,0,0.0,,5,0,0,0,0.0


In [5]:
train_df.describe()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
count,105000.0,105000.0,105000.0,105000.0,105000.0,84164.0,105000.0,105000.0,105000.0,105000.0,102236.0,105000.0
mean,75006.458152,5.378324,52.32561,0.409352,352.044192,6703.641,8.459952,0.254619,1.01901,0.228762,0.757933,0.066514
std,43315.742022,201.573457,14.766425,4.056717,1820.229318,16222.88,5.134329,4.032506,1.131065,4.017864,1.115273,0.24918
min,1.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37590.75,0.029974,41.0,0.0,0.17495,3400.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,74941.5,0.154252,52.0,0.0,0.366061,5400.0,8.0,0.0,1.0,0.0,0.0,0.0
75%,112542.5,0.556035,63.0,0.0,0.870083,8250.0,11.0,0.0,2.0,0.0,1.0,0.0
max,149999.0,29110.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0,1.0


### Sección 3: Separar variables

In [6]:
# Variable objetivo
y_train = train_df['SeriousDlqin2yrs']

# Variables predictoras
X_train = train_df.drop('SeriousDlqin2yrs', axis=1)
X_test = test_df.copy()

# Guardamos IDs del test
test_ids = X_test['ID']


### Sección 4: Limpieza - Reemplazo de valores atípicos y NaNs

In [7]:
# Columnas con valores atípicos codificados como 96 o 98
outlier_cols = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse'
]

# Reemplazar en train y test
for col in outlier_cols:
    X_train[col] = X_train[col].replace({96: np.nan, 98: np.nan})
    X_test[col] = X_test[col].replace({96: np.nan, 98: np.nan})


### Sección 5: Imputación de valores faltantes con la mediana

In [8]:
# Calcular medianas en train
medians = X_train.median()

# Imputar en ambos conjuntos
X_train.fillna(medians, inplace=True)
X_test.fillna(medians, inplace=True)

# Verificación
print("Faltantes en X_train:", X_train.isnull().sum().sum())
print("Faltantes en X_test:", X_test.isnull().sum().sum())


Faltantes en X_train: 0
Faltantes en X_test: 0


### Sección 6: Eliminar columna 'ID'

In [9]:
# Eliminar ID de ambos conjuntos
X_train.drop('ID', axis=1, inplace=True)
X_test.drop('ID', axis=1, inplace=True)


### Sección 7: Calcular scale_pos_weight

In [10]:
neg, pos = y_train.value_counts()
scale_pos_weight_value = neg / pos
print(f"scale_pos_weight: {scale_pos_weight_value:.2f}")


scale_pos_weight: 14.03


### Sección 8: Definir modelo y espacio de búsqueda

In [11]:
# Modelo base
model = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight_value
)

# Espacio de búsqueda
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 50],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}


### Sección 9: Búsqueda de hiperparámetros con CV

In [12]:
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10, 
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=3),
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Iniciar búsqueda
random_search.fit(X_train, y_train)

# Mejor resultado
print("Mejores parámetros:", random_search.best_params_)
print(f"Mejor AUC promedio: {random_search.best_score_:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 6984, number of negative: 98016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 105000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066514 -> initscore=-2.641509
[LightGBM] [Info] Start training from score -2.641509
Mejores parámetros: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'num_leaves': 31, 'n_estimators': 500, 'min_child_samples': 100, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Mejor AUC promedio: 0.8622


### Sección 10: Entrenamiento final

In [13]:
# Modelo final con mejores parámetros
final_model = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight_value,
    **random_search.best_params_
)

# Entrenar con todo el conjunto
final_model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 6984, number of negative: 98016
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003504 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 105000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066514 -> initscore=-2.641509
[LightGBM] [Info] Start training from score -2.641509


### Sección 11: Predicciones

In [14]:
# Predicción de probabilidades (para AUC y envío)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]
print("Primeras 10 predicciones de probabilidad:", y_pred_proba[:10])


Primeras 10 predicciones de probabilidad: [0.76625723 0.09382167 0.09692313 0.06760889 0.04413982 0.92927743
 0.04163162 0.60982751 0.21954878 0.29182939]


###  Sección 12: Crear archivo de envío

In [15]:
submission = pd.DataFrame({
    'ID': test_ids,
    'SeriousDlqin2yrs': y_pred_proba
})

submission.to_csv('sample_submission.csv', index=False)
print("Archivo de envío 'sample_submission.csv' creado.")


Archivo de envío 'sample_submission.csv' creado.
