In [2]:
# === Standard Libraries ===
import re  # Regular expressions

# === Data Manipulation and Analysis ===
import pandas as pd  # Data handling and analysis
import numpy as np  # Numerical operations and array handling

# === Visualization ===
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns  # Statistical plots and styling
sns.set_style('whitegrid')  # Plot style applied to seaborn figures

# === Model Selection and Preprocessing ===
from sklearn.model_selection import (
    train_test_split,  # Split data into training and test sets
    GridSearchCV,  # Hyperparameter search with cross-validation
    StratifiedKFold,  # Stratified cross-validation
)
from sklearn.preprocessing import StandardScaler  # Feature scaling

# === Pipeline ===
from sklearn.pipeline import Pipeline  # Build pipelines combining preprocessing and models

# === Class Imbalance Handling ===
from imblearn.combine import SMOTETomek  # Combines SMOTE and Tomek links to balance the data

# === Machine Learning Models ===
from sklearn.ensemble import (
    RandomForestClassifier,  # Random Forest classifier
    GradientBoostingClassifier,  # Gradient Boosting classifier
    StackingClassifier,  # Stacking-based classifier
    RandomForestRegressor,  # Random Forest regressor
    GradientBoostingRegressor,  # Gradient Boosting regressor
)
from xgboost import XGBClassifier, XGBRegressor  # XGBoost classifier and regressor

# === Metrics and Evaluation ===
from sklearn.metrics import (
    classification_report,  # Build a report of classification metrics
    confusion_matrix,  # Confusion matrix
    roc_auc_score,  # ROC AUC
    f1_score,  # F1 score
    recall_score,  # Recall
    precision_score,  # Precision
    mean_absolute_error,  # MAE for regression
    mean_squared_error,  # MSE for regression
)

# === Utilities ===
from sklearn.utils.class_weight import compute_class_weight  # Compute per-class weights
import joblib  # Save and load trained models

In [3]:
# Load the training set, the test set and the sample submission file as
# DataFrames, reading the CSV files in that order from the data/ folder.
train_df, test_df, sample_df = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [4]:
# Preview the first rows of the training set to check the columns and value formats.
train_df.head()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
0,9580,0.668999,58,2,0.449504,3425.0,9,1,1,1,1.0,0
1,39755,0.015922,71,0,6.0,,5,0,0,0,0.0,0
2,118799,0.183062,52,1,0.035593,5000.0,9,0,0,0,0.0,0
3,16489,0.162301,77,0,0.227886,2000.0,8,0,0,0,0.0,0
4,149857,0.404199,30,0,0.02601,5843.0,4,0,0,0,0.0,0


In [6]:
# Preview the first rows of the test set to compare its columns with the training set.
test_df.head()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,129460,1.0,21,0,8.0,,0,0,0,0,
1,134018,0.009878,38,0,0.229978,10500.0,10,0,1,0,1.0
2,86523,0.276836,70,0,1914.0,,23,0,1,0,0.0
3,138466,0.045413,75,0,452.0,,4,0,0,0,0.0
4,143905,0.0,82,0,0.0,,5,0,0,0,0.0


In [7]:
# Preview the first rows of the sample submission file to see which columns it contains.
sample_df.head()

Unnamed: 0,ID,SeriousDlqin2yrs
0,123470,0
1,124835,0
2,118355,1
3,2432,1
4,92739,0


In [8]:
# Print each column's dtype and non-null count; columns whose count is below the
# number of rows contain missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105000 entries, 0 to 104999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    105000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  105000 non-null  float64
 2   Age                                   105000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  105000 non-null  int64  
 4   DebtRatio                             105000 non-null  float64
 5   MonthlyIncome                         84164 non-null   float64
 6   NumberOfOpenCreditLinesAndLoans       105000 non-null  int64  
 7   NumberOfTimes90DaysLate               105000 non-null  int64  
 8   NumberRealEstateLoansOrLines          105000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  105000 non-null  int64  
 10  NumberOfDependents                    102236 non-null  float64
 11  

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_df.describe()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
count,105000.0,105000.0,105000.0,105000.0,105000.0,84164.0,105000.0,105000.0,105000.0,105000.0,102236.0,105000.0
mean,75006.458152,5.378324,52.32561,0.409352,352.044192,6703.641,8.459952,0.254619,1.01901,0.228762,0.757933,0.066514
std,43315.742022,201.573457,14.766425,4.056717,1820.229318,16222.88,5.134329,4.032506,1.131065,4.017864,1.115273,0.24918
min,1.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37590.75,0.029974,41.0,0.0,0.17495,3400.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,74941.5,0.154252,52.0,0.0,0.366061,5400.0,8.0,0.0,1.0,0.0,0.0,0.0
75%,112542.5,0.556035,63.0,0.0,0.870083,8250.0,11.0,0.0,2.0,0.0,1.0,0.0
max,149999.0,29110.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0,1.0
