In [16]:
# Imports and data loading
!pip install -U imbalanced-learn==0.10.1 scikit-learn==1.2.2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_recall_curve, classification_report, fbeta_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

# Load datasets
df_train = pd.read_csv('/kaggle/input/creditcsv/cs-training.csv', index_col=0)
df_test = pd.read_csv('/kaggle/input/creditcsv/cs-test.csv', index_col=0)
df_submission = pd.read_csv('/kaggle/input/creditcsv/sampleEntry.csv')

# Basic info
print(df_train.info())
print(df_train.head())

<class 'pandas.core.frame.DataFrame'>
Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtypes: fl

In [14]:
# Advanced data preprocessing and feature engineering

# Impute missing with KNN
imputer = KNNImputer(n_neighbors=5)
df_train_imputed = df_train.copy()
df_train_imputed[['MonthlyIncome', 'NumberOfDependents']] = imputer.fit_transform(df_train_imputed[['MonthlyIncome', 'NumberOfDependents']])

df_test_imputed = df_test.copy()
df_test_imputed[['MonthlyIncome', 'NumberOfDependents']] = imputer.transform(df_test_imputed[['MonthlyIncome', 'NumberOfDependents']])

# Feature engineering example: Debt to income ratio squared, Age buckets
df_train_imputed['DebtRatio_sq'] = df_train_imputed['DebtRatio'] ** 2
df_train_imputed['Age_bucket'] = pd.cut(df_train_imputed['age'], bins=[0,25,40,60,100], labels=[0,1,2,3])

df_test_imputed['DebtRatio_sq'] = df_test_imputed['DebtRatio'] ** 2
df_test_imputed['Age_bucket'] = pd.cut(df_test_imputed['age'], bins=[0,25,40,60,100], labels=[0,1,2,3])

In [17]:
# Prepare data for modeling
target = 'SeriousDlqin2yrs'
features = [c for c in df_train.columns if c != target]

# Si hay categóricas, hacer dummy:
X = pd.get_dummies(df_train[features])

y = df_train[target]

# Imputar faltantes con mediana (en columnas numéricas)
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Split
X_train, X_val, y_train, y_val = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42, stratify=y
)

# Escalar
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Balance con SMOTE
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

In [18]:
# Define custom F2 scorer and threshold finder

def f2_scorer(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)

def find_best_threshold_f2(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    f2_scores = 5 * (precision * recall) / (4 * precision + recall + 1e-9)
    best_idx = np.argmax(f2_scores)
    return thresholds[best_idx], f2_scores[best_idx]

In [20]:
# Hyperparameter tuning with Optuna for LightGBM

def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }
    
    model = LGBMClassifier(**param)
    model.fit(X_train_bal, y_train_bal)
    y_val_prob = model.predict_proba(X_val_scaled)[:, 1]
    
    best_thresh, best_f2 = find_best_threshold_f2(y_val, y_val_prob)
    return best_f2

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Best F2 score:", study.best_value)

[I 2025-06-30 20:56:15,176] A new study created in memory with name: no-name-032dea03-7daf-4503-b7f8-554379907aa5
[I 2025-06-30 20:56:44,827] Trial 0 finished with value: 0.4424778757954692 and parameters: {'n_estimators': 795, 'learning_rate': 0.2513311316041554, 'num_leaves': 141, 'max_depth': 15, 'min_child_samples': 27, 'subsample': 0.5503463112180885, 'colsample_bytree': 0.7720312974141961, 'reg_alpha': 0.5920373394551252, 'reg_lambda': 0.4042993521371503}. Best is trial 0 with value: 0.4424778757954692.
[I 2025-06-30 20:56:57,458] Trial 1 finished with value: 0.500598175264446 and parameters: {'n_estimators': 652, 'learning_rate': 0.06678252952611591, 'num_leaves': 44, 'max_depth': 14, 'min_child_samples': 43, 'subsample': 0.7188810555394671, 'colsample_bytree': 0.5386151140952329, 'reg_alpha': 0.1548663454298579, 'reg_lambda': 0.8265665802233543}. Best is trial 1 with value: 0.500598175264446.
[I 2025-06-30 20:57:13,869] Trial 2 finished with value: 0.4995820983271784 and parame

Best params: {'n_estimators': 215, 'learning_rate': 0.016164681015374455, 'num_leaves': 33, 'max_depth': 14, 'min_child_samples': 100, 'subsample': 0.6421631816787918, 'colsample_bytree': 0.6570997703960657, 'reg_alpha': 0.13231170133425174, 'reg_lambda': 0.5282284076568279}
Best F2 score: 0.512859650164927


In [21]:
# Train final LightGBM with best params

best_params = study.best_params
best_params['random_state'] = 42
best_params['n_jobs'] = -1

final_lgbm = LGBMClassifier(**best_params)
final_lgbm.fit(X_train_bal, y_train_bal)

# Validation predictions
y_val_prob_final = final_lgbm.predict_proba(X_val_scaled)[:,1]
best_thresh, best_f2 = find_best_threshold_f2(y_val, y_val_prob_final)
print(f"Best threshold: {best_thresh:.4f}, Best F2-score: {best_f2:.4f}")

y_val_pred_final = (y_val_prob_final >= best_thresh).astype(int)
print(classification_report(y_val, y_val_pred_final))

Best threshold: 0.4274, Best F2-score: 0.5129
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     27995
           1       0.26      0.67      0.38      2005

    accuracy                           0.85     30000
   macro avg       0.62      0.77      0.65     30000
weighted avg       0.93      0.85      0.88     30000



In [22]:
# Prepare test set and make predictions
df_test_imputed = df_test.copy()
imputer = SimpleImputer(strategy='median')
df_test_imputed[['MonthlyIncome', 'NumberOfDependents']] = imputer.fit_transform(
    df_test_imputed[['MonthlyIncome', 'NumberOfDependents']]
)

# Preprocess test set same as train
X_test = df_test_imputed[features]
X_test_scaled = scaler.transform(X_test)

# Predict probabilities
y_test_prob = final_lgbm.predict_proba(X_test_scaled)[:, 1]

# Predict labels based on best threshold
y_test_pred = (y_test_prob >= best_thresh).astype(int)

# Prepare submission dataframe with only Id and prediction probability
submission = pd.DataFrame({
    'Id': df_submission['Id'],
    'Probability': y_test_prob
})

submission.to_csv('submission.csv', index=False)
print("Submission saved!")

Submission saved!


In [24]:
# Save the trained model
joblib.dump(final_lgbm, 'lgbm_credit_model.pkl')
print("Model saved as 'lgbm_credit_model.pkl'")

Model saved as 'lgbm_credit_model.pkl'
