# Imports

In [2]:
# General

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameter tuning
import optuna

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Data preparation

In [3]:
# Data input

# Prefer the local project layout. Fall back to the Kaggle input directory
# only when a local file is missing, so that any other problem (parse error,
# interrupt, ...) still surfaces instead of being silently swallowed.
try:
    data = pd.read_csv('../data/train.csv')
    data_to_predict = pd.read_csv('../data/test.csv')
    data_ccrisk = pd.read_csv('../data/credit_risk_dataset.csv')
except FileNotFoundError:
    data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
    data_to_predict = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
    data_ccrisk = pd.read_csv('/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv')

# Clean the auxiliary credit-risk dataset before merging: discard rows with
# missing values, then exact duplicate rows.
data_ccrisk = data_ccrisk.dropna().drop_duplicates()

# Features/target of the competition training set as loaded (before merging)
X_old = data.drop(['loan_status'], axis=1)
y_old = data['loan_status']

# Merge the dataframes ('id' is dropped from the competition data first)
data_no_id = data.drop(['id'], axis=1)
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/playground-series-s4e10/train.csv'

In [None]:
def remove_col_prefix(dataframe, prefix):
    """Remove the literal text ``prefix`` from the column names of ``dataframe``.

    Despite the name, every occurrence of ``prefix`` is removed wherever it
    sits inside a column name, not only at the start.
    NOTE(review): later cells use names such as ``cb_default_on_file``, which
    presumably rely on this "remove anywhere" behaviour - confirm before
    restricting it to true prefixes.

    ``prefix`` is always treated as plain text (``regex=False``), so characters
    such as ``.`` or ``(`` are matched literally regardless of pandas version.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are renamed. It is modified in place.
    prefix : str
        Text to strip from the column names.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, returned for convenient reassignment.
    """
    dataframe.columns = dataframe.columns.str.replace(prefix, '', regex=False)
    return dataframe

# Strip both naming prefixes, in the same order as before
for col_prefix in ('person_', 'loan_'):
    merged_data = remove_col_prefix(merged_data, col_prefix)


# Sanity checks: missing values per column, then number of duplicated rows
print(merged_data.isnull().sum())
print(merged_data.duplicated().sum())

age                    0
income                 0
home_ownership         0
emp_length             0
intent                 0
grade                  0
amnt                   0
int_rate               0
percent_income         0
cb_default_on_file     0
cb_cred_hist_length    0
status                 0
dtype: int64
0


# Feature engineering

## Feature engineering function

In [None]:
from scipy.stats import zscore

# Names of every numeric column of the merged frame
numeric_columns = merged_data.select_dtypes(include='number').columns

def remove_outiers(dataframe, numeric_cols, z_threshold=4):
    """Return a copy of ``dataframe`` without rows that contain an outlier.

    A row is kept only when the absolute z-score of every column listed in
    ``numeric_cols`` is strictly below ``z_threshold``.
    """
    abs_scores = np.abs(zscore(dataframe[numeric_cols]))
    keep_row = (abs_scores < z_threshold).all(axis=1)
    return dataframe[keep_row].copy()

data_no_outliers = remove_outiers(merged_data, numeric_columns)

# Shapes before and after outlier removal
merged_data.shape, data_no_outliers.shape

# Create income bins before calling the function
income_bins = pd.qcut(
    data_no_outliers['income'],
    q=5,
    retbins=True,
)[1]

def feature_engineering(dataframe, income_bins):
    """Add the engineered loan/applicant features and return the enriched frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns read below: ``grade``, ``cb_default_on_file``,
        ``emp_length``, ``income``, ``age``, ``amnt``, ``int_rate`` and
        ``cb_cred_hist_length``. The input frame is not modified.
    income_bins : array-like
        Increasing bin edges for ``income``. With N edges, ``income_category``
        takes the labels ``1 .. N-1``.

    Returns
    -------
    tuple
        ``(dataframe, numeric, ordinal)``: a copy of the input with the new
        columns added, the list of new numeric feature names, and the list of
        new ordinal feature names.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    dataframe = dataframe.copy()

    # Map grades to numerical values
    grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    dataframe['grade_numeric'] = dataframe['grade'].map(grade_mapping)

    # Map cb_default_on_file to binary values. Skip the mapping when the column
    # is already numeric, so running the function twice on the same data (e.g.
    # re-executing a notebook cell) does not turn every flag into NaN.
    if not pd.api.types.is_numeric_dtype(dataframe['cb_default_on_file']):
        dataframe['cb_default_on_file'] = dataframe['cb_default_on_file'].map({'Y': 1, 'N': 0})

    # pd.cut intervals are right-closed, so a value equal to the first edge, or
    # any value outside the edge range, would become NaN. Open the two outer
    # edges so every finite income falls into a category.
    edges = np.array(income_bins, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    income_labels = list(range(1, len(edges)))

    # Feature engineering
    dataframe['total_income'] = (dataframe['emp_length'] + 1) * dataframe['income']
    dataframe['emp_length_to_age_ratio'] = dataframe['emp_length'] / dataframe['age']
    dataframe['income_to_age_ratio'] = dataframe['income'] / dataframe['age']
    dataframe['total_income_to_age_ratio'] = dataframe['total_income'] / dataframe['age']
    dataframe['income_category'] = pd.cut(dataframe['income'], bins=edges, labels=income_labels)
    dataframe['income_to_loan_ratio'] = dataframe['income'] / dataframe['amnt']
    dataframe['total_income_to_loan_ratio'] = dataframe['total_income'] / dataframe['amnt']
    dataframe['loan_income_age_interaction'] = dataframe['amnt'] / (dataframe['income'] * dataframe['age'])
    dataframe['income_grade_interaction'] = dataframe['income'] * dataframe['grade_numeric']
    dataframe['loan_grade_ratio'] = dataframe['amnt'] / dataframe['grade_numeric']
    dataframe['emp_to_cred_hist_ratio'] = dataframe['emp_length'] / (dataframe['cb_cred_hist_length'] + 1)
    dataframe['credit_behavior_score'] = (1 - dataframe['cb_default_on_file']) * dataframe['cb_cred_hist_length']
    dataframe['affordability_score'] = dataframe['income'] / (dataframe['amnt'] * (1 + dataframe['int_rate'] / 100))
    dataframe['interest_cost'] = dataframe['amnt'] * dataframe['int_rate'] / 100
    dataframe['emp_length_loan_interaction'] = dataframe['emp_length'] * dataframe['amnt']

    # Lists of numeric and ordinal features
    numeric = [
        'total_income', 'emp_length_to_age_ratio', 'income_to_age_ratio',
        'total_income_to_age_ratio', 'income_to_loan_ratio',
        'total_income_to_loan_ratio', 'loan_income_age_interaction',
        'income_grade_interaction', 'loan_grade_ratio', 'emp_to_cred_hist_ratio',
        'credit_behavior_score', 'affordability_score', 'interest_cost',
        'emp_length_loan_interaction'
    ]
    ordinal = ['income_category']

    return dataframe, numeric, ordinal

In [None]:
# Adding the new features to the data
data_no_outliers, new_features_numeric, new_features_ordinal = feature_engineering(
    data_no_outliers,
    income_bins,
)

# Split into target and feature matrix
y = data_no_outliers['status']
X = data_no_outliers.drop(columns=['status'])

X

Unnamed: 0,age,income,home_ownership,emp_length,intent,grade,amnt,int_rate,percent_income,cb_default_on_file,...,income_to_loan_ratio,total_income_to_loan_ratio,loan_income_age_interaction,income_grade_interaction,loan_grade_ratio,emp_to_cred_hist_ratio,credit_behavior_score,affordability_score,interest_cost,emp_length_loan_interaction
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,0,...,5.833333,5.833333,0.004633,70000,3000.000000,0.000000,14,5.232158,689.4,0.0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,0,...,14.000000,98.000000,0.003247,168000,1333.333333,2.000000,2,12.351125,534.0,24000.0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,0,...,4.800000,43.200000,0.007184,28800,6000.000000,0.727273,10,4.407713,534.0,48000.0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,0,...,5.833333,87.500000,0.005714,140000,6000.000000,2.333333,5,5.250053,1333.2,168000.0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,0,...,10.000000,30.000000,0.004545,60000,6000.000000,0.500000,3,9.352787,415.2,12000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87055,51,76960,MORTGAGE,0.0,PERSONAL,B,15000,9.99,0.19,0,...,5.130667,5.130667,0.003822,153920,7500.000000,0.000000,20,4.664666,1498.5,0.0
87122,51,80000,RENT,3.0,HOMEIMPROVEMENT,F,18000,18.78,0.23,1,...,4.444444,17.777778,0.004412,480000,3000.000000,0.142857,0,3.741745,3380.4,54000.0
87133,52,30000,RENT,6.0,VENTURE,C,5000,12.73,0.17,0,...,6.000000,42.000000,0.003205,90000,1666.666667,0.315789,18,5.322452,636.5,30000.0
87139,52,65004,RENT,4.0,PERSONAL,D,20000,15.58,0.31,1,...,3.250200,16.251000,0.005917,260016,5000.000000,0.200000,0,2.812078,3116.0,80000.0


In [None]:
# Cleaning and adding the new features to the data to predict
# Same prefix clean-up as for the training data, in the same order
for col_prefix in ('person_', 'loan_'):
    data_to_predict = remove_col_prefix(data_to_predict, col_prefix)

# Only the enriched frame is needed here; the feature-name lists are ignored
data_to_predict = feature_engineering(data_to_predict, income_bins)[0]

In [None]:
# Column index of the frame to predict, displayed next to the dtypes of the
# training feature matrix for a side-by-side check.
column_names = data_to_predict.columns
column_names, X.dtypes

(Index(['age', 'income', 'home_ownership', 'emp_length', 'intent', 'grade',
        'amnt', 'int_rate', 'percent_income', 'cb_default_on_file',
        'cb_cred_hist_length', 'status', 'grade_numeric', 'total_income',
        'emp_length_to_age_ratio', 'income_to_age_ratio',
        'total_income_to_age_ratio', 'income_category', 'income_to_loan_ratio',
        'total_income_to_loan_ratio', 'loan_income_age_interaction',
        'income_grade_interaction', 'loan_grade_ratio',
        'emp_to_cred_hist_ratio', 'credit_behavior_score',
        'affordability_score', 'interest_cost', 'emp_length_loan_interaction'],
       dtype='object'),
 age                               int64
 income                            int64
 home_ownership                   object
 emp_length                      float64
 intent                           object
 grade                            object
 amnt                              int64
 int_rate                        float64
 percent_income             

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=2024,
)

In [None]:
# Columns routed to the one-hot encoding branch of the preprocessor.
onehot_cols = ['home_ownership', 'intent', 'cb_default_on_file']
# Columns routed to the ordinal encoding branch of the preprocessor.
ordinal_cols = ['grade', 'income_category']
# Columns routed to the mean-imputation + standard-scaling branch.
# NOTE(review): 'grade_numeric' here and 'grade' in ordinal_cols look like two
# encodings of the same information - confirm both are wanted.
standardize_cols = [
    'age', 'income', 'emp_length', 'amnt', 'int_rate', 'percent_income', 
    'cb_cred_hist_length', 'total_income', 'emp_length_to_age_ratio', 
    'income_to_age_ratio', 'total_income_to_age_ratio', 'income_to_loan_ratio', 
    'total_income_to_loan_ratio', 'loan_income_age_interaction', 
    'income_grade_interaction', 'loan_grade_ratio', 'emp_to_cred_hist_ratio', 
    'credit_behavior_score', 'affordability_score', 'interest_cost', 
    'emp_length_loan_interaction', 'grade_numeric'
]
# Columns that additionally get a log1p-transformed copy; each of them is also
# listed in standardize_cols, so it appears twice in the prepared matrix.
log_transform_cols = ['income', 'total_income', 'amnt', 'interest_cost']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

def log_transform():
    """Build a ``FunctionTransformer`` that applies ``np.log1p`` with input validation enabled."""
    log1p_step = FunctionTransformer(func=np.log1p, validate=True)
    return log1p_step

# Imputers for missing values
ordinal_imputer = SimpleImputer(strategy='most_frequent')
onehot_imputer = SimpleImputer(strategy='most_frequent')
scaler_imputer = SimpleImputer(strategy='mean')
log_imputer = SimpleImputer(strategy='mean')

# The encoders are fitted on X_train only. Categories that first show up in
# the hold-out split or in the data to predict must not abort `transform`:
# unknown ordinal values are encoded as -1, unknown one-hot values as an
# all-zero indicator row. Fitting and the resulting column layout are unchanged.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Transformer and step names ('ordinal'/'onehot', 'encoder', ...) and their
# order are kept as-is because later cells index into `transformers_`.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', Pipeline([('imputer', ordinal_imputer), ('encoder', ordinal_encoder)]), ordinal_cols),
        ('onehot', Pipeline([('imputer', onehot_imputer), ('encoder', onehot_encoder)]), onehot_cols),
        ('scaler', Pipeline([('imputer', scaler_imputer), ('scaler', StandardScaler())]), standardize_cols),
        ('log', Pipeline([('imputer', log_imputer), ('log', log_transform())]), log_transform_cols)
    ],
    remainder='passthrough',
    )

# Fit on the training split only, then apply the fitted transform everywhere
preprocessor.fit(X_train)

X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)
data_prep = preprocessor.transform(data_no_outliers)
X_all_prep = preprocessor.transform(X)

In [None]:
# Column naming with prefix for clarity
# Prefixed names for the scaled and log-transformed copies
standardize_cols_names = ['std_' + col for col in standardize_cols]
log_transform_cols_names = ['log_' + col for col in log_transform_cols]

# Column order follows the transformer order: ordinal, one-hot, scaled, log
cols = (
    preprocessor.transformers_[0][1]['encoder'].get_feature_names_out(ordinal_cols).tolist()
    + preprocessor.transformers_[1][1]['encoder'].get_feature_names_out(onehot_cols).tolist()
    + standardize_cols_names
    + log_transform_cols_names
)

training_data_df = pd.DataFrame(X_train_prep, columns=cols)

# Copy of the prepared training data with the target attached
training_plus_target = training_data_df.assign(status=y_train.values)

# Taking a look at correlation
training_plus_target.corr()['status'].sort_values(ascending=False)

status                             1.000000
grade                              0.390432
std_grade_numeric                  0.390432
std_percent_income                 0.380154
std_loan_income_age_interaction    0.363538
std_int_rate                       0.345783
std_interest_cost                  0.248579
home_ownership_RENT                0.235389
log_interest_cost                  0.220479
cb_default_on_file_1               0.186356
std_amnt                           0.133784
log_amnt                           0.105537
std_income_grade_interaction       0.096991
intent_DEBTCONSOLIDATION           0.062272
intent_MEDICAL                     0.050808
intent_HOMEIMPROVEMENT             0.031135
home_ownership_OTHER               0.003491
std_emp_length_loan_interaction   -0.002656
intent_PERSONAL                   -0.011013
std_cb_cred_hist_length           -0.013980
std_age                           -0.015303
intent_EDUCATION                  -0.056155
intent_VENTURE                  

# Models

## XGBoost

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Draws hyperparameters from ``trial``, fits an ``XGBClassifier`` on
    ``X_train_prep``/``y_train`` and returns the ROC AUC of its positive-class
    probabilities on ``X_test_prep``/``y_test``.
    """
    # Tuned hyperparameters (search space)
    tuned = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
    }
    # Settings that are the same for every trial
    fixed = {'eval_metric': 'auc', 'objective': 'binary:logistic'}

    booster = xgb.XGBClassifier(**tuned, **fixed)
    booster.fit(
        X_train_prep,
        y_train,
        eval_set=[(X_test_prep, y_test)],
        verbose=False,
    )

    # Score on the probability of the positive class
    positive_proba = booster.predict_proba(X_test_prep)[:, 1]
    return roc_auc_score(y_test, positive_proba)

# Maximise AUC; stop after 500 trials or two hours, whichever comes first
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=500, timeout=2 * 3600)

print(f"Best trial: {xgb_study.best_params}")

[I 2024-10-31 16:02:07,858] A new study created in memory with name: no-name-358c76c5-e610-4466-8687-8f4904f39901
[I 2024-10-31 16:02:15,714] Trial 0 finished with value: 0.9500792045236812 and parameters: {'max_depth': 6, 'learning_rate': 0.2293494537019517, 'n_estimators': 451, 'subsample': 0.9085805573664184, 'colsample_bytree': 0.8886206491890367, 'gamma': 0.48474332742196896, 'lambda': 0.20043577961299971, 'alpha': 4.506421605253883e-05, 'scale_pos_weight': 1.5429807693918027}. Best is trial 0 with value: 0.9500792045236812.
[I 2024-10-31 16:02:21,331] Trial 1 finished with value: 0.9474706290960709 and parameters: {'max_depth': 6, 'learning_rate': 0.28216118561305686, 'n_estimators': 328, 'subsample': 0.9196988755107399, 'colsample_bytree': 0.8354086555720813, 'gamma': 0.3379984388905105, 'lambda': 3.164512230207635e-08, 'alpha': 1.6964194852379622e-08, 'scale_pos_weight': 2.926591459263859}. Best is trial 0 with value: 0.9500792045236812.
[I 2024-10-31 16:02:30,941] Trial 2 fini

Best trial: {'max_depth': 5, 'learning_rate': 0.09071934481163234, 'n_estimators': 477, 'subsample': 0.9364296482600055, 'colsample_bytree': 0.7656377652218407, 'gamma': 0.3778302171303889, 'lambda': 1.1354525660669237e-06, 'alpha': 0.0024598591905306668, 'scale_pos_weight': 2.1368290316656604}


In [None]:
# Keep the winning XGBoost configuration and its score for the stacking step
xgb_best_params = xgb_study.best_params
xgb_best_score = xgb_study.best_value
xgb_best_score

0.9560193210022325

## CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier

def cat_objective(trial):
    """Optuna objective for CatBoost.

    Draws hyperparameters from ``trial``, fits a ``CatBoostClassifier`` on
    ``X_train_prep``/``y_train`` with early stopping monitored on
    ``X_test_prep``/``y_test``, and returns the ROC AUC of the positive-class
    probabilities on that same hold-out split.
    """
    classifier = CatBoostClassifier(
        # Tuned hyperparameters (search space)
        depth=trial.suggest_int('depth', 4, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-5, 10.0, log=True),
        border_count=trial.suggest_int('border_count', 32, 255),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        # Fixed settings
        eval_metric='AUC',
        logging_level='Silent',  # no CatBoost training output
        task_type='CPU',  # trains on CPU
        use_best_model=True,
    )

    classifier.fit(
        X_train_prep,
        y_train,
        eval_set=[(X_test_prep, y_test)],
        early_stopping_rounds=20,
        verbose=False,
    )

    # Probabilities for class 1
    positive_proba = classifier.predict_proba(X_test_prep)[:, 1]
    return roc_auc_score(y_test, positive_proba)

# Maximise AUC; stop after 500 trials or two hours, whichever comes first
cat_study = optuna.create_study(direction='maximize')
cat_study.optimize(cat_objective, n_trials=500, timeout=2 * 3600)

# Output the best trial
print(f"Best trial: {cat_study.best_params}")


[I 2024-10-31 16:59:29,598] A new study created in memory with name: no-name-498b92dc-a3bb-4c4c-83ce-47074b09dfc4
[W 2024-10-31 16:59:37,842] Trial 0 failed with parameters: {'depth': 4, 'learning_rate': 0.12168668368848211, 'l2_leaf_reg': 0.023705980577632266, 'border_count': 38, 'bagging_temperature': 0.6799535524428815} because of the following error: NameError("name 'aucaa' is not defined").
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_17/2184984520.py", line 33, in cat_objective
    return aucaa
NameError: name 'aucaa' is not defined
[W 2024-10-31 16:59:37,847] Trial 0 failed with value None.


name 'aucaa' is not defined


In [None]:
# Keep the winning CatBoost configuration and its score for the stacking step
cat_best_params = cat_study.best_params
cat_best_score = cat_study.best_value
cat_best_score

No trials are completed yet.


## LightGBM

In [1]:
# Objective function for LightGBM

def lgb_objective(trial):
    """Optuna objective for LightGBM.

    Draws hyperparameters from ``trial``, fits an ``LGBMClassifier`` on
    ``X_train_prep``/``y_train`` with an early-stopping callback monitored on
    ``X_test_prep``/``y_test``, and returns the ROC AUC of the positive-class
    probabilities on that same hold-out split.
    """
    # Settings that are the same for every trial
    fixed = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
    }
    # Tuned hyperparameters (search space)
    # NOTE(review): LightGBM only applies bagging when the bagging frequency is
    # non-zero; `subsample_freq` is not set here, so `subsample` may have no
    # effect - confirm.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 0.2),
    }

    classifier = lgb.LGBMClassifier(**fixed, **tuned, verbosity=0)

    # Stop when the hold-out AUC has not improved for 100 rounds
    stop_early = lgb.early_stopping(stopping_rounds=100)
    classifier.fit(
        X_train_prep,
        y_train,
        eval_set=[(X_test_prep, y_test)],
        eval_metric='auc',
        callbacks=[stop_early],
    )

    positive_proba = classifier.predict_proba(X_test_prep)[:, 1]
    return roc_auc_score(y_test, positive_proba)


# Optimize the objective function
lgb_study = optuna.create_study(direction='maximize')
lgb_study.optimize(lgb_objective, n_trials=500, timeout=2 * 3600)

# Best parameters and AUC
print("Best LGBM Params: ", lgb_study.best_params)
print("Best AUC for LGBM: ", lgb_study.best_trial.value)


KeyboardInterrupt



In [18]:
# Keep the winning LightGBM configuration and its score for the stacking step
lgb_best_params = lgb_study.best_params
lgb_best_score = lgb_study.best_trial.value
lgb_best_score

0.957882750074142

## ExtraTreesClassifier

In [19]:
from sklearn.ensemble import ExtraTreesClassifier

def extratrees_objective(trial):
    """Optuna objective for ExtraTrees.

    Builds an ``ExtraTreesClassifier`` from hyperparameters drawn from
    ``trial`` and returns its mean ROC AUC over a 3-fold cross-validation on
    ``X_train_prep``/``y_train``.
    """
    forest = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_float('max_features', 0.1, 1.0),
    )

    fold_scores = cross_val_score(forest, X_train_prep, y_train, cv=3, scoring='roc_auc')
    return fold_scores.mean()

# Maximise the cross-validated AUC; 200 trials or 30 minutes at most
extratrees_study = optuna.create_study(direction='maximize')
extratrees_study.optimize(extratrees_objective, n_trials=200, timeout=30 * 60)

print('Best parameters for ExtraTrees:', extratrees_study.best_trial.params)

[I 2024-10-31 18:39:22,949] A new study created in memory with name: no-name-0e0e5b12-8e54-4d4e-9a9a-98d9d50cb57d
[I 2024-10-31 18:42:28,069] Trial 0 finished with value: 0.9379868765478628 and parameters: {'n_estimators': 353, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.5982092428135277}. Best is trial 0 with value: 0.9379868765478628.
[I 2024-10-31 18:43:51,163] Trial 1 finished with value: 0.9184487354378609 and parameters: {'n_estimators': 243, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 0.8068286675599466}. Best is trial 0 with value: 0.9379868765478628.
[I 2024-10-31 18:44:20,416] Trial 2 finished with value: 0.927020325371489 and parameters: {'n_estimators': 119, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 0.35468499038011336}. Best is trial 0 with value: 0.9379868765478628.
[I 2024-10-31 18:45:37,599] Trial 3 finished with value: 0.9261044979740927 and parameters: {'n_es

Best parameters for ExtraTrees: {'n_estimators': 446, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 0.8333350015175908}


In [20]:
# Keep the winning ExtraTrees configuration and its score for the stacking step
extratrees_best_params = extratrees_study.best_trial.params
extratrees_best_score = extratrees_study.best_trial.value
extratrees_best_score

0.9385991451247291

## HistGradientBoostingClassifier

In [21]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

def histgb_objective(trial):
    """Optuna objective for HistGradientBoosting.

    Builds a ``HistGradientBoostingClassifier`` from hyperparameters drawn
    from ``trial`` and returns its mean ROC AUC over a 3-fold cross-validation
    on ``X_train_prep``/``y_train``.
    """
    booster = HistGradientBoostingClassifier(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        max_iter=trial.suggest_int('max_iter', 100, 500),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 10, 50),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 10, 50),
        l2_regularization=trial.suggest_float('l2_regularization', 1e-5, 1.0, log=True),
    )

    fold_scores = cross_val_score(booster, X_train_prep, y_train, cv=3, scoring='roc_auc')
    return fold_scores.mean()

# Maximise the cross-validated AUC; 200 trials or 30 minutes at most
histgb_study = optuna.create_study(direction='maximize')
histgb_study.optimize(histgb_objective, n_trials=200, timeout=30 * 60)
print('Best parameters for HistGradientBoosting:', histgb_study.best_trial.params)

[I 2024-10-31 19:14:28,371] A new study created in memory with name: no-name-5016838b-cdef-455a-9277-ba084a6dff24
[I 2024-10-31 19:14:46,483] Trial 0 finished with value: 0.9502664544773637 and parameters: {'learning_rate': 0.047616203242832236, 'max_iter': 471, 'max_depth': 7, 'min_samples_leaf': 12, 'max_leaf_nodes': 19, 'l2_regularization': 1.9930579832794328e-05}. Best is trial 0 with value: 0.9502664544773637.
[I 2024-10-31 19:15:10,065] Trial 1 finished with value: 0.9489651136703233 and parameters: {'learning_rate': 0.02532349425624152, 'max_iter': 338, 'max_depth': 12, 'min_samples_leaf': 26, 'max_leaf_nodes': 23, 'l2_regularization': 0.007037708421238262}. Best is trial 0 with value: 0.9502664544773637.
[I 2024-10-31 19:15:31,144] Trial 2 finished with value: 0.9433762966497641 and parameters: {'learning_rate': 0.015715675740876622, 'max_iter': 222, 'max_depth': 17, 'min_samples_leaf': 30, 'max_leaf_nodes': 35, 'l2_regularization': 2.293959660634556e-05}. Best is trial 0 with 

Best parameters for HistGradientBoosting: {'learning_rate': 0.06500114925960461, 'max_iter': 456, 'max_depth': 17, 'min_samples_leaf': 41, 'max_leaf_nodes': 34, 'l2_regularization': 0.002846689063034667}


In [22]:
# Keep the winning HistGradientBoosting configuration and its score
histgb_best_params = histgb_study.best_params
histgb_best_score = histgb_study.best_value
histgb_best_score

0.9513325161803893

## KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier

def knn_objective(trial):
    """Optuna objective for k-nearest neighbours.

    Builds a ``KNeighborsClassifier`` from hyperparameters drawn from
    ``trial`` and returns its mean ROC AUC over a 3-fold cross-validation on
    ``X_train_prep``/``y_train``.
    """
    neighbours = KNeighborsClassifier(
        n_neighbors=trial.suggest_int('n_neighbors', 3, 15),
        leaf_size=trial.suggest_int('leaf_size', 20, 50),
        p=trial.suggest_categorical('p', [1, 2]),  # Minkowski distance parameter
        weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
    )

    fold_scores = cross_val_score(neighbours, X_train_prep, y_train, cv=3, scoring='roc_auc')
    return fold_scores.mean()

# Maximise the cross-validated AUC; 200 trials or 30 minutes at most
knn_study = optuna.create_study(direction='maximize')
knn_study.optimize(knn_objective, n_trials=200, timeout=30 * 60)
print('Best parameters for KNN:', knn_study.best_trial.params)
print('Best score for KNN:', knn_study.best_trial.value)

[I 2024-10-31 19:44:49,865] A new study created in memory with name: no-name-fe7721a1-07a5-4649-8a0b-216f8194c455
[I 2024-10-31 19:45:09,987] Trial 0 finished with value: 0.8704534030249357 and parameters: {'n_neighbors': 5, 'leaf_size': 29, 'p': 2, 'weights': 'distance'}. Best is trial 0 with value: 0.8704534030249357.
[I 2024-10-31 19:47:56,987] Trial 1 finished with value: 0.8813159676698725 and parameters: {'n_neighbors': 7, 'leaf_size': 44, 'p': 1, 'weights': 'uniform'}. Best is trial 1 with value: 0.8813159676698725.
[I 2024-10-31 19:48:17,326] Trial 2 finished with value: 0.885600102529513 and parameters: {'n_neighbors': 9, 'leaf_size': 23, 'p': 2, 'weights': 'distance'}. Best is trial 2 with value: 0.885600102529513.
[I 2024-10-31 19:51:06,787] Trial 3 finished with value: 0.8915004619622682 and parameters: {'n_neighbors': 13, 'leaf_size': 36, 'p': 1, 'weights': 'uniform'}. Best is trial 3 with value: 0.8915004619622682.
[I 2024-10-31 19:53:55,110] Trial 4 finished with value: 

Best parameters for KNN: {'n_neighbors': 15, 'leaf_size': 48, 'p': 1, 'weights': 'uniform'}
Best score for KNN: 0.892947823786483


In [24]:
# Keep the winning KNN configuration and its score for the stacking step
knn_best_params = knn_study.best_params
knn_best_score = knn_study.best_value
knn_best_score

0.892947823786483

# Stacking

In [25]:
from sklearn.ensemble import StackingClassifier

# Init base models

xgb_model = xgb.XGBClassifier(**xgb_best_params)

# CatBoost is optional: if its tuned parameters are unavailable (e.g. the
# study produced no completed trial), report why and leave it out of the stack.
# Resetting to None also prevents a stale `cat_model` from an earlier run from
# being picked up.
try:
    cat_model = CatBoostClassifier(**cat_best_params)
except Exception as e:
    cat_model = None
    print(e)

lgb_model = lgb.LGBMClassifier(**lgb_best_params)
extratrees_model = ExtraTreesClassifier(**extratrees_best_params)
# logreg_model = LogisticRegression(**logreg_best_params)
histgb_model = HistGradientBoostingClassifier(**histgb_best_params)
knn_model = KNeighborsClassifier(**knn_best_params)

# Base and meta models: keep every candidate that could be built, in a fixed order
candidate_estimators = [
    ('xgb', xgb_model),
    ('cat', cat_model),
    ('lgb', lgb_model),
    ('extratrees', extratrees_model),
    ('histgb', histgb_model),
    ('knn', knn_model),
]
base_estimators = [
    (label, estimator)
    for label, estimator in candidate_estimators
    if estimator is not None
]

meta_model = lgb.LGBMClassifier()

# Stack

stack_model = StackingClassifier(estimators=base_estimators, final_estimator=meta_model, cv=3, n_jobs=-1)

stack_model.fit(X_train_prep, y_train)

name 'cat_best_params' is not defined
[LightGBM] [Info] Number of positive: 11322, number of negative: 57092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5123
[LightGBM] [Info] Number of data points in the train set: 68414, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165492 -> initscore=-1.617916
[LightGBM] [Info] Start training from score -1.617916
[LightGBM] [Info] Number of positive: 7548, number of negative: 38061
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5121
[LightGBM] [Info] Number of data points in the train set: 45609, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165494 -> initscore=-1.617908
[LightGBM] [Info] Start train

In [26]:
# Hold-out ROC AUC of the stacked model (positive-class probabilities)
stack_test_proba = stack_model.predict_proba(X_test_prep)[:, 1]
print('Stacking Classifier ROC AUC:', roc_auc_score(y_test, stack_test_proba))

Stacking Classifier ROC AUC: 0.9570040489738689


# Submission

In [27]:
# Refit the stack on every labelled row before predicting the submission
X_train_all = preprocessor.transform(X)

stack_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
)

stack_model.fit(X_train_all, y)

[LightGBM] [Info] Number of positive: 14152, number of negative: 71366
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5140
[LightGBM] [Info] Number of data points in the train set: 85518, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165486 -> initscore=-1.617966
[LightGBM] [Info] Start training from score -1.617966
[LightGBM] [Info] Number of positive: 11322, number of negative: 57092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5137
[LightGBM] [Info] Number of data points in the train set: 68414, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165492 -> initscore=-1.617916
[LightGBM] [Info] Start training from score -1.617916
[LightGBM] [

In [28]:
import joblib

# Save the fitted stack under the project layout; when that path cannot be
# written (OSError, e.g. the directory does not exist) fall back to the
# working directory. Other errors are no longer swallowed.
try:
    joblib.dump(stack_model, '../models/stack_model_FeatEng.pkl')
except OSError:
    joblib.dump(stack_model, 'stack_model_FeatEng.pkl')

# Positive-class probabilities for the rows to predict
X_to_pred = preprocessor.transform(data_to_predict)
y_pred_submit = stack_model.predict_proba(X_to_pred)[:, 1]

submission = pd.DataFrame({'id': data_to_predict['id'], 'loan_status': y_pred_submit})
# Same fallback for the submission file: pandas raises OSError when the target
# directory is missing.
try:
    submission.to_csv('../submissions/stack_model_FeatEng.csv', index=False)
except OSError:
    submission.to_csv('stack_model_FeatEng.csv', index=False)

