# Imports

In [11]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameter tuning
import optuna

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


# Data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score

# Data preparation

In [10]:
# Data input
# Load the three CSV files this notebook works with.
data_ccrisk = pd.read_csv('../data/credit_risk_dataset.csv')
data_to_predict = pd.read_csv('../data/test.csv')
data = pd.read_csv('../data/train.csv')

# Target / features of the training file on its own ('id' is still a column here).
y_old = data['loan_status']
X_old = data.drop(columns=['loan_status'])

# Merge the dataframes
# 'id' is removed from the training rows before the extra dataset is stacked
# underneath them; ignore_index=True gives the result a fresh 0..n-1 index.
data_no_id = data.drop(columns=['id'])
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

y = merged_data['loan_status']
X = merged_data.drop(columns=['loan_status'])

# Shuffled 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=2024,
)

In [12]:
# Data processing

# Partition the feature columns by dtype so each group gets its own transformer.
categorical_columns = X.select_dtypes(include=['object']).columns

# 'loan_grade' is encoded as a single integer column; every other object
# column is one-hot encoded.
categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessor
preprocessor = ColumnTransformer(
	transformers=[
		# NOTE(review): with default settings OrdinalEncoder orders categories by
		# their sorted unique values; confirm that matches the intended grade ranking.
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		# handle_unknown='ignore' encodes a category that was absent from the
		# fitting data as an all-zero row instead of raising at transform time.
		('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns)
	])

# Fit on the training split only, then apply the fitted transformers to the
# hold-out split so none of its statistics influence the preprocessing.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [14]:
# Assuming you have X_train_prep, y_train, X_test_prep, y_test ready
# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: fit one XGBoost classifier and return its ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample this run's hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on
        (X_test_prep, y_test).

    Notes
    -----
    Reads X_train_prep, y_train, X_test_prep and y_test from the notebook's
    global namespace.

    NOTE(review): the split scored here is the same one the study maximizes
    over, so the best AUC is an optimistic estimate. Consider carving a
    separate validation split (or cross-validating on the training split) and
    keeping X_test_prep for a final, untouched evaluation.
    """
    # Define the hyperparameter search space
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        # Regularisation strengths are sampled on a log scale.
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        # Only consulted when an eval_set is monitored; the training loss
        # itself is determined by 'objective' below.
        'eval_metric': 'auc',
        'objective': 'binary:logistic'  # Use binary logistic since you want probability outputs
    }

    # Initialize the model with current hyperparameters
    model = xgb.XGBClassifier(**param)

    # No early stopping is configured, so every trial trains all n_estimators
    # rounds. Passing an eval_set would only make XGBoost score the hold-out
    # data after each round and throw the result away, so it is omitted.
    model.fit(X_train_prep, y_train)

    # Probability of the positive class (second column) for the hold-out split
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]

    # The value Optuna maximizes
    return roc_auc_score(y_test, y_pred_prob)

# Create a study object and maximize the objective
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable across kernel restarts; 2024 matches the random_state used for
# the train/test split. TPESampler is Optuna's default for single-objective
# studies, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',  # Because we want to maximize AUC
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=50)

# Output the best trial
print(f"Best trial: {study.best_trial.params}")

[I 2024-10-09 21:08:31,030] A new study created in memory with name: no-name-ef6fb6ae-f37b-4b49-abb2-ddc88c5b12f3
[I 2024-10-09 21:08:33,361] Trial 0 finished with value: 0.939108817007827 and parameters: {'max_depth': 3, 'learning_rate': 0.07465517037951955, 'n_estimators': 135, 'subsample': 0.7369438291641859, 'colsample_bytree': 0.8923076783150959, 'gamma': 0.30157194902631745, 'lambda': 1.6268250332604492, 'alpha': 1.2805043154157706e-05, 'scale_pos_weight': 2.6463118946663324}. Best is trial 0 with value: 0.939108817007827.
[I 2024-10-09 21:08:37,561] Trial 1 finished with value: 0.9550097865450714 and parameters: {'max_depth': 4, 'learning_rate': 0.09141033241296674, 'n_estimators': 226, 'subsample': 0.7663718146248113, 'colsample_bytree': 0.6098644847778827, 'gamma': 0.23092627109255753, 'lambda': 0.31921581391424503, 'alpha': 6.129147756852991, 'scale_pos_weight': 2.191348000078599}. Best is trial 1 with value: 0.9550097865450714.
[I 2024-10-09 21:08:43,280] Trial 2 finished wi