In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../src")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle

from sklearn.model_selection import train_test_split
from sklearn import linear_model, tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from data_process import load_data, data_preprocessor
from testing import test_model


### Loading the data

In [3]:
# Location of the raw accepted-loans CSV, relative to the notebook's directory.
file_path = "../data/accepted_2007_to_2018Q4.csv"

# load_data is the project helper imported from data_process; the result is
# later used with .sample() and .drop(), so it is presumably a pandas
# DataFrame -- TODO confirm against data_process.load_data.
data = load_data(file_path)

# Display the number of rows that were loaded.
len(data)

1382382

In [4]:
# Work on a 10% random subsample of the rows; random_state=10 makes the draw
# repeatable from the same input frame.
# NOTE(review): this rebinds `data` to its own subsample, so the cell is not
# idempotent -- re-running it without re-running the load cell samples 10% of
# the already-reduced frame. Consider keeping the full frame under its own name.
data = data.sample(frac=0.1, random_state=10)
# Display the row count after subsampling.
len(data)

138238

In [5]:
# The label is the loan_status column; every remaining column is a feature.
y = data['loan_status']
X = data.drop(columns=['loan_status'])

# Hold out 5% of the rows for testing. The fixed seed keeps the split
# repeatable across runs.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

### Preprocessing the data

In [6]:
# Build the preprocessing object from the training frame. data_preprocessor is
# the project helper imported from data_process; it presumably returns a
# scikit-learn style transformer (it is used via fit_transform / transform
# below) -- TODO confirm what it derives from X_train_df.
preprocessor = data_preprocessor(X_train_df)

# Fit on the training split only, then reuse the fitted preprocessor on the
# test split so the test rows do not influence the fitted transformation.
X_train = preprocessor.fit_transform(X_train_df)
X_test = preprocessor.transform(X_test_df)

### Initializing and tuning the model

In [7]:
# Baseline XGBoost classifier; everything not tuned below keeps its
# constructor default.
xgb_model = xgb.XGBClassifier()

# Search space: only tree depth and learning rate are tuned here
# (3 x 3 = 9 candidates). Other knobs such as n_estimators, gamma, subsample,
# colsample_bytree, reg_alpha and reg_lambda are not part of the search.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.5, 1],
)

# Exhaustive search over the grid, scored by macro-averaged F1 with
# 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best hyperparameters: {'learning_rate': 1, 'max_depth': 3}
Best score: 0.9986087498661481


### Testing the models

#### On the training set

In [8]:
# Evaluate the fitted search object itself. The search above does not override
# `refit`, so with scikit-learn's default it should delegate predictions to the
# best estimator refit on the full training data -- verify against test_model.
model = grid_search

In [10]:
# Score the model on the data it was fitted on (an optimistic reference point
# to compare against the held-out score). test_model is the project helper
# imported from testing; per the displayed output it returns a dict with
# accuracy, sensitivity, specificity and AUC.
# NOTE(review): `model_res` is reassigned by the test-set evaluation further
# down, so the training metrics are not kept under their own name.
model_res = test_model(model, X_train, y_train)
model_res

{'accuracy': 1.0,
 'sensitivity': np.float64(1.0),
 'specificity': np.float64(1.0),
 'AUC': np.float64(1.0)}

#### On the testing set

In [11]:

# Score the same model on the held-out 5% test split; this rebinds `model_res`,
# replacing the training-set metrics computed earlier.
model_res = test_model(model, X_test, y_test)
model_res

{'accuracy': 0.9985532407407407,
 'sensitivity': np.float64(0.9941672067401166),
 'specificity': np.float64(0.9998137455764574),
 'AUC': np.float64(0.9999687362957242)}

In [None]:
# Randomized search over a wider space than the grid search above: it also
# varies the number of boosting rounds. RandomizedSearchCV is already imported
# in the notebook's import cell, so it is not re-imported here.

# Candidate values for each hyperparameter (3 x 3 x 3 = 27 combinations).
# This deliberately rebinds `param_grid`, as the original cell did.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.5, 1],
    'n_estimators': [50, 100, 200],
}

# Perform random search with 5-fold cross-validation. Only n_iter=10 of the
# combinations are tried; random_state pins which ones are drawn so the
# result is reproducible.
random_search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='f1_macro',
    random_state=42,
)

# Fit on the preprocessed training split only. Fitting on the raw X / y would
# bypass the preprocessor and include the held-out test rows, so the test
# metrics would no longer be an honest estimate.
random_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding score
print("Best hyperparameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)