In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../data/train.csv')
data_to_predict = pd.read_csv('../data/test.csv')

In [3]:
X = data.drop(['loan_status'], axis=1)
y = data['loan_status']

# Preprocessing

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.2, random_state=2024)

In [6]:
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('id')

In [8]:
categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns)
	])

preprocessor.fit(X_train)

In [10]:
X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# XGBoost 1

In [11]:
import xgboost as xgb

In [12]:
dtrain = xgb.DMatrix(X_train_prep, label=y_train)
dtest = xgb.DMatrix(X_test_prep, label=y_test)

In [13]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist = [(dtest, 'evals'), (dtrain, 'train')]

In [17]:
num_round = 200
bst = xgb.train(param, dtrain, num_round, evals=evallist, early_stopping_rounds=10)

[0]	evals-auc:0.85846	train-auc:0.84970
[1]	evals-auc:0.90374	train-auc:0.89342
[2]	evals-auc:0.92150	train-auc:0.91051
[3]	evals-auc:0.92449	train-auc:0.91532
[4]	evals-auc:0.92221	train-auc:0.91635
[5]	evals-auc:0.92655	train-auc:0.92122
[6]	evals-auc:0.92911	train-auc:0.92333
[7]	evals-auc:0.92905	train-auc:0.92358
[8]	evals-auc:0.92944	train-auc:0.92476
[9]	evals-auc:0.93008	train-auc:0.92546
[10]	evals-auc:0.93080	train-auc:0.92721
[11]	evals-auc:0.93221	train-auc:0.92797
[12]	evals-auc:0.93658	train-auc:0.93142
[13]	evals-auc:0.93596	train-auc:0.93186
[14]	evals-auc:0.93878	train-auc:0.93452
[15]	evals-auc:0.94216	train-auc:0.93699
[16]	evals-auc:0.94217	train-auc:0.93766
[17]	evals-auc:0.94211	train-auc:0.93773
[18]	evals-auc:0.94391	train-auc:0.93873
[19]	evals-auc:0.94569	train-auc:0.94089
[20]	evals-auc:0.94699	train-auc:0.94272
[21]	evals-auc:0.94988	train-auc:0.94520
[22]	evals-auc:0.95041	train-auc:0.94554
[23]	evals-auc:0.95024	train-auc:0.94565
[24]	evals-auc:0.95080	tra

## Cross Validation

In [18]:
cv_results = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, early_stopping_rounds=10)

In [19]:
cv_results = pd.DataFrame(cv_results)
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.849701,0.001215,0.849698,0.004915
1,0.893425,0.001488,0.893406,0.005946
2,0.910265,0.002157,0.910567,0.005913
3,0.913823,0.002150,0.913497,0.007406
4,0.915833,0.001926,0.914388,0.006349
...,...,...,...,...
105,0.961658,0.000920,0.952122,0.002575
106,0.961728,0.000946,0.952208,0.002496
107,0.961838,0.000949,0.952171,0.002550
108,0.961955,0.000972,0.952240,0.002540


## Hyperparameter Tuning

## 1st Try

GridSearchCV:
```python
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
	}

Best_Parameters = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best_AUC_Score = 0.9528388314968684
```

## 2nd Try

GridSearchCV:

```python
param_grid = {
    'max_depth': [5, 7],
    'learning_rate': [0.1, 0.2, 0.5],
    'n_estimators': [200, 300, 500],
	'subsample': [0.8],
    'colsample_bytree': [0.8],             
    'gamma': [0, 0.1, 0.3],                     
}

Best_Parameters = {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
Best_AUC_Score = 0.9543454696625759
```

## 3rd Try

RandomizedSearchCV:

```python
param_dist = {
	'max_depth': np.arange(3, 12),
	'learning_rate': np.linspace(0.01, 0.3, 20),
	'n_estimators': np.arange(300, 600, 25),
	'subsample': np.linspace(0.6, 1.0, 10),
	'colsample_bytree': np.linspace(0.5, 1.0, 10),
	'gamma': np.linspace(0, 0.3, 20),
	'min_child_weight': np.arange(1, 9),
}

Best_params = {
	'subsample': 0.9111111111111111,
	'n_estimators': 525,
	'min_child_weight': 5,
	'max_depth': 5,
	'learning_rate': 0.07105263157894737,
	'gamma': 0.3,
	'colsample_bytree': 0.5
	}

Best_AUC_score =  0.9532664601824515
```

In [20]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [21]:
param_grid = {
    'max_depth': [4, 5, 6],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [250, 300, 350],
	'subsample': [0.8],
    'colsample_bytree': [0.8],             
    'gamma': [0, 0.02, 0.05],                     
}

xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc')

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

In [22]:
grid_search.fit(X_train_prep, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [23]:
grid_search.best_params_, grid_search.best_score_

({'colsample_bytree': 0.8,
  'gamma': 0.05,
  'learning_rate': 0.05,
  'max_depth': 6,
  'n_estimators': 350,
  'subsample': 0.8},
 0.9524521603943435)