In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
data = pd.read_csv('../data/train.csv')
data_to_predict = pd.read_csv('../data/test.csv')

In [16]:
X = data.drop(['loan_status'], axis=1)
y = data['loan_status']

# Preprocessing

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.2, random_state=2024)

In [19]:
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('id')

In [20]:
categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns)
	])

preprocessor.fit(X_train)

In [22]:
X_train_prep = preprocessor.transform(X_train)

In [23]:
X_test_prep = preprocessor.transform(X_test)

# XGBoost 1

In [11]:
import xgboost as xgb

In [12]:
dtrain = xgb.DMatrix(X_train_prep, label=y_train)
dtest = xgb.DMatrix(X_test_prep, label=y_test)

In [81]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist = [(dtest, 'evals'), (dtrain, 'train')]

In [82]:
num_round = 200
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

[0]	evals-auc:0.84874	train-auc:0.85211
[1]	evals-auc:0.89404	train-auc:0.89603
[2]	evals-auc:0.91506	train-auc:0.91204
[3]	evals-auc:0.91151	train-auc:0.90937
[4]	evals-auc:0.91567	train-auc:0.91377
[5]	evals-auc:0.92012	train-auc:0.91799
[6]	evals-auc:0.92433	train-auc:0.92331
[7]	evals-auc:0.92531	train-auc:0.92412




[8]	evals-auc:0.92577	train-auc:0.92468
[9]	evals-auc:0.92615	train-auc:0.92539
[10]	evals-auc:0.92894	train-auc:0.92867
[11]	evals-auc:0.93016	train-auc:0.92975
[12]	evals-auc:0.93167	train-auc:0.93232
[13]	evals-auc:0.93221	train-auc:0.93316
[14]	evals-auc:0.93251	train-auc:0.93337
[15]	evals-auc:0.93540	train-auc:0.93648
[16]	evals-auc:0.93587	train-auc:0.93739
[17]	evals-auc:0.93658	train-auc:0.93788
[18]	evals-auc:0.93690	train-auc:0.93849
[19]	evals-auc:0.93832	train-auc:0.94010
[20]	evals-auc:0.93911	train-auc:0.94100
[21]	evals-auc:0.93925	train-auc:0.94135
[22]	evals-auc:0.94027	train-auc:0.94225
[23]	evals-auc:0.94087	train-auc:0.94260
[24]	evals-auc:0.94117	train-auc:0.94300
[25]	evals-auc:0.94142	train-auc:0.94359
[26]	evals-auc:0.94217	train-auc:0.94404
[27]	evals-auc:0.94431	train-auc:0.94584
[28]	evals-auc:0.94504	train-auc:0.94679
[29]	evals-auc:0.94493	train-auc:0.94705
[30]	evals-auc:0.94497	train-auc:0.94722
[31]	evals-auc:0.94600	train-auc:0.94822
[32]	evals-auc:0.9

## Cross Validation

In [83]:
cv_results = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, early_stopping_rounds=10)

In [84]:
cv_results = pd.DataFrame(cv_results)
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.852115,0.001657,0.852133,0.006689
1,0.896010,0.001020,0.895922,0.004115
2,0.909599,0.005579,0.908656,0.004375
3,0.911986,0.002143,0.910645,0.004066
4,0.915743,0.001904,0.914177,0.003974
...,...,...,...,...
122,0.964160,0.001351,0.953903,0.001849
123,0.964199,0.001349,0.954030,0.001753
124,0.964273,0.001387,0.954096,0.001783
125,0.964377,0.001392,0.954260,0.001760


## Hyperparameter Tuning

## 1st Try

GridSearchCV:
```python
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
	}

Best_Parameters = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best_AUC_Score = 0.9528388314968684
```

## 2nd Try

GridSearchCV:

```python
param_grid = {
    'max_depth': [5, 7],
    'learning_rate': [0.1, 0.2, 0.5],
    'n_estimators': [200, 300, 500],
	'subsample': [0.8],
    'colsample_bytree': [0.8],             
    'gamma': [0, 0.1, 0.3],                     
}

Best_Parameters = {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
Best_AUC_Score = 0.9543454696625759
```

## 3rd Try

RandomizedSearchCV:

```python
param_dist = {
	'max_depth': np.arange(3, 12),
    'learning_rate': np.linspace(0.01, 0.3, 20),
    'n_estimators': np.arange(300, 600, 25),
    'subsample': np.linspace(0.6, 1.0, 10),
    'colsample_bytree': np.linspace(0.5, 1.0, 10),
    'gamma': np.linspace(0, 0.3, 20),
    'min_child_weight': np.arange(1, 9),
}

Best_params = {
	'subsample': 0.9111111111111111,
	'n_estimators': 525,
	'min_child_weight': 5,
	'max_depth': 5,
	'learning_rate': 0.07105263157894737,
	'gamma': 0.3,
	'colsample_bytree': 0.5
	}

Best_AUC_score =  0.9532664601824515
```