In [2]:
import numpy as np
import xgboost as xgb
import pandas as pd
# import category_encoders as ce
import sklearn
# from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.utils.multiclass import unique_labels
# import csv


In [96]:
train_data = pd.read_csv('../data/LOANS_TRAIN.csv')
test_data = pd.read_csv('../data/LOANS_TEST.csv')

id_column = train_data

train_data.drop(columns=['id','grade', 'emp_title', 'title'], axis=1, inplace=True)
test_data.drop(columns=['id','grade', 'emp_title', 'title'], axis=1, inplace=True)

labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
train_data['sub_grade'] = labelencoder.fit_transform(train_data['sub_grade'])

# Strip percent(%) from int_rate
train_data['int_rate'] = train_data['int_rate'].str.rstrip('%').astype(float)
test_data['int_rate'] = test_data['int_rate'].str.rstrip('%').astype(float)

#Strip percent(%) from revol_util
train_data['revol_util'] = train_data['revol_util'].str.rstrip('%').astype(float)
test_data['revol_util'] = test_data['revol_util'].str.rstrip('%').astype(float)

X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
X_test = test_data.iloc[:,:]

X_train_numeric = X_train.select_dtypes(include=np.number)
X_test_numeric = X_train.select_dtypes(include=np.number)
# y_train_numeric = y_train.select_dtypes(include=np.number)
y_train_numeric = y_train.copy(deep=False)
y_train_numeric.replace('Fully Paid', 0.0, inplace=True)
y_train_numeric.replace('Charged Off', 1.0, inplace=True)


In [93]:
X_train.head()

Unnamed: 0,loan_amnt,term_(months),int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,...,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies
0,5000,36,10.65,162.87,6,10+ years,RENT,24000.0,Verified,Dec-2011,...,Jan-1985,3,0,13648,83.7,9,f,Individual,,0.0
1,2500,60,15.27,59.83,13,< 1 year,RENT,30000.0,Source Verified,Dec-2011,...,Apr-1999,3,0,1687,9.4,4,f,Individual,,0.0
2,2400,36,15.96,84.33,14,10+ years,RENT,12252.0,Not Verified,Dec-2011,...,Nov-2001,2,0,2956,98.5,10,f,Individual,,0.0
3,10000,36,13.49,339.31,10,10+ years,RENT,49200.0,Source Verified,Dec-2011,...,Feb-1996,10,0,5598,21.0,37,f,Individual,,0.0
4,3000,60,12.69,67.79,9,1 year,RENT,80000.0,Source Verified,Dec-2011,...,Jan-1996,15,0,27783,53.9,38,f,Individual,,0.0


In [97]:
xg_train, xg_test, xg_ytrain, xg_ytest = sklearn.model_selection.train_test_split(
    X_train_numeric, y_train_numeric, test_size=0.2, random_state=0)
print(xg_train.shape, xg_test.shape, xg_ytrain.shape, xg_ytest.shape)

(157800, 14) (39450, 14) (157800,) (39450,)


In [98]:
def auc(m, xtrain, xtest): 
    return (metrics.roc_auc_score(xg_ytrain,m.predict_proba(xtrain)[:,1], average='weighted'),
                            metrics.roc_auc_score(xg_ytest,m.predict_proba(xtest)[:,1], average='weighted'))

import warnings
warnings.filterwarnings("ignore")

# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [5, 6, 8],
              "min_child_weight" : [5, 7, 12],
              "n_estimators": [2],
              "learning_rate": [0.01, .05],
              "gamma" : [5]}
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=10, n_jobs=-1)
grid_search.fit(xg_train, xg_ytrain)

print(grid_search.best_estimator_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV 1/3; 2/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2[CV 3/3; 1/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2

[CV 1/3; 1/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2
[CV 1/3; 3/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=12, n_estimators=2
[CV 3/3; 2/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2
[CV 2/3; 1/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2
[CV 2/3; 3/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=12, n_estimators=2
[CV 2/3; 2/18] START gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2




[CV 1/3; 1/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.9s
[CV 2/3; 1/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2;, score=0.846 total time=   0.9s
[CV 3/3; 1/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.9s
[CV 2/3; 3/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.9s
[CV 1/3; 3/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.847 total time=   1.0s
[CV 2/3; 2/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2;, score=0.846 total time=   1.0s
[CV 1/3; 2/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2;, score=0.847 total time=   1.0s
[CV 3/3; 2/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=2



[CV 1/3; 5/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2
[CV 2/3; 5/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2
[CV 3/3; 5/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2
[CV 1/3; 6/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2




[CV 3/3; 3/18] END gamma=5, learning_rate=0.01, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.847 total time=   0.5s
[CV 2/3; 6/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2




[CV 1/3; 4/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.8s
[CV 2/3; 4/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.846 total time=   0.8s
[CV 3/3; 6/18] START gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2
[CV 3/3; 4/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.9s
[CV 1/3; 5/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.847 total time=   0.9s
[CV 1/3; 7/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2
[CV 2/3; 5/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.846 total time=   0.8s




[CV 3/3; 5/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.847 total time=   0.8s
[CV 1/3; 6/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.847 total time=   0.8s
[CV 2/3; 6/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.7s
[CV 2/3; 7/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2




[CV 3/3; 6/18] END gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.847 total time=   0.4s
[CV 3/3; 7/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2
[CV 1/3; 8/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2
[CV 2/3; 8/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2
[CV 3/3; 8/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2




[CV 1/3; 9/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2
[CV 2/3; 9/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2
[CV 1/3; 7/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.845 total time=   0.8s
[CV 3/3; 9/18] START gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2




[CV 2/3; 7/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.844 total time=   1.1s
[CV 1/3; 10/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=2




[CV 3/3; 7/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.845 total time=   1.2s
[CV 1/3; 8/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.845 total time=   1.2s
[CV 2/3; 8/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.845 total time=   1.2s
[CV 3/3; 8/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.845 total time=   1.2s
[CV 1/3; 9/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.845 total time=   1.1s
[CV 2/3; 9/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.845 total time=   1.1s
[CV 3/3; 9/18] END gamma=5, learning_rate=0.01, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.9s
[CV 1/3; 10/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators



[CV 1/3; 11/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2
[CV 2/3; 11/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2
[CV 3/3; 11/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2
[CV 1/3; 12/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2




[CV 3/3; 12/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2
[CV 2/3; 12/18] START gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2
[CV 2/3; 10/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=2;, score=0.846 total time=   0.6s
[CV 1/3; 13/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2
[CV 3/3; 10/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.7s




[CV 1/3; 11/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2;, score=0.847 total time=   0.7s
[CV 2/3; 13/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2
[CV 2/3; 11/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2;, score=0.846 total time=   0.8s




[CV 3/3; 11/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=7, n_estimators=2;, score=0.847 total time=   0.8s
[CV 2/3; 12/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.7s
[CV 1/3; 12/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.847 total time=   0.8s
[CV 3/3; 12/18] END gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=12, n_estimators=2;, score=0.847 total time=   0.7s
[CV 3/3; 13/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2
[CV 1/3; 14/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=7, n_estimators=2
[CV 1/3; 13/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.6s
[CV 2/3; 13/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.846 total time=   0.4s
[CV 2/3; 14/18] STA



[CV 1/3; 15/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2
[CV 3/3; 13/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=2;, score=0.847 total time=   0.4s
[CV 2/3; 15/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2
[CV 3/3; 15/18] START gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2
[CV 1/3; 16/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2




[CV 2/3; 16/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2




[CV 1/3; 14/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.847 total time=   0.8s
[CV 3/3; 16/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2
[CV 2/3; 14/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.846 total time=   1.0s




[CV 3/3; 14/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=7, n_estimators=2;, score=0.847 total time=   1.1s
[CV 1/3; 17/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2
[CV 1/3; 15/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.847 total time=   1.1s
[CV 2/3; 15/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.846 total time=   1.1s
[CV 2/3; 17/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2




[CV 3/3; 15/18] END gamma=5, learning_rate=0.05, max_depth=6, min_child_weight=12, n_estimators=2;, score=0.847 total time=   1.1s
[CV 3/3; 17/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2
[CV 1/3; 18/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2




[CV 2/3; 18/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2
[CV 1/3; 16/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.846 total time=   1.4s
[CV 2/3; 16/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.845 total time=   1.4s
[CV 3/3; 18/18] START gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2




[CV 3/3; 16/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=2;, score=0.845 total time=   1.3s
[CV 1/3; 17/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.846 total time=   1.1s
[CV 2/3; 17/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.846 total time=   1.1s
[CV 1/3; 18/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.846 total time=   1.0s
[CV 3/3; 17/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=7, n_estimators=2;, score=0.845 total time=   1.0s
[CV 2/3; 18/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.9s
[CV 3/3; 18/18] END gamma=5, learning_rate=0.05, max_depth=8, min_child_weight=12, n_estimators=2;, score=0.846 total time=   0.7s
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              cols

In [100]:
model = xgb.XGBClassifier(max_depth=6, min_child_weight=1,  n_estimators=8,
                          n_jobs=-1, verbose=1,learning_rate=0.01, gamma=0)
model.fit(xg_train,xg_ytrain)
predictions = model.predict_proba(xg_test)[:,1]

print(auc(model, xg_train, xg_test))


Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


(0.6890205241237751, 0.6737871972299316)
