In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import warnings


In [2]:
train_data = pd.read_csv('../data/LOANS_TRAIN.csv')
test_data = pd.read_csv('../data/LOANS_TEST.csv')

id_column = train_data

train_data.drop(columns=['id','grade', 'emp_title', 'title'], axis=1, inplace=True)
test_data.drop(columns=['id','grade', 'emp_title', 'title'], axis=1, inplace=True)

labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
train_data['sub_grade'] = labelencoder.fit_transform(train_data['sub_grade'])
train_data['home_ownership'] = labelencoder.fit_transform(train_data['home_ownership'])
train_data['emp_length'].replace('< 1 year', 0.5, inplace=True)
train_data['emp_length'].replace('1 year', 1.0, inplace=True)
train_data['emp_length'].replace('2 years', 2.0, inplace=True)
train_data['emp_length'].replace('3 years', 3.0, inplace=True)
train_data['emp_length'].replace('4 years', 4.0, inplace=True)
train_data['emp_length'].replace('5 years', 5.0, inplace=True)
train_data['emp_length'].replace('6 years', 6.0, inplace=True)
train_data['emp_length'].replace('7 years', 7.0, inplace=True)
train_data['emp_length'].replace('8 years', 8.0, inplace=True)
train_data['emp_length'].replace('9 years', 9.0, inplace=True)
train_data['emp_length'].replace('10 years', 10.0, inplace=True)
train_data['emp_length'].replace('10+ years', 15.0, inplace=True)
train_data['emp_length'] = train_data['emp_length'].fillna(0)

train_data['mort_acc'] = train_data['mort_acc'].fillna(0)
# Strip percent(%) from int_rate
train_data['int_rate'] = train_data['int_rate'].str.rstrip('%').astype(float)
test_data['int_rate'] = test_data['int_rate'].str.rstrip('%').astype(float)

#Strip percent(%) from revol_util
train_data['revol_util'] = train_data['revol_util'].str.rstrip('%').astype(float)
test_data['revol_util'] = test_data['revol_util'].str.rstrip('%').astype(float)

X_train = train_data.iloc[:,:-1]
y_train = train_data.iloc[:,-1]
X_test = test_data.iloc[:,:]

X_train_numeric = X_train.select_dtypes(include=np.number)
X_test_numeric = X_train.select_dtypes(include=np.number)
# y_train_numeric = y_train.select_dtypes(include=np.number)
y_train_numeric = y_train.copy(deep=False)
y_train_numeric.replace('Fully Paid', 0.0, inplace=True)
y_train_numeric.replace('Charged Off', 1.0, inplace=True)


## Adding artificial Data

In [3]:
print(type(X_train_numeric), type(y_train_numeric), type(X_test))

# print("BEFORE")
# print(X_train_numeric.shape)
# print(y_train_numeric.shape)
# print('AFTER')
# print(X_balanced.shape)
# print(Y_balanced.shape)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'>


In [4]:
X_train_numeric.head()

Unnamed: 0,loan_amnt,term_(months),int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies
0,5000,36,10.65,162.87,6,15.0,4,24000.0,27.65,3,0,13648,83.7,9,0.0,0.0
1,2500,60,15.27,59.83,13,0.5,4,30000.0,1.0,3,0,1687,9.4,4,0.0,0.0
2,2400,36,15.96,84.33,14,15.0,4,12252.0,8.72,2,0,2956,98.5,10,0.0,0.0
3,10000,36,13.49,339.31,10,15.0,4,49200.0,20.0,10,0,5598,21.0,37,0.0,0.0
4,3000,60,12.69,67.79,9,1.0,4,80000.0,17.94,15,0,27783,53.9,38,0.0,0.0


In [5]:
# xg_train, xg_test, xg_ytrain, xg_ytest = sklearn.model_selection.train_test_split(
#     X_train_numeric, y_train_numeric, test_size=0.2, random_state=0)

# print(xg_train.shape, xg_test.shape, xg_ytrain.shape, xg_ytest.shape)
# print(type(xg_train), type(xg_test), type(xg_ytrain), type(xg_ytest))

In [6]:
def artificial_data(x_train, y_train):
    X_balanced = []
    Y_balanced = []
    for i in range(x_train.shape[0]):
        curr_df = x_train.iloc[i,:]
        d = curr_df.to_dict()
        label = y_train.iloc[i]
        if label == 1:
            for _ in range(4):
                X_balanced.append(d)
                Y_balanced.append(label)
        X_balanced.append(d)
        Y_balanced.append(label)
    xg_train = pd.DataFrame(X_balanced)
    xg_ytrain = pd.Series(Y_balanced)
    return (xg_train, xg_ytrain)

### Accuracy Measurement

In [7]:
def auc(m, x_train, x_test, y_train, y_test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(x_train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(x_test)[:,1]))

## Running XGBoost

In [8]:
n_folds = 5
kf = KFold(n_folds, shuffle = True)
training_accuracy_scores = []
testing_accuracy_scores = []
for train_idx, test_idx in kf.split(X_train_numeric):
    kf_Xtrain, xg_kf_Xtest = X_train_numeric.iloc[train_idx], X_train_numeric.iloc[test_idx]
    kf_ytrain, xg_kf_ytest = y_train_numeric.iloc[train_idx], y_train_numeric.iloc[test_idx]
    xg_kf_Xtrain, xg_kf_ytrain  = artificial_data(kf_Xtrain, kf_ytrain)
    model = xgb.XGBClassifier(max_depth=7, min_child_weight=1, learning_rate=0.2, subsample=0.95,
                          colsample_bytree=0.95,silent=1, feature_selected=["ohe","lin"],
                          objective='binary:logistic', eval_metric='auc', num_boost_round=36)
    model.fit(xg_kf_Xtrain,xg_kf_ytrain)
    predictions = model.predict_proba(xg_kf_Xtest)[:,1]
    scores = auc(model, xg_kf_Xtrain, xg_kf_Xtest, xg_kf_ytrain, xg_kf_ytest)
    training_accuracy_scores.append(scores[0])
    testing_accuracy_scores.append(scores[1])
print(f'Max training score is {max(training_accuracy_scores)}\n')
print(f'Average training score is {np.average(training_accuracy_scores)}\n')
print(f'Max testing score is {max(testing_accuracy_scores)}\n')
print(f'Average testing score is {np.average(testing_accuracy_scores)}\n')




Parameters: { "feature_selected", "silent", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "feature_selected", "silent", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "feature_selected", "silent", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "feature_selected", "silent", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "feature_selected", "silent", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Max training score is 0.7987879793911465

Average training score is 0.797107131942247

Max testing score is 0.6817740152643275

Average testing score is 0.6774012345300295



In [9]:

# model = xgb.XGBClassifier(max_depth=10, min_child_weight=2,  n_estimators=100,
#                           n_jobs=-1,learning_rate=0.01, gamma=4)
# model.fit(xg_train,xg_ytrain)
# predictions1 = model.predict_proba(xg_test)[:,1]
# predictions2 = model.predict_proba(xg_test)
# print(predictions2)

# print(auc(model, xg_train, xg_test))

### Grid Search for Hyperparameters

In [10]:
# # Parameter Tuning
# model = xgb.XGBClassifier()
# param_dist = {"max_depth": [5, 10, 20, 50],
#               "min_child_weight" : [1, 2, 5, 10],
#               "n_estimators": [20, 50, 100, 200, 300, 500],
#               "learning_rate": [0.01, .02, .03, 0.05, .1],
#               "gamma" : [0,1,4,10],
#               "random_state" : [0, 20, 40, 100]}
# grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
#                                    verbose=10, n_jobs=-1)
# grid_search.fit(xg_train, xg_ytrain)

# print(grid_search.best_estimator_)