<a href="https://colab.research.google.com/github/Mayada98/Useful_DataScience/blob/main/XGBoost_Hyperparameter_optimization_using_hypopt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
from hypopt import GridSearch
from xgboost import XGBRegressor

#### 1. load the data, apply WOE and use it for modeling

In [6]:
data = pd.read_csv('readydata.csv')
droplist = ['total_pymnt','total_pymnt_inv','total_rec_late_fee','recoveries', 'collection_recovery_fee','last_pymnt_amnt']
data = data.drop(droplist,axis=1)

In [7]:
data.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,inq_last_6mths,pub_rec,revol_util,total_acc,fico_average,term,grade,target
0,5000.0,5000.0,4975.0,10.65,24000.0,27.65,1.0,0.0,83.7,9.0,737.0,0.741507,2,1
1,2500.0,2500.0,2500.0,15.27,30000.0,1.0,5.0,0.0,9.4,4.0,742.0,0.258493,3,0
2,2400.0,2400.0,2400.0,15.96,12252.0,8.72,2.0,0.0,98.5,10.0,737.0,0.741507,3,1
3,10000.0,10000.0,10000.0,13.49,49200.0,20.0,1.0,0.0,21.0,37.0,692.0,0.741507,3,1
4,5000.0,5000.0,5000.0,7.9,36000.0,11.2,3.0,0.0,28.3,12.0,732.0,0.741507,1,1


In [8]:
def encode(df, target , method = "WoE", index=None):
    if method == 'WoE':
        X = df.drop(target, axis = 1)
        for var in X:
            df[var] = df[var].fillna('missing')
            k = df[[var,target]].groupby(var)[target].agg(['count','sum']).reset_index()
            k.columns = [var,'Count','Good']
            k['Bad'] = k['Count'] - k['Good']
            k['Good %'] = (k['Good'] / k['Good'].sum()*100).round(2)
            k['Bad %'] = (k['Bad'] / k['Bad'].sum()*100).round(2)
            k[var+'_WOE'] = np.log(k['Good %'] / k['Bad %']).round(2)
            k = k.sort_values(by=var+'_WOE')
            dictionary = dict(zip(k[var], k[var+'_WOE']))
            new_dict = {var : dictionary}
            df = df.replace(new_dict)

    elif method == 'target_enc':
        X = df.drop(target, axis = 1)
        index = df.index if index is None else index # Encode the entire input df if no specific indices is supplied

        for var in X:
            k = df.iloc[index].groupby(var)[target].mean().reset_index()
            k.columns = [var, var+'_Mean']
            k = k.sort_values(by=var+'_Mean')
            dictionary = dict(zip(k[var], k[var+'_Mean']))
            new_dict = {var : dictionary}
            df = df.replace(new_dict)
    
    return df

#### 2.Splitting data for modeling

In [9]:
X = data.drop('target', axis=1)
Y = data.target 
X_shuffled = X.sample(frac=1)
y_shuffled = Y[X_shuffled.index]

X_train, X_validation, X_test = np.split(X_shuffled, [int(0.6*len(X)),int(0.8*len(X))])
y_train, y_validation, y_test = np.split(y_shuffled, [int(0.6*len(X)),int(0.8*len(X))])

#### 3. Define model and GridSearch Parameters

In [12]:
model = XGBRegressor(max_depth = 3, min_child_weight = 2, subsample = 0.9, colsample_bytree = 0.6)

# space = {'n_estimators': range(1000,2000,100),
#          'learning_rate': [i/100 for i in range(1,10,2)],
#          'max_depth': range(3,10,2),
#          'subsample': [i/100 for i in range(6,10,1)],
#          'colsample_bytree': [i/100 for i in range(6,10,1)]
#          }
space = {'n_estimators': [100,200],
         'learning_rate':[0.01, 0.05],
         'max_depth':[6,7],
         'subsample': [0.01,0.05],
         'colsample_bytree': [0.01,0.05]
         }

opt = GridSearch(model = model, param_grid = space)

In [13]:
# tuning only learning rate and n_estimators
opt.fit(X_train, y_train, [X_validation, y_validation], scoring = 'r2')
print('Best score from GridSearch :',opt.best_score)
print('Best parameters:' ,opt.best_params)

Best score from GridSearch : 0.04095574132704124
Best parameters: {'colsample_bytree': 0.01, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.05}
