# **Hyper Parameter Tuning on XGBClassifier**
1. RandomizedSearchCV
2. GridSearchCV
3. Hyperopt
4. TPOT (Genetic) Classifier
5. Optuna Optimizer


# Importing Libraries

In [None]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark")
plt.style.use('dark_background')
print("Setup Complete")

# Importing Dataset

In [None]:
# Load the two source tables; each is indexed by its own id column.
machine_data = pd.read_csv("../input/machine-failure-predictions/machine failure.csv", index_col="UDI")
machine_data1 = pd.read_csv("../input/playground-series-s3e17/train.csv", index_col="id")

# Stack the rows of both tables. DataFrame.append was removed in pandas 2.0,
# so pd.concat is used; like append, it keeps each table's original index labels.
machine_data = pd.concat([machine_data, machine_data1])
machine_data.head()

# Data Cleaning

In [None]:
from sklearn.preprocessing import LabelEncoder


# The product identifier column is removed before modelling.
machine_data.drop(columns=["Product ID"], inplace=True)

# Replace the raw headers with short, code-friendly names.
# NOTE(review): this is a positional rename — it assumes exactly these twelve
# columns remain, in this order. Confirm against the CSV headers.
machine_data.columns = [
    'Type', 'Air_Temp_K', 'Process_Temp_K', 'Rot_Speed', 'Torque',
    'Tool_Wear', 'Machine_Failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
]
catDTypeCols = ['Type']

# Integer-encode every categorical column listed above.
encoder = LabelEncoder()
for cat_col in catDTypeCols:
    machine_data[cat_col] = encoder.fit_transform(machine_data[cat_col])

machine_data.head()

# Feature Engineering

In [None]:
# 0 degrees Celsius is 273.15 K (the original subtracted a truncated 273).
KELVIN_OFFSET = 273.15

# Gap between process and ambient temperature.
machine_data["Temp_Diff"] = machine_data["Process_Temp_K"] - machine_data["Air_Temp_K"]

# Celsius versions of both temperature readings.
machine_data["Air_Temp_C"] = machine_data["Air_Temp_K"] - KELVIN_OFFSET
machine_data["Process_Temp_C"] = machine_data["Process_Temp_K"] - KELVIN_OFFSET

# Product of torque and rotational speed, used as a power proxy.
machine_data["Power"] = machine_data["Torque"] * machine_data["Rot_Speed"]

# EDA

In [None]:
# Only columns with more than three distinct values get a histogram.
plotCols = [
    name
    for name in machine_data.columns
    if machine_data[name].nunique(dropna=False) > 3
]

# One figure per selected column, split by the failure flag.
for feature in plotCols:
    plt.figure(figsize=(6, 4))
    sns.histplot(
        data=machine_data,
        x=feature,
        hue="Machine_Failure",
        bins=40,
        kde=True,
        palette="inferno",
    )
    plt.show()

# Training Model

### **XGB Parameters Defaults**

---
-  booster = {*gbtree*, gblinear, dart}
-  verbosity = {0(Silent), 1(Warnings) ,2(Info) ,3(Debug)} 
-  max_depth = 3
-  learning_rate = 0.1
-  n_estimators = 100
-  gamma = 0
-  min_child_weight = 1
-  max_delta_step = 0
-  subsample = 1
-  sampling_method = {*uniform*, gradient_based (only supported in tree_method=gpu_hist)}
-  colsample_bytree = 1
-  colsample_bylevel = 1
-  colsample_bynode = 1
-  reg_alpha = 0 (L1 regularization) 
-  reg_lambda = 1 (L2 regularization) 
-  max_leaves = 0 (means no limit)
-  max_bin = 256
-  predictor = {*auto*, cpu_predictor, gpu_predictor (for tree_method=gpu_hist)}
-  tree_method = {*auto*, approx, hist, gpu_hist}
-  grow_policy = {*depthwise*, lossguide} 
-  eval_metric = {rmse, rmsle,mae, mape, mphe, logloss, error, error@t, merror, mlogloss, auc, aucpr, ndcg,poisson-nloglik, gamma-nloglik, cox-nloglik}
-  n_jobs=1
-  nthread=None
-  scale_pos_weight=1
-  base_score=0.5
-  random_state=0
-  silent = True
-  seed=None
-  missing=None

-  objective { 
 -  *reg:squarederror:* regression with squared loss.
 -  reg:squaredlogerror: regression with squared log loss. All input labels are required to be greater than -1.
 -  reg:logistic: logistic regression.
 -  reg:pseudohubererror: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
 -  reg:absoluteerror: Regression with L1 error.
 -  binary:logistic: logistic regression for binary classification, output probability
 -  binary:logitraw: logistic regression for binary classification, output score before logistic transformation
 -  binary:hinge: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
 -  count:poisson: Poisson regression for count data, output mean of Poisson distribution. max_delta_step is set to 0.7 by default for this objective.
 -  survival:cox: Cox regression for right censored survival time data (negative values are considered right censored).
 -  survival:aft: Accelerated failure time model for censored survival time data.
 -  multi:softmax: set XGBoost to do multiclass classification using the softmax objective, you need to set no of classes
 -  multi:softprob: same as softmax, but output a vector of ndata * nclass, which can be reshaped to ndata * nclass matrix.
 -  rank:pairwise: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
 -  rank:ndcg: Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized
 -  rank:map: Use LambdaMART to perform list-wise ranking where Mean Average Precision (MAP) is maximized
 -  reg:gamma: gamma regression with log-link. Output is a mean of gamma distribution.
 -  reg:tweedie: Tweedie regression with log-link.
 }

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from tpot import TPOTClassifier
import optuna

In [None]:
# Every column except the target is used as a feature.
cols = [i for i in machine_data.columns if i != "Machine_Failure"]

# np.random.seed() returns None, so assigning its result left `seed` as None
# and every later `random_state=seed` was effectively unseeded. Keep the
# integer itself and seed NumPy's global generator as a separate step.
seed = 0
np.random.seed(seed)

X = machine_data[cols]
y = machine_data["Machine_Failure"]

# Hold out 25% of the rows for the final evaluation of each tuning method.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

## 1. Baseline

In [None]:
# Reference model: no tuned hyperparameters, only the GPU tree method and AUC metric.
xgbmodel_base = XGBClassifier(
    tree_method="gpu_hist",
    eval_metric="auc",
    random_state=seed,
)
xgbmodel_base.fit(X_train, y_train)

# Score the held-out split once and reuse the outputs for every metric.
base_labels = xgbmodel_base.predict(X_test)
base_scores = xgbmodel_base.predict_proba(X_test)[:, 1]

print(" - Baseline ROC Area Under Curve of XGB:", roc_auc_score(y_test, base_scores))
print("\n - Baseline Accuracy of XGB:", accuracy_score(y_test, base_labels))
print("\n - Baseline Conf Matrix of XGB:\n", confusion_matrix(y_test, base_labels))
print("\n - Baseline Classification Report of XGB:\n", classification_report(y_test, base_labels))

## 2. Manual Tuning

In [None]:
# Hand-picked settings: more trees and a larger step size than the baseline.
# NOTE(review): learning_rate above 1 is unusually aggressive — confirm it is intentional.
xgbmodel_manual = XGBClassifier(n_estimators=200, learning_rate=1.1, random_state=seed,
                                tree_method="gpu_hist", eval_metric="auc")
xgbmodel_manual.fit(X_train, y_train)

# Predict once and reuse the outputs; the original called predict() three times.
manual_pred = xgbmodel_manual.predict(X_test)
manual_proba = xgbmodel_manual.predict_proba(X_test)[:, 1]

# Labels now name this model; they were copy-pasted as "Baseline".
print(" - Manual Tuning ROC Area Under Curve of XGB:", roc_auc_score(y_test, manual_proba))
print("\n - Manual Tuning Accuracy of XGB:", accuracy_score(y_test, manual_pred))
print("\n - Manual Tuning Conf Matrix of XGB:\n", confusion_matrix(y_test, manual_pred))
print("\n - Manual Tuning Classification Report of XGB:\n", classification_report(y_test, manual_pred))

## 2. RandomizedSearchCV 

In [None]:
def _rounded_grid(start, stop, num):
    """Return `num` evenly spaced values in [start, stop], each rounded to 2 decimals."""
    return [round(value, 2) for value in np.linspace(start, stop, num)]


# Candidate values sampled by RandomizedSearchCV.
params = {
    # 50 evenly spaced tree counts between 200 and 2000, truncated to integers
    "n_estimators": [int(value) for value in np.linspace(200, 2000, num=50)],
    "max_depth": list(range(2, 11)),
    "min_child_weight": list(range(1, 6)),
    "max_delta_step": list(range(1, 6)),
    "learning_rate": _rounded_grid(0.1, 2, 20),
    "gamma": _rounded_grid(0, 5, 11),
    # Row and column sampling fractions: 0.1, 0.2, ..., 1.0
    "subsample": _rounded_grid(0.1, 1, 10),
    "colsample_bytree": _rounded_grid(0.1, 1, 10),
    "colsample_bylevel": _rounded_grid(0.1, 1, 10),
    "colsample_bynode": _rounded_grid(0.1, 1, 10),
}

In [None]:
# Untuned estimator that the random search will clone and configure.
xgbmodel = XGBClassifier(
    tree_method="gpu_hist",
    eval_metric="auc",
    random_state=seed,
)

# 50 random draws from `params`, each scored by 2-fold cross-validated ROC AUC.
rscv_settings = dict(
    param_distributions=params,
    n_iter=50,
    cv=2,
    verbose=3,
    scoring="roc_auc",
)
xgbmodel_rscv = RandomizedSearchCV(estimator=xgbmodel, **rscv_settings)
xgbmodel_rscv.fit(X_train, y_train)

In [None]:
# Winning configuration and the estimator the search exposes for it.
best_params_rscv = xgbmodel_rscv.best_params_
best_xgb_rscv = xgbmodel_rscv.best_estimator_

# Predict once and reuse the outputs; the original called predict() three times.
rscv_pred = best_xgb_rscv.predict(X_test)
rscv_proba = best_xgb_rscv.predict_proba(X_test)[:, 1]

print("\n","-"*100,"\n")
print("Best Params :", best_params_rscv)

print("\n","-"*100,"\n")
print("Best Evaluator :", best_xgb_rscv)

# Labels now name this search; they were copy-pasted as "Baseline".
print("\n","-"*100,"\n")
print(" - RandomizedSearchCV ROC Area Under Curve of XGB:", roc_auc_score(y_test, rscv_proba))
print("\n - RandomizedSearchCV Accuracy of XGB:", accuracy_score(y_test, rscv_pred))
print("\n - RandomizedSearchCV Conf Matrix of XGB:\n", confusion_matrix(y_test, rscv_pred))
print("\n - RandomizedSearchCV Classification Report of XGB:\n", classification_report(y_test, rscv_pred))

## 3. GridSearchCV

In [None]:
# Exhaustive grid for GridSearchCV: only five parameters vary (two values
# each), the rest are pinned to a single value.
params = dict(
    subsample=[1.0, 0.6],
    n_estimators=[1816, 3000],
    min_child_weight=[4],
    max_depth=[6, 8],
    max_delta_step=[1],
    learning_rate=[0.1, 0.2],
    gamma=[2.0],
    colsample_bytree=[0.1],
    colsample_bynode=[0.8, 0.6],
    colsample_bylevel=[0.7],
)

In [None]:
# Untuned estimator that the grid search will clone and configure.
xgbmodel = XGBClassifier(
    tree_method="gpu_hist",
    eval_metric="auc",
    random_state=seed,
)

# Every combination in `params`, each scored by 3-fold cross-validated ROC AUC.
gscv_settings = dict(
    param_grid=params,
    cv=3,
    verbose=3,
    scoring="roc_auc",
)
xgbmodel_gscv = GridSearchCV(estimator=xgbmodel, **gscv_settings)
xgbmodel_gscv.fit(X_train, y_train)

In [None]:
# Winning configuration and the estimator the search exposes for it.
best_params_gscv = xgbmodel_gscv.best_params_
best_xgb_gscv = xgbmodel_gscv.best_estimator_

# Predict once and reuse the outputs; the original called predict() three times.
gscv_pred = best_xgb_gscv.predict(X_test)
gscv_proba = best_xgb_gscv.predict_proba(X_test)[:, 1]

print("\n","-"*100,"\n")
print("Best Params :", best_params_gscv)

print("\n","-"*100,"\n")
print("Best Evaluator :", best_xgb_gscv)

# Labels now name this search; they were copy-pasted as "Baseline".
print("\n","-"*100,"\n")
print(" - GridSearchCV ROC Area Under Curve of XGB:", roc_auc_score(y_test, gscv_proba))
print("\n - GridSearchCV Accuracy of XGB:", accuracy_score(y_test, gscv_pred))
print("\n - GridSearchCV Conf Matrix of XGB:\n", confusion_matrix(y_test, gscv_pred))
print("\n - GridSearchCV Classification Report of XGB:\n", classification_report(y_test, gscv_pred))

# 4. HyperOpt (Bayesian Automated Hyperparameter Tuning)

- Objective Function = defines the loss function to minimize.
- Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
- Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.


In [None]:
# Domain searched by hyperopt: two discrete choices, one quantised range,
# and continuous uniform ranges for the remaining parameters.
space = dict(
    # 50 evenly spaced tree counts between 800 and 2000, truncated to integers
    n_estimators=hp.choice('n_estimators', [int(value) for value in np.linspace(800, 2000, num=50)]),
    max_depth=hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
    learning_rate=hp.uniform('learning_rate', 0.05, 2),
    subsample=hp.uniform('subsample', 0.5, 1),
    gamma=hp.uniform('gamma', 0, 2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    colsample_bylevel=hp.uniform('colsample_bylevel', 0.5, 1),
    colsample_bynode=hp.uniform('colsample_bynode', 0.5, 1),
    # Constant entry; the objective below hard-codes the metric and does not read this key.
    eval_metric='auc',
)



def objective(space):
    """Score one hyperopt sample by cross-validated ROC AUC.

    Args:
        space: mapping of sampled hyperparameter values, keyed by XGBClassifier
            argument name.

    Returns:
        A dict with the negated mean 5-fold ROC AUC under 'loss' and
        STATUS_OK under 'status'.
    """
    tuned_keys = (
        "n_estimators", "max_depth", "min_child_weight", "learning_rate",
        "gamma", "subsample", "colsample_bytree", "colsample_bylevel",
        "colsample_bynode",
    )
    sampled = {key: space[key] for key in tuned_keys}
    model = XGBClassifier(random_state=seed, tree_method="gpu_hist", eval_metric="auc", **sampled)

    fold_auc = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')

    # fmin minimises the loss, so the ROC AUC is negated to maximise it.
    return {'loss': -fold_auc.mean(), 'status': STATUS_OK}


In [None]:
from hyperopt import space_eval

# Run 50 TPE-guided evaluations of the objective, recording each trial.
trials = Trials()
best_raw = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=50,
                trials=trials)

# fmin reports hp.choice parameters (n_estimators, max_depth) as the index of
# the chosen option, not its value. space_eval maps the result back onto the
# search space so `best` holds values that can be passed to XGBClassifier.
best = space_eval(space, best_raw)
best

In [None]:
# 100%|██████████| 50/50 [07:15<00:00,  8.72s/trial, best loss: -0.969779209585727] 

# Hyperparameters hard-coded for the final hyperopt model.
# NOTE(review): if these were copied straight from fmin's return value, the
# n_estimators and max_depth entries may be hp.choice indices rather than the
# actual values — confirm before relying on them.
hypopt_params = dict(
    colsample_bylevel=0.1,
    colsample_bynode=0.1,
    colsample_bytree=0.1,
    gamma=0.8622315538845127,
    learning_rate=0.13454749501702748,
    max_depth=2,
    min_child_weight=3.0,
    n_estimators=36,
    subsample=0.9104854458851901,
)

In [None]:
print("\n","-"*100,"\n")
print("Best Params :",hypopt_params)

xgbmodel_hyperopt = XGBClassifier(random_state=seed,tree_method = "gpu_hist",eval_metric= "auc",**hypopt_params)
# Fit on the training split only. The original fit on the full X, y, which
# includes the X_test rows scored below and so inflates every reported metric.
xgbmodel_hyperopt.fit(X_train, y_train)

# Predict once and reuse the outputs; the original called predict() three times.
hyperopt_pred = xgbmodel_hyperopt.predict(X_test)
hyperopt_proba = xgbmodel_hyperopt.predict_proba(X_test)[:, 1]

# Labels now name this method; they were copy-pasted as "Baseline".
print("\n","-"*100,"\n")
print(" - HyperOpt ROC Area Under Curve of XGB:", roc_auc_score(y_test, hyperopt_proba))
print("\n - HyperOpt Accuracy of XGB:", accuracy_score(y_test, hyperopt_pred))
print("\n - HyperOpt Conf Matrix of XGB:\n", confusion_matrix(y_test, hyperopt_pred))
print("\n - HyperOpt Classification Report of XGB:\n", classification_report(y_test, hyperopt_pred))

# 5. Genetic Algorithms (TPOT Classifier)

In [None]:
# Sampling fractions 0.1, 0.2, ..., 1.0, used by the four sampling parameters.
_unit_fractions = [round(frac, 2) for frac in np.linspace(0.1, 1, 10)]

# Candidate XGBClassifier values handed to TPOT through its config_dict.
params = {
    # 50 evenly spaced tree counts between 200 and 2000, truncated to integers
    "n_estimators": [int(count) for count in np.linspace(200, 2000, num=50)],
    "max_depth": list(range(2, 11)),
    "min_child_weight": list(range(1, 6)),
    "max_delta_step": list(range(1, 6)),
    "learning_rate": [round(rate, 2) for rate in np.linspace(0.1, 2, 20)],
    "gamma": [round(g, 2) for g in np.linspace(0, 5, 11)],
    # Each key gets its own copy so the lists stay independent objects.
    "subsample": list(_unit_fractions),
    "colsample_bytree": list(_unit_fractions),
    "colsample_bylevel": list(_unit_fractions),
    "colsample_bynode": list(_unit_fractions),
}

In [None]:
# Genetic search restricted to XGBClassifier, drawing values from `params`.
tpot_settings = dict(
    generations=5,
    population_size=12,
    offspring_size=6,
    verbosity=2,
    early_stop=10,
    config_dict={'xgboost.XGBClassifier': params},
    cv=3,
    scoring='roc_auc',
)
tpot_classifier = TPOTClassifier(**tpot_settings)
tpot_classifier.fit(X_train, y_train)

In [None]:
print("\n","-"*100,"\n")
print("Score:", tpot_classifier.score(X_test,y_test))

# Predict once and reuse the outputs; the original called predict() three times.
tpot_pred = tpot_classifier.predict(X_test)
tpot_proba = tpot_classifier.predict_proba(X_test)[:, 1]

# Labels now name this method; they were copy-pasted as "Baseline".
print("\n","-"*100,"\n")
print(" - TPOT ROC Area Under Curve of XGB:", roc_auc_score(y_test, tpot_proba))
print("\n - TPOT Accuracy of XGB:", accuracy_score(y_test, tpot_pred))
print("\n - TPOT Conf Matrix of XGB:\n", confusion_matrix(y_test, tpot_pred))
print("\n - TPOT Classification Report of XGB:\n", classification_report(y_test, tpot_pred))

# 6. Optuna Optimization

In [None]:
import xgboost as xgb

# Training split wrapped as an xgboost DMatrix.
dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
)

In [None]:
# dtrain = xgb.DMatrix(X_train, label=y_train)

#trial.suggest_categorical
#trial.suggest_float
#trial.suggest_int

# 1. Define an objective function to be maximized.
def objective(trial):
    """Score one Optuna trial by cross-validated ROC AUC on the training split.

    Args:
        trial: Optuna trial object used to sample each hyperparameter.

    Returns:
        Mean 4-fold ROC AUC of an XGBClassifier built with the sampled values.
    """
    # 2. Suggest values of the hyperparameters using a trial object.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2000, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_float('min_child_weight', 2, 4),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'gamma': trial.suggest_float('gamma', 1e-4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.2, 1),
    }

    xgbmodel_optuna = XGBClassifier(**params, random_state=seed, tree_method="gpu_hist", eval_metric="auc")

    # cross_val_score clones and fits the estimator for every fold, so the
    # original's separate fit() beforehand was wasted work and is removed.
    # Tuning now uses only the training split, matching the other tuners and
    # keeping X_test out of model selection.
    cv = cross_val_score(xgbmodel_optuna, X_train, y_train, cv=4, scoring='roc_auc').mean()
    return cv

# 3. Build a study that maximises the objective's return value, then run the
#    search with a budget of 100 trials and a 1200-second timeout.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=100, timeout=1200)

In [None]:
# Show the optimisation history first, then the parameter importances.
study_plots = (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_param_importances,
)
for make_plot in study_plots:
    fig = make_plot(study)
    fig.show()