# Modeling - Hyperparameter tuning (Adv-ML)

    1. Gradient Boosting Trees
    2. XGBoost
    3. LightGBM
    4. CatBoost


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [5]:
train = pd.read_csv('../data/train_processed.csv', header=0)
test = pd.read_csv('../data/test_processed.csv', header=0)

In [6]:
failure_counts = train['Machine failure'].value_counts()

# Print the counts
print("Occurrences of Machine Failure:")
print("Value 0:", failure_counts[0])
print("Value 1:", failure_counts[1])

X = train.drop(['Machine failure'], axis=1)
y = train['Machine failure'].values
X_test = test

Occurrences of Machine Failure:
Value 0: 134281
Value 1: 2148


In [7]:
### Baseline
train

Unnamed: 0,Air temperature,Process temperature,Rotational speed,Torque,Machine failure,Type_H,Type_L,Type_M,TWF_0,TWF_1,...,Tool wear _237,Tool wear _238,Tool wear _239,Tool wear _240,Tool wear _241,Tool wear _242,Tool wear _244,Tool wear _246,Tool wear _251,Tool wear _253
0,300.6,309.6,1596,36.1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,302.6,312.1,1759,29.1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,299.3,308.5,1805,26.5,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,301.0,310.9,1524,44.3,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,298.0,309.0,1641,35.4,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,300.1,311.4,1530,37.5,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
136425,297.5,308.5,1447,49.1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
136426,300.5,311.8,1524,38.5,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
136427,301.7,310.9,1447,46.3,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
param_grid = { 
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [15]:
rfc = RandomForestClassifier()
rfcCV = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
rfcCV.fit(X_train, y_train)
grid_predictions = rfcCV.predict(X_val) 
rfcCV.best_params_
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     40296
           1       1.00      0.75      0.85       633

    accuracy                           1.00     40929
   macro avg       1.00      0.87      0.93     40929
weighted avg       1.00      1.00      1.00     40929

[[40294     2]
 [  160   473]]


In [17]:
### Gradient Boosting

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt']
}


In [None]:
gbt = GradientBoostingClassifier()
gbt = GridSearchCV(estimator=gbt, param_grid=param_grid, cv= 3)
gbt.fit(X_train, y_train)
grid_predictions = gbt.predict(X_val) 
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))
print("Best Parameters for GBT:", gbt.best_params_)

In [None]:
### XGBoost

In [11]:
param_grid_xgb = {
    'min_child_weight': [40, 50, 60, 70, 80],
    'max_delta_step': [0, 1, 2],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [100, 200, 300],
}


In [None]:
xgb = XGBClassifier()
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3)
grid_search_xgb.fit(X_train, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_estimator_xgb = grid_search_xgb.best_estimator_
grid_predictions_xgb = best_estimator_xgb.predict(X_val)
print(classification_report(y_val, grid_predictions_xgb, zero_division=0)) 
print(confusion_matrix(y_val, grid_predictions_xgb))
print("Best Parameters for XGBoost:", best_params_xgb)

In [None]:
### LightGBM

In [None]:
param_grid = {
    'min_child_weight': [40, 50, 60, 70, 80],
    'max_delta_step': [0, 1, 2],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [100, 200, 300],
}

In [None]:
light = lgb.LGBMClassifier()
light = GridSearchCV(mdl, param_grid, verbose=1, cv=3, n_jobs=-1)
light.fit(x_train, y_train)
grid_predictions = light.predict(X_val) 
print(classification_report(y_val, grid_predictions, zero_division=0)) 
print(confusion_matrix(y_val, grid_predictions))
print("Best Parameters for LGBM:", light.best_params_)