# Modeling - Hyperparameter tuning (Adv-ML)

    1. Gradient Boosting Trees
    2. XGBoost
    3. LightGBM
    4. CatBoost


In [21]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [18]:
train = pd.read_csv('../data/train_processed.csv', header=0)
test = pd.read_csv('../data/test_processed.csv', header=0)

In [19]:
failure_counts = train['Machine failure'].value_counts()

# Print the counts
print("Occurrences of Machine Failure:")
print("Value 0:", failure_counts[0])
print("Value 1:", failure_counts[1])

X = train.drop(['Machine failure'], axis=1)
y = train['Machine failure'].values
X_test = test

Occurrences of Machine Failure:
Value 0: 134281
Value 1: 2148


In [28]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)

pca = PCA(n_components=125)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [29]:
param_grid = { 
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [30]:
rfc = RandomForestClassifier()
rfcCV = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
rfcCV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 50, 100, 200]})

In [31]:
grid_predictions = rfcCV.predict(X_val) 
rfcCV.best_params_
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     40296
           1       1.00      0.75      0.86       633

    accuracy                           1.00     40929
   macro avg       1.00      0.88      0.93     40929
weighted avg       1.00      1.00      1.00     40929

[[40294     2]
 [  157   476]]


In [32]:
### Gradient Boosting

In [35]:
param_grid = {
    'n_estimators': [25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt']
}


In [None]:
gbt = GradientBoostingClassifier()
gbt = GridSearchCV(estimator=gbt, param_grid=param_grid, cv= 3)
gbt.fit(X_train, y_train)

In [None]:
grid_predictions = gbt.predict(X_val) 
print(classification_report(y_val, grid_predictions)) 
print(confusion_matrix(y_val, grid_predictions))
print("Best Parameters for GBT:", gbt.best_params_)

In [None]:
### XGBoost

In [None]:
param_grid_xgb = {
    'min_child_weight': [40, 50, 60],
    'max_delta_step': [0, 1, 2],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [25, 50, 100],
}


In [None]:
xgb = XGBClassifier()
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3)
grid_search_xgb.fit(X_train, y_train)

In [None]:
best_params_xgb = grid_search_xgb.best_params_
grid_predictions_xgb = best_estimator_xgb.predict(X_val)
print(classification_report(y_val, grid_predictions_xgb, zero_division=0)) 
print(confusion_matrix(y_val, grid_predictions_xgb))
print("Best Parameters for XGBoost:", best_params_xgb)

In [None]:
### LightGBM

In [None]:
param_grid = {
    'min_child_weight': [40, 50, 60],
    'max_delta_step': [0, 1, 2],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [25, 50, 100],
}

In [None]:
light = lgb.LGBMClassifier()
light = GridSearchCV(mdl, param_grid, verbose=1, cv=3, n_jobs=-1)
light.fit(x_train, y_train)

In [None]:
grid_predictions = light.predict(X_val) 
print(classification_report(y_val, grid_predictions, zero_division=0)) 
print(confusion_matrix(y_val, grid_predictions))
print("Best Parameters for LGBM:", light.best_params_)