# Modeling - Hyperparameter tuning (Adv-ML)

    1. Gradient Boosting Trees
    2. XGBoost
    3. LightGBM
    4. CatBoost


In [1]:
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The pgf.debug rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Users\Mirna Elizondo\anaconda3\envs\condaEnv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.fileo rcparam was depr

In [2]:
# Read the preprocessed train/test tables; row 0 of each CSV supplies the column names.
train_path = '../data/train_processed.csv'
test_path = '../data/test_processed.csv'

train = pd.read_csv(train_path, header=0)
test = pd.read_csv(test_path, header=0)

In [3]:
# Count how many training rows carry each 'Machine failure' label.
target_col = 'Machine failure'
failure_counts = train[target_col].value_counts()

# Report the class balance (labels 0 and 1 are looked up by index label).
print("Occurrences of Machine Failure:")
print("Value 0:", failure_counts[0])
print("Value 1:", failure_counts[1])

# Features = every column except the target; labels = the target as a NumPy array.
X = train.drop(columns=[target_col])
y = train[target_col].values

# The test table is used as-is (same object, not a copy).
X_test = test

Occurrences of Machine Failure:
Value 0: 134281
Value 1: 2148


In [None]:
# Hold out 30% of the labelled data for validation.
# The target is heavily imbalanced (2,148 positives vs 134,281 negatives in the
# counts printed above), so stratify on y to keep the failure rate the same in
# both splits; an unstratified split can leave the validation set with a
# noticeably different share of the rare class.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply it to the hold-out split,
# so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# NOTE: despite the "test" in the name, this is the scaled *validation* split
# (X_val). The name is kept because later cells read X_test_pca.
X_test_scaled = scaler.transform(X_val)

# Project onto 125 principal components (PCA is fitted on training data only).
# random_state pins the result when a randomized SVD solver is selected.
# NOTE(review): n_components=125 requires at least 125 input features - confirm.
pca = PCA(n_components=125, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Random Forest: randomized hyperparameter search on the PCA-reduced features.
param_dist_rfc = {
    'n_estimators': [10, 50, 100, 200],
    # 'auto' dropped: for classifiers it was only an alias of 'sqrt', it was
    # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

# Seed both the estimator and the sampler so the search is reproducible
# under Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
random_search_rfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist_rfc,
    cv=3,
    random_state=42,
)
random_search_rfc.fit(X_train_pca, y_train)

# X_test_pca holds the validation split (see the split cell), hence y_val here.
rfc_predictions = random_search_rfc.predict(X_test_pca)
print(classification_report(y_val, rfc_predictions, zero_division=0))
print(confusion_matrix(y_val, rfc_predictions))
print("Best Parameters for RFC:", random_search_rfc.best_params_)

In [None]:
# Summarise the tuned random forest's validation results as a one-row table.
conf_matrix = confusion_matrix(y_val, rfc_predictions)
report = classification_report(y_val, rfc_predictions, output_dict=True)

# Support-weighted averages across classes, plus overall accuracy.
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1_score = weighted_avg['f1-score']
accuracy = report['accuracy']

# The confusion matrix is wrapped in a list so it occupies a single cell,
# which also fixes the frame at exactly one row.
data = {
    'Model': 'RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],
}

metrics_df = pd.DataFrame(data)
print(metrics_df)

#metrics_df.to_csv('../data/pcaResults.csv', header=1)

In [None]:
### Gradient Boosting

In [None]:
# Gradient Boosting: randomized hyperparameter search on the PCA-reduced features.
param_dist_gbt = {
    'n_estimators': [25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    # 'auto' dropped: for GradientBoostingClassifier it was an alias of 'sqrt',
    # deprecated in scikit-learn 1.1 and rejected from 1.3 onwards.
    'max_features': ['sqrt'],
}

# Seed both the estimator and the sampler so the search is reproducible.
gbt = GradientBoostingClassifier(random_state=42)
random_search_gbt = RandomizedSearchCV(
    estimator=gbt,
    param_distributions=param_dist_gbt,
    cv=3,
    random_state=42,
)
random_search_gbt.fit(X_train_pca, y_train)

# X_test_pca holds the validation split, hence the comparison against y_val.
gbt_predictions = random_search_gbt.predict(X_test_pca)
# Fixed: the report previously referenced the undefined name `grid_predictions`,
# which raises NameError on a fresh kernel.
print(classification_report(y_val, gbt_predictions, zero_division=0))
print(confusion_matrix(y_val, gbt_predictions))
print("Best Parameters for GBT:", random_search_gbt.best_params_)

In [None]:
# Summarise the tuned gradient-boosting model's validation results and add
# them to the running results table.
report = classification_report(y_val, gbt_predictions, output_dict=True, zero_division=0)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report (support-weighted averages).
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Every non-scalar value is wrapped in a list so it occupies a single cell.
data = {
    'Model': 'GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],
    'Best Params': [random_search_gbt.best_params_],
}

# Fixed: this cell used to reassign metrics_df, silently discarding the RFC row
# before the final results were written. Append instead.
# NOTE: re-running this cell appends a duplicate row; re-run the RFC metrics
# cell first to reset the table.
df = pd.DataFrame(data)
metrics_df = pd.concat([metrics_df, df], ignore_index=True)
print(metrics_df)

#metrics_df.to_csv('../data/pcaResults.csv', header=1)

In [None]:
### XGBoost

In [None]:
# XGBoost: randomized hyperparameter search on the PCA-reduced features.
# Fixed: a copy-pasted 'Best Params': random_search_gbt.best_params_ entry was
# removed. It is not an XGBoost hyperparameter, and a dict is not a valid
# candidate list for RandomizedSearchCV.
param_dist_xgb = {
    'min_child_weight': [40, 50, 60],
    'max_delta_step': [0, 1, 2],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [25, 50, 100],
}

# Seed both the estimator and the sampler so the search is reproducible.
xgb = XGBClassifier(random_state=42)
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist_xgb,
    cv=3,
    random_state=42,
)
random_search_xgb.fit(X_train_pca, y_train)

# X_test_pca holds the validation split, hence the comparison against y_val.
grid_predictions_xgb = random_search_xgb.predict(X_test_pca)
print(classification_report(y_val, grid_predictions_xgb, zero_division=0))
print(confusion_matrix(y_val, grid_predictions_xgb))
print("Best Parameters for XGBoost:", random_search_xgb.best_params_)

In [None]:
# Summarise the tuned XGBoost model's validation results and add them to the
# running results table.
report = classification_report(y_val, grid_predictions_xgb, output_dict=True, zero_division=0)
conf_matrix = confusion_matrix(y_val, grid_predictions_xgb)
params = random_search_xgb.best_params_

# Extract metrics from classification report (support-weighted averages).
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Fixed: the dict literal was never closed (SyntaxError). The best-params dict
# is also wrapped in a list: passing a bare dict alongside list values makes
# pd.DataFrame raise "Mixing dicts with non-Series may lead to ambiguous ordering".
data = {
    'Model': 'XGBoost',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],
    'Best Params': [params],
}

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE: re-running this cell appends a duplicate row.
df = pd.DataFrame(data)
metrics_df = pd.concat([metrics_df, df], ignore_index=True)
print(metrics_df)

#metrics_df.to_csv('../data/pcaResults.csv', header=1)

In [None]:
### LightGBM

In [None]:
# LightGBM: randomized hyperparameter search on the PCA-reduced features.
# Requires `import lightgbm as lgb` in the imports cell at the top of the notebook.
param_dist_light = {
    'min_child_weight': [40, 50, 60],
    'max_delta_step': [0, 1, 2],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'n_estimators': [25, 50, 100],
}

# Seed both the estimator and the sampler so the search is reproducible.
lgbm = lgb.LGBMClassifier(random_state=42)
random_search_lgbm = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist_light,
    verbose=1,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search_lgbm.fit(X_train_pca, y_train)

# X_test_pca holds the validation split, hence the comparison against y_val.
grid_predictions_lgbm = random_search_lgbm.predict(X_test_pca)
print(classification_report(y_val, grid_predictions_lgbm, zero_division=0))
print(confusion_matrix(y_val, grid_predictions_lgbm))
print("Best Parameters for LGBM:", random_search_lgbm.best_params_)

In [None]:
# Summarise the tuned LightGBM model's validation results, add them to the
# running results table, and persist the table.
# Fixed: the metrics were computed from grid_predictions_xgb (copy-paste), so
# the 'LGBM' row duplicated the XGBoost scores.
report = classification_report(y_val, grid_predictions_lgbm, output_dict=True, zero_division=0)
conf_matrix = confusion_matrix(y_val, grid_predictions_lgbm)

# Extract metrics from classification report (support-weighted averages).
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# The best-params dict is wrapped in a list so it lands in a single cell;
# a bare dict next to list values makes pd.DataFrame raise ValueError.
data = {
    'Model': 'LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix],
    'Best Params': [random_search_lgbm.best_params_],
}

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE: re-running this cell appends a duplicate row.
df = pd.DataFrame(data)
metrics_df = pd.concat([metrics_df, df], ignore_index=True)
print(metrics_df)

# Write the results table (with a header row) for later comparison.
metrics_df.to_csv('../data/tuningResults.csv', header=True)