In [1]:
# Data wrangling
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
# Off FutureWarnings
import warnings 
warnings.filterwarnings('ignore')
#Resampling
from imblearn.over_sampling import SMOTENC 
from sklearn.utils import class_weight
#Dimension Reduction
from sklearn.decomposition import PCA
# Preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Models Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
# Model evaluation
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
# Save model
import pickle

In [2]:
## Upload df
df = pd.read_csv('cleaned_data.csv') 
df

Unnamed: 0.1,Unnamed: 0,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF,Failure_type
0,0,Medium,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,NF
1,1,Low,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,NF
2,2,Low,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,NF
3,3,Low,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,NF
4,4,Low,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,NF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9971,9995,Medium,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,NF
9972,9996,High,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,NF
9973,9997,Medium,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,NF
9974,9998,High,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,NF


In [19]:
from sklearn.metrics import f1_score 

def get_metrics(y_true, y_pred, unique_classes):
    # Calculating F1 scores for each class
    f1_scores_per_class = f1_score(y_true, y_pred, average=None, labels=unique_classes)
    recall_scores_per_class = recall_score(y_true, y_pred, average=None, labels=unique_classes)
    precision_scores_per_class = precision_score(y_true, y_pred, average=None, labels=unique_classes)
    class_f1_scores = dict(zip(unique_classes, f1_scores_per_class))
    class_recall_scores = dict(zip(unique_classes, recall_scores_per_class))
    class_precision_scores = dict(zip(unique_classes, precision_scores_per_class))
    dict_metrics = {
    'Accuracy': accuracy_score(y_true, y_pred),
    'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
    'Macro Recall': recall_score(y_true, y_pred, average='macro'), 
    'F1': f1_score(y_true, y_pred, average='weighted'),
    'Macro Precision': precision_score(y_true, y_pred, average='macro'), 
    'Macro F1': f1_score(y_true, y_pred, average='macro'),
    'F1 Scores per Class': class_f1_scores,
    'Recall Scores per Class': class_recall_scores,
    'Precision Scores per Class': class_precision_scores
    }
    return dict_metrics

In [4]:
NUMERIC_FEATURES = ['Air_temperature', 'Process_temperature', 'Rotational_speed', 'Torque', 'Tool_wear']
CATEGORIC_FEATURES = ['Type']

In [5]:
# Create preprocessor ColumnTransformer to do OneHotEncoder for CATEGORIC_FEATURES and StandardScaler() for NUMERIC_FEATURES
# Define the pipelines for numeric and categorical transformations
num_pipeline = Pipeline([
    ('num_features', StandardScaler()) 
    ])
cat_pipeline = Pipeline([ 
    ('cat_features', OneHotEncoder())
    ])
# Create the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num_trans', num_pipeline, NUMERIC_FEATURES),
    ('cat_trans', cat_pipeline, CATEGORIC_FEATURES) ])
# Fit and transform the data
df_transformed = preprocessor.fit_transform(df)
# Converting the transformed data back to a dataframe for easier visualization
# The transformed data will have new column names, especially for the one hot encoded categories 
encoded_feature_names = preprocessor.named_transformers_['cat_trans'].get_feature_names_out(CATEGORIC_FEATURES) 
new_column_names = list(NUMERIC_FEATURES) + list(encoded_feature_names)
df_transformed = pd.DataFrame(df_transformed, columns=new_column_names)
df_transformed.head()


Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium
0,-0.951551,-0.946692,0.065483,0.289789,-1.695147,0.0,0.0,1.0
1,-0.901538,-0.879314,-0.732576,0.643119,-1.647949,0.0,1.0,0.0
2,-0.951551,-1.014071,-0.230301,0.956069,-1.616484,0.0,1.0,0.0
3,-0.901538,-0.946692,-0.593055,-0.043351,-1.585019,0.0,1.0,0.0
4,-0.901538,-0.879314,-0.732576,0.007125,-1.553553,0.0,1.0,0.0


In [6]:
df_model = df.copy()
X = df_model[NUMERIC_FEATURES + CATEGORIC_FEATURES]
y = df['Failure_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [7]:
unique_classes = np.unique(y_train)
unique_classes

array(['HDF', 'NF', 'OSF', 'PWF', 'TWF'], dtype=object)

In [20]:
# Creating pipeline with PCA analysis and balanced class 
pip_model_no_pca = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(random_state=2023, max_depth=5, n_estimators=400))
    ])
# Fit pipeline with PCA
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
pip_model_no_pca.fit(X_train, y_train, model__sample_weight=weights)
# Generate Predictions using the correctly fitted pipeline 
y_pred = pip_model_no_pca.predict(X_test)
# Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)
# View Results
metrics

{'Accuracy': 0.9854709418837675,
 'Balanced Accuracy': 0.7170053525286083,
 'Macro Recall': 0.7170053525286083,
 'F1': 0.9842514682745832,
 'Macro Precision': 0.6718004549942349,
 'Macro F1': 0.6927313710294245,
 'F1 Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9925083957633687,
  'OSF': 0.7428571428571429,
  'PWF': 0.8235294117647058,
  'TWF': 0.0},
 'Recall Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.992764857881137,
  'OSF': 0.8125,
  'PWF': 0.875,
  'TWF': 0.0},
 'Precision Scores per Class': {'HDF': 0.9047619047619048,
  'NF': 0.9922520661157025,
  'OSF': 0.6842105263157895,
  'PWF': 0.7777777777777778,
  'TWF': 0.0}}

something is not right about TWF

In [28]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

In [24]:
cm

array([[  19,    2,    0,    0,    0],
       [   2, 1921,    6,    4,    2],
       [   0,    3,   13,    0,    0],
       [   0,    2,    0,   14,    0],
       [   0,    8,    0,    0,    0]])

In [22]:
pd.Series(y_test).value_counts()

Failure_type
NF     1935
HDF      21
PWF      16
OSF      16
TWF       8
Name: count, dtype: int64

In [None]:
pd.Series(y_test).value_counts()

In [14]:
TWF_df = df[df['Failure_type']=='TWF']
TWF_df.head()

Unnamed: 0.1,Unnamed: 0,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF,Failure_type
76,77,Low,298.8,308.9,1455,41.3,208,1,1,0,0,0,0,TWF
1086,1087,High,296.9,307.8,1549,35.8,206,1,1,0,0,0,0,TWF
1506,1509,Low,298.0,308.5,1429,37.7,220,1,1,0,0,0,0,TWF
1679,1682,High,297.9,307.4,1604,36.1,225,1,1,0,0,0,0,TWF
1760,1763,Low,298.2,307.6,1511,31.0,209,1,1,0,0,0,0,TWF


In [15]:
TWF_pred = pip_model_no_pca.predict(TWF_df[NUMERIC_FEATURES + CATEGORIC_FEATURES])
pd.Series(TWF_pred).value_counts()

TWF    34
NF      8
Name: count, dtype: int64

In [31]:
temp_df = df[df['Failure_type']=='PWF']
temp_pred = pip_model_no_pca.predict(temp_df[NUMERIC_FEATURES + CATEGORIC_FEATURES])
pd.Series(temp_pred).value_counts()

PWF    78
NF      2
Name: count, dtype: int64

In [32]:
temp_df = df[df['Failure_type'] !='TWF']


In [34]:
X = temp_df[NUMERIC_FEATURES + CATEGORIC_FEATURES]
y = temp_df['Failure_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [35]:
# Creating pipeline with PCA analysis and balanced class 
pip_model_no_pca = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(random_state=2023, max_depth=5, n_estimators=400))
    ])
# Fit pipeline with PCA
weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
pip_model_no_pca.fit(X_train, y_train, model__sample_weight=weights)
# Generate Predictions using the correctly fitted pipeline 
y_pred = pip_model_no_pca.predict(X_test)
# Evaluate Metrics
metrics = get_metrics(y_test, y_pred, unique_classes)
# View Results
metrics

{'Accuracy': 0.9899345747357826,
 'Balanced Accuracy': 0.9001052592702024,
 'Macro Recall': 0.9001052592702024,
 'F1': 0.9903813513565508,
 'Macro Precision': 0.827756941495914,
 'Macro F1': 0.8583748231486659,
 'F1 Scores per Class': {'HDF': 0.8780487804878049,
  'NF': 0.9950789950789951,
  'OSF': 0.7368421052631579,
  'PWF': 0.8235294117647058,
  'TWF': 0.0},
 'Recall Scores per Class': {'HDF': 0.8571428571428571,
  'NF': 0.9932781799379524,
  'OSF': 0.875,
  'PWF': 0.875,
  'TWF': 0.0},
 'Precision Scores per Class': {'HDF': 0.9,
  'NF': 0.9968863518422418,
  'OSF': 0.6363636363636364,
  'PWF': 0.7777777777777778,
  'TWF': 0.0}}