In [1]:
# Data wrangling
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
# Off FutureWarnings
import warnings 
warnings.filterwarnings('ignore')
#Resampling
from imblearn.over_sampling import SMOTENC 
from sklearn.utils import class_weight
#Dimension Reduction
from sklearn.decomposition import PCA
# Preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Models Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
# Model evaluation
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
# Save model
import pickle

In [2]:
## Upload df
df = pd.read_csv('cleaned_data.csv') 
df

Unnamed: 0.1,Unnamed: 0,Type,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF,Failure_type
0,0,Medium,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,NF
1,1,Low,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,NF
2,2,Low,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,NF
3,3,Low,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,NF
4,4,Low,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,NF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9971,9995,Medium,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,NF
9972,9996,High,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,NF
9973,9997,Medium,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,NF
9974,9998,High,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,NF


In [6]:
from sklearn.metrics import f1_score 
def get_metrics(y_true, y_pred):
    # Calculating F1 scores for each class
    f1_scores_per_class = f1_score(y_true, y_pred, average=None)
    dict_metrics = {
    'Accuracy': accuracy_score(y_true, y_pred),
    'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
    'Macro Recall': recall_score(y_true, y_pred, average='macro'), 'Macro Precision': precision_score(y_true, y_pred, average='macro'), 'Macro F1': f1_score(y_true, y_pred, average='macro'),
    'F1 Scores per Class': f1_scores_per_class
    }
    return dict_metrics

In [4]:
NUMERIC_FEATURES = ['Air_temperature', 'Process_temperature', 'Rotational_speed', 'Torque', 'Tool_wear']
CATEGORIC_FEATURES = ['Type']

In [5]:
# Create preprocessor ColumnTransformer to do OneHotEncoder for CATEGORIC_FEATURES and StandardScaler() for NUMERIC_FEATURES
# Define the pipelines for numeric and categorical transformations
num_pipeline = Pipeline([
    ('num_features', StandardScaler()) 
    ])
cat_pipeline = Pipeline([ 
    ('cat_features', OneHotEncoder())
    ])
# Create the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num_trans', num_pipeline, NUMERIC_FEATURES),
    ('cat_trans', cat_pipeline, CATEGORIC_FEATURES) ])
# Fit and transform the data
df_transformed = preprocessor.fit_transform(df)
# Converting the transformed data back to a dataframe for easier visualization
# The transformed data will have new column names, especially for the one hot encoded categories 
encoded_feature_names = preprocessor.named_transformers_['cat_trans'].get_feature_names_out(CATEGORIC_FEATURES) 
new_column_names = list(NUMERIC_FEATURES) + list(encoded_feature_names)
df_transformed = pd.DataFrame(df_transformed, columns=new_column_names)
df_transformed.head()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Type_High,Type_Low,Type_Medium
0,-0.951551,-0.946692,0.065483,0.289789,-1.695147,0.0,0.0,1.0
1,-0.901538,-0.879314,-0.732576,0.643119,-1.647949,0.0,1.0,0.0
2,-0.951551,-1.014071,-0.230301,0.956069,-1.616484,0.0,1.0,0.0
3,-0.901538,-0.946692,-0.593055,-0.043351,-1.585019,0.0,1.0,0.0
4,-0.901538,-0.879314,-0.732576,0.007125,-1.553553,0.0,1.0,0.0


In [7]:
df_model = df.copy()
X = df_model[NUMERIC_FEATURES + CATEGORIC_FEATURES]
y = df_model['Failure_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [8]:
from sklearn.neural_network import MLPClassifier

In [21]:
# Creating pipeline with PCA analysis and balanced class 
pip_model_no_pca = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MLPClassifier(random_state=2023))
    ])
# Fit pipeline with PCA
# weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
pip_model_no_pca.fit(X_train, y_train)
# Generate Predictions using the correctly fitted pipeline 
y_pred = pip_model_no_pca.predict(X_test)
# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)
# View Results
metrics

{'Accuracy': 0.9879759519038076,
 'Balanced Accuracy': 0.7204983388704319,
 'Macro Recall': 0.7204983388704319,
 'Macro Precision': 0.701796650935209,
 'Macro F1': 0.7109004400692067,
 'F1 Scores per Class': array([0.8372093 , 0.99380805, 0.875     , 0.84848485, 0.        ])}

In [20]:
# Get categorical feature indices for SMOTENC
categorical_features_indices = [i for i, feature in enumerate(NUMERIC_FEATURES + CATEGORIC_FEATURES) if feature in CATEGORIC_FEATURES]

# Creating the new pipeline with SMOTENC using ImbPipeline 
pip_model_smotenc = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smotenc', SMOTENC(categorical_features=categorical_features_indices, random_state=42)), 
    ('model', MLPClassifier(random_state=2023))
])

# Fit the pipeline 
pip_model_smotenc.fit(X_train, y_train)
# Generate Predictions
y_pred = pip_model_smotenc.predict(X_test)
# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)
# View Results
metrics

{'Accuracy': 0.9724448897795591,
 'Balanced Accuracy': 0.7953488372093023,
 'Macro Recall': 0.7953488372093023,
 'Macro Precision': 0.6371174420749973,
 'Macro F1': 0.6973028938133624,
 'F1 Scores per Class': array([0.80769231, 0.98565841, 0.76470588, 0.81081081, 0.11764706])}

In [24]:
# fine_tune_params = {
#     'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
#     'activation': ['tanh', 'relu'],
#     'solver': ['sgd', 'adam'],
#     'alpha': [0.0001, 0.05],
#     'learning_rate': ['constant','adaptive']
# }
fine_tune_params = {
    'activation': ['tanh', 'relu']
}

# Running a new GridSearchCV for fine-tuning
fine_tune_grid = GridSearchCV(pip_model_no_pca, fine_tune_params, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
fine_tune_grid.fit(X_train, y_train)

# Collecting and printing the fine-tuned results
fine_tuned_results = pd.DataFrame(fine_tune_grid.cv_results_) 
fine_tuned_best_index = fine_tuned_results['mean_test_score'].idxmax()
fine_tuned_best_params = fine_tuned_results.loc[fine_tuned_best_index, 'params']

# Print best model parameters
print("Best fine-tuned model parameters:") 
print(fine_tuned_best_params)
# Finding the best estimator paramaters 
tuned_model = fine_tune_grid.best_estimator_ 
y_pred = tuned_model.predict(X_test)
# View new perfomance (focus on F1-score) 
get_metrics(y_test, y_pred)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


ValueError: Invalid parameter 'activation' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_trans',
                                                  Pipeline(steps=[('num_features',
                                                                   StandardScaler())]),
                                                  ['Air_temperature',
                                                   'Process_temperature',
                                                   'Rotational_speed', 'Torque',
                                                   'Tool_wear']),
                                                 ('cat_trans',
                                                  Pipeline(steps=[('cat_features',
                                                                   OneHotEncoder())]),
                                                  ['Type'])])),
                ('model', MLPClassifier(random_state=2023))]). Valid parameters are: ['memory', 'steps', 'verbose'].