## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [147]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
import warnings

#### Import the CSV Data as Pandas DataFrame

In [148]:
df = pd.read_csv('/mnt/d/ml_projects/maintenance/notebook/data/predictive_maintenance.csv')

#### Show Top 5 Records

In [149]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


#### Preparing X and Y variables

In [150]:
# Drop ID columns
X = df.drop(columns=['UDI','Product ID', 'Target', 'Failure Type'], axis=1)

In [151]:
X.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M,298.1,308.6,1551,42.8,0
1,L,298.2,308.7,1408,46.3,3
2,L,298.1,308.5,1498,49.4,5
3,L,298.2,308.6,1433,39.5,7
4,L,298.2,308.7,1408,40.0,9


In [152]:
y = df['Failure Type']

In [153]:
y

0       No Failure
1       No Failure
2       No Failure
3       No Failure
4       No Failure
           ...    
9995    No Failure
9996    No Failure
9997    No Failure
9998    No Failure
9999    No Failure
Name: Failure Type, Length: 10000, dtype: object

In [154]:
# Identify numerical and categorical features
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Initialize encoders (but do not modify the original X)
scaler = StandardScaler()
label_encoders = {col: LabelEncoder() for col in cat_features}  # {'Gender': LabelEncoder(), 'City': LabelEncoder()}

X_scaled = scaler.fit_transform(X[num_features])  # Check standardization
X_encoded = {col: label_encoders[col].fit_transform(X[col]) for col in cat_features}  # Check label encoding

# Print dry run results
print("Standard Scaler Output (Numerical Features):\n", X_scaled)
print("\nLabel Encoded Output (Categorical Features):\n", X_encoded)


Standard Scaler Output (Numerical Features):
 [[-0.95238944 -0.94735989  0.06818514  0.28219976 -1.69598374]
 [-0.90239341 -0.879959   -0.72947151  0.63330802 -1.6488517 ]
 [-0.95238944 -1.01476077 -0.22744984  0.94428963 -1.61743034]
 ...
 [-0.50242514 -0.94735989  0.59251888 -0.66077672 -1.35034876]
 [-0.50242514 -0.879959   -0.72947151  0.85400464 -1.30321671]
 [-0.50242514 -0.879959   -0.2162938   0.02137647 -1.22466331]]

Label Encoded Output (Categorical Features):
 {'Type': array([2, 1, 1, ..., 2, 0, 2])}


In [178]:
# Combine the scaled numerical features and encoded categorical features
X_scaled_df = pd.DataFrame(X_scaled, columns=num_features)  # Convert X_scaled to DataFrame
X_encoded_df = pd.DataFrame(X_encoded)  # Convert X_encoded to DataFrame (it will automatically use the column names)
print(X_encoded_df)
# Concatenating X_scaled_df and X_encoded_df
X_transformed = pd.concat([X_scaled_df, X_encoded_df], axis=1)

      Type
0        2
1        1
2        1
3        1
4        1
...    ...
9995     2
9996     0
9997     2
9998     0
9999     2

[10000 rows x 1 columns]


In [156]:
X_transformed

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type
0,-0.952389,-0.947360,0.068185,0.282200,-1.695984,2
1,-0.902393,-0.879959,-0.729472,0.633308,-1.648852,1
2,-0.952389,-1.014761,-0.227450,0.944290,-1.617430,1
3,-0.902393,-0.947360,-0.590021,-0.048845,-1.586009,1
4,-0.902393,-0.879959,-0.729472,0.001313,-1.554588,1
...,...,...,...,...,...,...
9995,-0.602417,-1.082162,0.363820,-1.052012,-1.476034,2
9996,-0.552421,-1.082162,0.520005,-0.821283,-1.428902,0
9997,-0.502425,-0.947360,0.592519,-0.660777,-1.350349,2
9998,-0.502425,-0.879959,-0.729472,0.854005,-1.303217,0


In [157]:
label_encoder_y= LabelEncoder()
y = label_encoder_y.fit_transform(y)
y_transformed = pd.Series(y, name="Failure_Type")
y_transformed

0       1
1       1
2       1
3       1
4       1
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: Failure_Type, Length: 10000, dtype: int64

In [158]:
# Separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed,y_transformed,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 6), (2000, 6), (8000,), (2000,))

In [159]:
X_train

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type
9254,-0.852397,-0.610355,0.430756,-0.891505,1.367599,1
1561,-0.902393,-1.082162,-0.841032,1.385683,0.456380,1
1670,-0.902393,-1.486567,-0.060109,-0.891505,1.351888,1
6087,0.447499,0.535460,0.335930,-0.700903,-1.586009,2
6669,0.697480,0.333257,0.179746,-0.610618,1.571838,1
...,...,...,...,...,...,...
5734,1.147444,1.209468,-0.947014,1.626443,1.571838,1
5191,1.997377,2.153081,-0.684847,0.603213,0.314983,1
5390,1.397424,1.546473,-0.311120,0.723593,1.807498,0
860,-1.952310,-2.093175,0.012405,-0.741030,-1.177531,0


In [160]:
# Replace special characters in column names
X_train.columns = X_train.columns.str.replace(r'[^\w\s]', '', regex=True)
X_test.columns = X_test.columns.str.replace(r'[^\w\s]', '', regex=True)

In [161]:
X_train

Unnamed: 0,Air temperature K,Process temperature K,Rotational speed rpm,Torque Nm,Tool wear min,Type
9254,-0.852397,-0.610355,0.430756,-0.891505,1.367599,1
1561,-0.902393,-1.082162,-0.841032,1.385683,0.456380,1
1670,-0.902393,-1.486567,-0.060109,-0.891505,1.351888,1
6087,0.447499,0.535460,0.335930,-0.700903,-1.586009,2
6669,0.697480,0.333257,0.179746,-0.610618,1.571838,1
...,...,...,...,...,...,...
5734,1.147444,1.209468,-0.947014,1.626443,1.571838,1
5191,1.997377,2.153081,-0.684847,0.603213,0.314983,1
5390,1.397424,1.546473,-0.311120,0.723593,1.807498,0
860,-1.952310,-2.093175,0.012405,-0.741030,-1.177531,0


In [162]:
from imblearn.combine import SMOTETomek

# Initialize SMOTETomek
smote_tomek_failure = SMOTETomek(random_state=42)

# Apply SMOTETomek resampling
X_train_resampled, y_train_resampled = smote_tomek_failure.fit_resample(X_train, y_train)

# Check the shapes of the resampled data
print("Before resampling:", X_train.shape, y_train.shape)
print("After resampling:", X_train_resampled.shape, y_train_resampled.shape)

Before resampling: (8000, 6) (8000,)
After resampling: (46302, 6) (46302,)


In [163]:
X_train_resampled["Air temperature K"].value_counts()

Air temperature K
 0.347507    429
 1.197440    363
-0.452429    304
 0.197519    265
-0.502425    255
            ... 
 0.220060      1
-0.858026      1
-1.001723      1
-2.090464      1
 0.462380      1
Name: count, Length: 36134, dtype: int64

In [164]:
y_train_resampled.value_counts()

Failure_Type
1    7717
3    7717
0    7717
5    7717
2    7717
4    7717
Name: count, dtype: int64

In [165]:
# Check for duplicates in the resampled data
duplicates = X_train_resampled.duplicated().sum()
print(f"Number of duplicates: {duplicates}")


Number of duplicates: 0


#### Create an Evaluate Function to give all metrics after model Training

In [166]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average="weighted", zero_division=1)  # Adjust average as needed
    recall = recall_score(true, predicted, average="weighted", zero_division=1)
    F1_score = f1_score(true, predicted, average="weighted", zero_division=1)

    metrics = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": F1_score,
    }
    return metrics

In [167]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(objective="multi:softmax", num_class=6, random_state=42), 
}
model_list = []
f1_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    train_metrics = evaluate_model(y_train, y_train_pred)
    test_metrics = evaluate_model(y_test, y_test_pred)

    # Extract metrics for train and test datasets
    model_train_accuracy = train_metrics['Accuracy']
    model_train_precision = train_metrics['Precision']
    model_train_recall = train_metrics['Recall']
    model_train_f1 = train_metrics['F1 Score']
    
    model_test_accuracy = test_metrics['Accuracy']
    model_test_precision = test_metrics['Precision']
    model_test_recall = test_metrics['Recall']
    model_test_f1 = test_metrics['F1 Score']
 
    # print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print("- Precision {:.4f}".format(model_train_precision))
    print("- Recall: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- Precision {:.4f}".format(model_test_precision))
    print("- Recall: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1))
    f1_list.append(model_test_f1)

    print('='*35)
    print('\n')
# print(model_list)
# print(f1_list)

Model performance for Training set
- Accuracy: 0.9785
- Precision 0.9768
- Recall: 0.9785
- F1 Score: 0.9734
----------------------------------
Model performance for Test set
- Accuracy: 0.9765
- Precision 0.9736
- Recall: 0.9765
- F1 Score: 0.9701


Model performance for Training set
- Accuracy: 0.9735
- Precision 0.9721
- Recall: 0.9735
- F1 Score: 0.9651
----------------------------------
Model performance for Test set
- Accuracy: 0.9740
- Precision 0.9727
- Recall: 0.9740
- F1 Score: 0.9638


Model performance for Training set
- Accuracy: 0.9999
- Precision 0.9999
- Recall: 0.9999
- F1 Score: 0.9999
----------------------------------
Model performance for Test set
- Accuracy: 0.9815
- Precision 0.9812
- Recall: 0.9815
- F1 Score: 0.9769


Model performance for Training set
- Accuracy: 1.0000
- Precision 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9820
- Precision 0.9771
- Recall: 0.9820
- F1 Score: 0.977

### Results

#### Selecting the best model based on f1-score

In [168]:
pd.DataFrame(list(zip(model_list, f1_list)), columns=['Model Name', 'F1_Score']).sort_values(by=["F1_Score"],ascending=False)

Unnamed: 0,Model Name,F1_Score
3,XGBClassifier,0.977848
2,RandomForestClassifier,0.976881
0,Logistic Regression,0.97008
1,SVC,0.963807


## Plot y_pred and y_test

#### Comparison between Actual and Predicted Values

In [177]:
pred_df = pd.DataFrame({
    'Actual Value': y_test,  # True labels from the test set
    'Predicted Value': y_test_pred,  # Predicted labels from the model
    'Correct Prediction': (y_test == y_test_pred)  # Boolean indicating whether the prediction was correct or not
})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Correct Prediction
6252,1,1,True
4684,1,1,True
1731,1,1,True
4742,1,1,True
4521,1,1,True
...,...,...,...
6412,1,1,True
8285,1,1,True
7853,1,1,True
1095,3,3,True
