In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import pickle
import matplotlib.pyplot as plt

In [35]:
# 1. Load the dataset
df = pd.read_csv('../artifacts/industrial_motor_sensor_data_8000.csv')

In [36]:
# 2. Drop duplicates if any
df = df.drop_duplicates()
print(f"Dataset size after removing duplicates: {df.shape}")

Dataset size after removing duplicates: (8000, 5)


In [37]:
# 3. Features and Label
X = df.drop("Label", axis=1)
y = df["Label"]

In [38]:
# 4. Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [39]:
# 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)


In [40]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [41]:
#Handle imbalance (if needed)
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [42]:
#  Grid Search for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [43]:
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train_smote, y_train_smote)

In [44]:
# Best model training
best_params_rf = grid_search_rf.best_params_
print(f"Best parameters for Random Forest: {best_params_rf}")

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}


In [45]:
rf = RandomForestClassifier(**best_params_rf)
rf.fit(X_train_smote, y_train_smote)

In [46]:
#Evaluation Functions
def training_scores(y_true, y_pred):
    print("\n📊 Training Scores:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.3f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.3f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.3f}")

def testing_scores(y_true, y_pred):
    print("\n📊 Testing Scores:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.3f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.3f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.3f}")


In [47]:
#Evaluate
y_train_pred = rf.predict(X_train_smote)
y_test_pred = rf.predict(X_test)

training_scores(y_train_smote, y_train_pred)
testing_scores(y_test, y_test_pred)



📊 Training Scores:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F1 Score: 1.000

📊 Testing Scores:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F1 Score: 1.000


In [48]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_smote, y_train_smote)
training_scores(y_train_smote, lr.predict(X_train_smote))
testing_scores(y_test, lr.predict(X_test))


📊 Training Scores:
Accuracy: 0.888
Precision: 0.888
Recall: 0.888
F1 Score: 0.888

📊 Testing Scores:
Accuracy: 0.902
Precision: 0.903
Recall: 0.902
F1 Score: 0.903


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
dt = DecisionTreeClassifier()
dt.fit(X_train_smote, y_train_smote)
training_scores(y_train_smote, dt.predict(X_train_smote))
testing_scores(y_test, dt.predict(X_test))


📊 Training Scores:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F1 Score: 1.000

📊 Testing Scores:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F1 Score: 1.000


In [50]:
print("\n🔹 Support Vector Machine:")
svm = SVC()
svm.fit(X_train_smote, y_train_smote)
training_scores(y_train_smote, svm.predict(X_train_smote))
testing_scores(y_test, svm.predict(X_test))


🔹 Support Vector Machine:

📊 Training Scores:
Accuracy: 0.869
Precision: 0.871
Recall: 0.869
F1 Score: 0.869

📊 Testing Scores:
Accuracy: 0.886
Precision: 0.888
Recall: 0.886
F1 Score: 0.886


In [51]:
print("\n🔹 Gaussian Naive Bayes:")
nb = GaussianNB()
nb.fit(X_train_smote, y_train_smote)
training_scores(y_train_smote, nb.predict(X_train_smote))
testing_scores(y_test, nb.predict(X_test))


🔹 Gaussian Naive Bayes:

📊 Training Scores:
Accuracy: 0.956
Precision: 0.956
Recall: 0.956
F1 Score: 0.956

📊 Testing Scores:
Accuracy: 0.957
Precision: 0.957
Recall: 0.957
F1 Score: 0.957


## save the model

In [52]:
#Save the best-performing model 
with open('../models/machine_failure_model.pickle', 'wb') as file:
    pickle.dump(nb, file)

with open('../models/machine_failure_label_encoder.pickle', 'wb') as file:
    pickle.dump(le, file)    