In [43]:
!pip install scikit-learn==1.3.0




In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [45]:
df = pd.read_csv('predictive_maintenance.csv')

In [46]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,51,L47230,L,298.9,309.1,2861,4.6,143,1,Power Failure
1,70,L47249,L,298.9,309.0,1410,65.7,191,1,Power Failure
2,78,L47257,L,298.8,308.9,1455,41.3,208,1,Tool Wear Failure
3,161,L47340,L,298.4,308.2,1282,60.7,216,1,Overstrain Failure
4,162,L47341,L,298.3,308.1,1412,52.3,218,1,Overstrain Failure


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      9998 non-null   int64  
 1   Product ID               9998 non-null   object 
 2   Type                     9998 non-null   object 
 3   Air temperature [K]      9998 non-null   float64
 4   Process temperature [K]  9998 non-null   float64
 5   Rotational speed [rpm]   9998 non-null   int64  
 6   Torque [Nm]              9998 non-null   float64
 7   Tool wear [min]          9998 non-null   int64  
 8   Target                   9998 non-null   int64  
 9   Failure Type             9998 non-null   object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.2+ KB


In [48]:
df['Type'].nunique()

3

In [49]:
df.isnull().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64

#Some Understandings : <br>
1. First we need to transform some columns by ordinal encoding/ label encoding<br>
2. There are no missing values<br>
3. `Type` Column has 3 types - L/M/H which is ordinal data , so apply ordinal encoding here<br>
The target variable here is `Target` only, i.e. to predict failure or not
4. Notice that, there are two output target variables `Target` and `Failure Type`<br> So prediect here multiclass Failure Type Variable<br>

5. This is a classification problem <br>

In [50]:
df[df['Target']==1]

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,51,L47230,L,298.9,309.1,2861,4.6,143,1,Power Failure
1,70,L47249,L,298.9,309.0,1410,65.7,191,1,Power Failure
2,78,L47257,L,298.8,308.9,1455,41.3,208,1,Tool Wear Failure
3,161,L47340,L,298.4,308.2,1282,60.7,216,1,Overstrain Failure
4,162,L47341,L,298.3,308.1,1412,52.3,218,1,Overstrain Failure
...,...,...,...,...,...,...,...,...,...,...
3753,2656,L56938,L,298.6,309.8,2271,16.2,218,1,Tool Wear Failure
3754,2657,L56944,L,298.5,309.5,1294,66.7,12,1,Power Failure
3755,2658,L57002,L,298.5,309.4,1360,60.9,187,1,Overstrain Failure
3756,2659,L57010,L,298.3,309.3,1337,56.1,206,1,Overstrain Failure


In [51]:
X = df.iloc[:,2:8]
y = df.iloc[:,-1]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
y_train.shape

(7998,)

In [52]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['L', 'M', 'H']])
oe.fit(X_train[['Type']])
X_train['Type'] = oe.transform(X_train[['Type']]).astype(int)
X_test['Type'] = oe.transform(X_test[['Type']]).astype(int)

In [53]:
X_train.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
8964,0,302.5,311.9,1401,44.9,203
1561,1,303.2,311.5,1336,58.2,2
1670,0,297.3,308.1,1615,35.4,217
4345,1,297.9,309.8,1693,29.9,35
8099,0,303.2,311.2,1427,45.6,25


In [54]:
from sklearn.preprocessing import LabelEncoder
categories = ['No Failure', 'Heat Dissipation Failure', 'Power Failure', 'Overstrain Failure', 'Tool Wear Failure', 'Random Failures']
custom_encoder = {cat: i for i, cat in enumerate(categories)}
y_train_encoded = [custom_encoder.get(cat, len(categories)) for cat in y_train]
y_test_encoded = [custom_encoder.get(cat, len(categories)) for cat in y_test]
le = LabelEncoder()
le.fit(list(custom_encoder.values()))
y_train = le.transform(y_train_encoded)
y_test = le.transform(y_test_encoded)



In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

clf = LogisticRegression(solver='lbfgs', max_iter=10000)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

log_train = round(clf.score(X_train, y_train) * 100, 2)
log_accuracy = round(accuracy_score(y_pred, y_test) * 100, 2)


print("Training Accuracy    :",log_train ,"%")
print("Model Accuracy Score :",log_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Classification_Report: \n",classification_report(y_test,y_pred))
print("\033[1m--------------------------------------------------------\033[0m")


Training Accuracy    : 91.44 %
Model Accuracy Score : 92.8 %
[1m--------------------------------------------------------[0m
Classification_Report: 
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      1251
           1       0.88      0.88      0.88       225
           2       0.96      0.96      0.96       217
           3       0.91      0.96      0.93       218
           4       0.62      0.54      0.58        87
           5       0.00      0.00      0.00         2

    accuracy                           0.93      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.93      0.93      0.93      2000

[1m--------------------------------------------------------[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
decision.fit(X_train, y_train)
y_pred_dec = decision.predict(X_test)

decision_train = round(decision.score(X_train, y_train) * 100, 2)
decision_accuracy = round(accuracy_score(y_pred_dec, y_test) * 100, 2)

print("Training Accuracy    :",decision_train ,"%")
print("Model Accuracy Score :",decision_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Classification_Report: \n",classification_report(y_test,y_pred_dec))
print("\033[1m--------------------------------------------------------\033[0m")


Training Accuracy    : 100.0 %
Model Accuracy Score : 98.6 %
[1m--------------------------------------------------------[0m
Classification_Report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1251
           1       0.98      1.00      0.99       225
           2       1.00      1.00      1.00       217
           3       0.99      1.00      0.99       218
           4       0.88      1.00      0.94        87
           5       0.00      0.00      0.00         2

    accuracy                           0.99      2000
   macro avg       0.81      0.83      0.82      2000
weighted avg       0.99      0.99      0.99      2000

[1m--------------------------------------------------------[0m


In [57]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
random_forest.score(X_train, y_train)

random_forest_train = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_accuracy = round(accuracy_score(y_pred_rf, y_test) * 100, 2)

print("Training Accuracy    :",random_forest_train ,"%")
print("Model Accuracy Score :",random_forest_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Classification_Report: \n",classification_report(y_test,y_pred_rf))
print("\033[1m--------------------------------------------------------\033[0m")

Training Accuracy    : 100.0 %
Model Accuracy Score : 99.5 %
[1m--------------------------------------------------------[0m
Classification_Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1251
           1       0.98      1.00      0.99       225
           2       1.00      1.00      1.00       217
           3       0.99      1.00      1.00       218
           4       0.98      1.00      0.99        87
           5       0.00      0.00      0.00         2

    accuracy                           0.99      2000
   macro avg       0.82      0.83      0.83      2000
weighted avg       0.99      0.99      0.99      2000

[1m--------------------------------------------------------[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
# Support Vector Machines
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

svc_train = round(svc.score(X_train, y_train) * 100, 2)
svc_accuracy = round(accuracy_score(y_pred_svc, y_test) * 100, 2)

print("Training Accuracy    :",svc_train ,"%")
print("Model Accuracy Score :",svc_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Classification_Report: \n",classification_report(y_test,y_pred_svc))
print("\033[1m--------------------------------------------------------\033[0m")

Training Accuracy    : 74.64 %
Model Accuracy Score : 74.3 %
[1m--------------------------------------------------------[0m
Classification_Report: 
               precision    recall  f1-score   support

           0       0.79      0.94      0.86      1251
           1       0.40      0.25      0.31       225
           2       0.98      0.27      0.43       217
           3       0.62      0.88      0.73       218
           4       0.00      0.00      0.00        87
           5       0.00      0.00      0.00         2

    accuracy                           0.74      2000
   macro avg       0.47      0.39      0.39      2000
weighted avg       0.71      0.74      0.70      2000

[1m--------------------------------------------------------[0m


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##Random Forest is having 99.99% acc and 98% prec. So lets use that here

In [62]:
import pickle
pickle.dump(random_forest,open('model.pkl','wb'))


In [63]:
import joblib

# Assuming you have 'random_forest' as your trained model
joblib.dump(random_forest, 'model.joblib')

ValueError: Non valid compression method given: "rb". Possible values are {'zlib': <joblib.compressor.ZlibCompressorWrapper object at 0x000002ABC5A87310>, 'gzip': <joblib.compressor.GzipCompressorWrapper object at 0x000002ABC4232C90>, 'bz2': <joblib.compressor.BZ2CompressorWrapper object at 0x000002ABC47B9A10>, 'lzma': <joblib.compressor.LZMACompressorWrapper object at 0x000002ABC5575590>, 'xz': <joblib.compressor.XZCompressorWrapper object at 0x000002ABC4E6D150>, 'lz4': <joblib.compressor.LZ4CompressorWrapper object at 0x000002ABC5E41FD0>}.

In [61]:
columns = df.columns.tolist()
columns

['UDI',
 'Product ID',
 'Type',
 'Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'Target',
 'Failure Type']