# Model Training

In [1]:
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation or convergence warnings that later cells may raise.
warnings.filterwarnings('ignore')
# Simple confirmation that the import cell ran to completion.
print('Libraries imported.')

Libraries imported.


In [2]:
# Path of the processed dataset CSV, relative to this notebook.
processed_csv = '../data/processed/data_processed.csv'
df = pd.read_csv(processed_csv)

# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,1.0,0,0.222934,0.535714,0.0,0.304348,0.358025,5
1,0.0,0,0.139697,0.583791,0.011858,0.315217,0.37037,5
2,0.0,0,0.192084,0.626374,0.019763,0.304348,0.345679,5
3,0.0,0,0.154249,0.490385,0.027668,0.315217,0.358025,5
4,0.0,0,0.139697,0.497253,0.035573,0.315217,0.37037,5


In [3]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     57912 non-null  float64
 1   Machine failure          57912 non-null  int64  
 2   Rotational speed [rpm]   57912 non-null  float64
 3   Torque [Nm]              57912 non-null  float64
 4   Tool wear [min]          57912 non-null  float64
 5   Air temperature [c]      57912 non-null  float64
 6   Process temperature [c]  57912 non-null  float64
 7   type_of_failure          57912 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 3.5 MB


## Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Both label columns are removed from the feature matrix so that neither
# target is available to the model as an input.
label_columns = ['Machine failure', 'type_of_failure']
X = df.drop(columns=label_columns)
y = df['Machine failure']

# 80/20 split with a fixed seed so the same rows are held out on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Predicting Machine Failure

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

# Candidate classifiers for the binary 'Machine failure' target.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (same seed as the train/test split) to make the reported scores
# reproducible on Restart & Run All.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Order matters: later cells look models up by their position in this list.
models = [lr, svc, dt, rf]
# One [accuracy, precision, recall, f1] row per model, in the same order as `models`.
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # All metrics are stored as percentages; precision/recall/F1 use the
    # scorers' default averaging for a binary target.
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    scores.append([acc, prec, rec, f1])


In [6]:
# Display names for the fitted models, listed in the same order as `models`.
model_names = pd.Series(
    ['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'],
    name='Model',
)
# One row of percentage metrics per model, taken from the training loop.
metric_table = pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])

# Side-by-side join on the shared default index gives the comparison table.
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.458517,86.403619,89.182879,87.771254
1,SVC,96.054563,94.887981,99.429313,97.10558
2,Decision Tree,99.110766,98.970001,99.701686,99.334496
3,Random Forest,99.231633,98.921556,99.935149,99.425769


In [7]:
# Row label of the highest F1 score. scores_df carries a default integer
# index, so this label is also the model's position in `models`.
best_model_idx = scores_df['F1'].idxmax()

# The display name gets its own variable so that `best_model` only ever
# refers to the fitted estimator object bound in a later cell.
best_model_name = scores_df.loc[best_model_idx, 'Model']
best_model_name

'Random Forest'

In [10]:
# Fetch the fitted estimator at the winning position; this relies on `models`
# being in the same order as the rows of scores_df.
best_model = models[best_model_idx]

In [11]:
# Per-class and averaged metrics for the selected model on the held-out split.
predictions = best_model.predict(X_test)
report_dict = classification_report(y_test, predictions, output_dict=True)
print(report_dict)

# Reshape to one row per class/average, then serialise to a JSON string.
report = pd.DataFrame(report_dict).T
obj = report.to_json()
obj


{'0': {'precision': 0.9986821296784396, 'recall': 0.9783113865220759, 'f1-score': 0.9883918090517805, 'support': 3873}, '1': {'precision': 0.9892155604057004, 'recall': 0.9993514915693904, 'f1-score': 0.9942576940447772, 'support': 7710}, 'accuracy': 0.992316325649659, 'macro avg': {'precision': 0.9939488450420699, 'recall': 0.9888314390457331, 'f1-score': 0.9913247515482788, 'support': 11583}, 'weighted avg': {'precision': 0.9923808908721874, 'recall': 0.992316325649659, 'f1-score': 0.9922963219841818, 'support': 11583}}


'{"precision":{"0":0.9986821297,"1":0.9892155604,"accuracy":0.9923163256,"macro avg":0.993948845,"weighted avg":0.9923808909},"recall":{"0":0.9783113865,"1":0.9993514916,"accuracy":0.9923163256,"macro avg":0.988831439,"weighted avg":0.9923163256},"f1-score":{"0":0.9883918091,"1":0.994257694,"accuracy":0.9923163256,"macro avg":0.9913247515,"weighted avg":0.992296322},"support":{"0":3873.0,"1":7710.0,"accuracy":0.9923163256,"macro avg":11583.0,"weighted avg":11583.0}}'

In [12]:
# Round-trip check: rebuild the report table from its JSON string.
# The string is wrapped in StringIO because passing literal JSON text directly
# to pd.read_json is deprecated in recent pandas releases; a file-like object
# is accepted by both old and new versions.
rep = pd.read_json(StringIO(obj))
rep

Unnamed: 0,precision,recall,f1-score,support
0,0.998682,0.978311,0.988392,3873.0
1,0.989216,0.999351,0.994258,7710.0
accuracy,0.992316,0.992316,0.992316,0.992316
macro avg,0.993949,0.988831,0.991325,11583.0
weighted avg,0.992381,0.992316,0.992296,11583.0


In [13]:
# NOTE(review): `type` shadows the Python builtin of the same name; the name
# is kept because the following cell reads it when building the sample.
type = 'M'

# Numeric codes for the 'L', 'M' and 'H' type labels.
type_codes = {'L': 0, 'M': 1, 'H': 2}

# Labels outside the table are handed to float() unchanged.
type = float(type_codes.get(type, type))
type

1.0

In [14]:
# Score one hand-entered sample. The values are wrapped in a DataFrame that
# reuses the training column names, so the feature order is explicit and the
# input carries the same feature names the estimator was fitted with.
sample = pd.DataFrame(
    [[type, 0.175738, 0.477421, 0.823187, 0.363062, 0.352309]],
    columns=X_train.columns,
)
best_model.predict(sample)

array([1])

In [15]:
# Look at one row by position, showing both label columns next to its features.
df.iloc[57907]

Type                       1.000000
Machine failure            1.000000
Rotational speed [rpm]     0.515315
Torque [Nm]                0.212765
Tool wear [min]            0.867413
Air temperature [c]        0.586957
Process temperature [c]    0.628594
type_of_failure            4.000000
Name: 57907, dtype: float64

Random Forest Classifier is the best performing model.

## Train Test Split

In [16]:
# Same feature matrix as before (both label columns removed); the target is
# now the `type_of_failure` column.
label_columns = ['Machine failure', 'type_of_failure']
X = df.drop(columns=label_columns)
y = df['type_of_failure']

# 80/20 split with a fixed seed so the same rows are held out on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Predicting Type of Failure

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Fresh, unfitted classifiers for the multi-class `type_of_failure` target.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (same seed as the train/test split) to make the reported scores
# reproducible on Restart & Run All.
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Order matters: the results table labels rows by position in this list.
models = [lr, svc, dt, rf]
# One [accuracy, precision, recall, f1] row per model, in the same order as `models`.
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # All metrics are stored as percentages. Macro averaging gives every
    # class equal weight regardless of how many samples it has.
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])

In [18]:
# Display names for the fitted models, listed in the same order as `models`.
model_names = pd.Series(
    ['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'],
    name='Model',
)
# One row of percentage metrics per model, taken from the training loop.
metric_table = pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])

# Side-by-side join on the shared default index gives the comparison table.
scores_df = pd.concat([model_names, metric_table], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,83.579384,82.877003,83.641543,83.128912
1,SVC,94.146594,94.376026,94.212137,93.946938
2,Decision Tree,98.653199,98.654525,98.666208,98.654019
3,Random Forest,99.179833,99.190728,99.191587,99.180149


Random Forest Classifier is the best performing model.