In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report


warnings.filterwarnings('ignore')
print('Libraries imported.')

Libraries imported.


In [3]:
processed_df = pd.read_csv('./data/processed/data_processed.csv')
processed_df.head()

Unnamed: 0,Type,Machine failure,Power,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [C],Process temperature [C],TYPE_OF_FAILURE
0,1.0,0,66382.8,0.222934,0.535714,0.0,0.304348,0.358025,1
1,0.0,0,65190.4,0.139697,0.583791,0.011858,0.315217,0.37037,1
2,0.0,0,74001.2,0.192084,0.626374,0.019763,0.304348,0.345679,1
3,0.0,0,56603.5,0.154249,0.490385,0.027668,0.315217,0.358025,1
4,0.0,0,56320.0,0.139697,0.497253,0.035573,0.315217,0.37037,1


In [5]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     57912 non-null  float64
 1   Machine failure          57912 non-null  int64  
 2   Power                    57912 non-null  float64
 3   Rotational speed [rpm]   57912 non-null  float64
 4   Torque [Nm]              57912 non-null  float64
 5   Tool wear [min]          57912 non-null  float64
 6   Air temperature [C]      57912 non-null  float64
 7   Process temperature [C]  57912 non-null  float64
 8   TYPE_OF_FAILURE          57912 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 4.0 MB


In [6]:
processed_df.drop(['Rotational speed [rpm]', 'Torque [Nm]'],axis=1, inplace=True)

In [8]:

X = processed_df.drop(['Machine failure', 'TYPE_OF_FAILURE'], axis=1)
y = processed_df['Machine failure']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Predicting Machine Failure

In [16]:
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

models = [lr, svc, dt, rf]
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    scores.append([acc, prec, rec, f1])

In [17]:
scores_df = pd.DataFrame(data=scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'], index=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'])

In [18]:
scores_df

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,66.563067,66.563067,100.0,79.925362
SVC,70.34447,76.890175,79.273671,78.063733
Decision Tree,98.955366,98.992899,99.442283,99.217082
Random Forest,99.274799,99.136598,99.779507,99.457014


In [20]:
scores_df['F1'].idxmax()

'Random Forest'

In [22]:
best_model_name = scores_df['F1'].idxmax()
best_model = models[scores_df.index.get_loc(best_model_name)]
best_model

In [23]:
report = classification_report(y_test, best_model.predict(X_test), output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.995553,0.982701,0.989085,3873.0
1,0.991366,0.997795,0.99457,7710.0
accuracy,0.992748,0.992748,0.992748,0.992748
macro avg,0.99346,0.990248,0.991828,11583.0
weighted avg,0.992766,0.992748,0.992736,11583.0


In [24]:
obj = report_df.to_json()
print(obj)

{"precision":{"0":0.9955532304,"1":0.9913659794,"accuracy":0.9927479927,"macro avg":0.9934596049,"weighted avg":0.9927660677},"recall":{"0":0.9827007488,"1":0.9977950713,"accuracy":0.9927479927,"macro avg":0.9902479101,"weighted avg":0.9927479927},"f1-score":{"0":0.9890852391,"1":0.9945701357,"accuracy":0.9927479927,"macro avg":0.9918276874,"weighted avg":0.9927361545},"support":{"0":3873.0,"1":7710.0,"accuracy":0.9927479927,"macro avg":11583.0,"weighted avg":11583.0}}


In [25]:
best_model.predict(X_test)

array([1, 1, 0, ..., 1, 0, 0])

In [26]:
report = classification_report(y_test, best_model.predict(X_test), output_dict=True)
print(report)
report  = pd.DataFrame(report).transpose()
obj  = report.to_json()
obj

{'0': {'precision': 0.9955532304472927, 'recall': 0.9827007487735605, 'f1-score': 0.989085239085239, 'support': 3873.0}, '1': {'precision': 0.9913659793814433, 'recall': 0.9977950713359274, 'f1-score': 0.9945701357466064, 'support': 7710.0}, 'accuracy': 0.9927479927479927, 'macro avg': {'precision': 0.993459604914368, 'recall': 0.9902479100547439, 'f1-score': 0.9918276874159226, 'support': 11583.0}, 'weighted avg': {'precision': 0.9927660677331687, 'recall': 0.9927479927479927, 'f1-score': 0.9927361545008604, 'support': 11583.0}}


'{"precision":{"0":0.9955532304,"1":0.9913659794,"accuracy":0.9927479927,"macro avg":0.9934596049,"weighted avg":0.9927660677},"recall":{"0":0.9827007488,"1":0.9977950713,"accuracy":0.9927479927,"macro avg":0.9902479101,"weighted avg":0.9927479927},"f1-score":{"0":0.9890852391,"1":0.9945701357,"accuracy":0.9927479927,"macro avg":0.9918276874,"weighted avg":0.9927361545},"support":{"0":3873.0,"1":7710.0,"accuracy":0.9927479927,"macro avg":11583.0,"weighted avg":11583.0}}'

In [27]:
rep = pd.read_json(obj)
rep

Unnamed: 0,precision,recall,f1-score,support
0,0.995553,0.982701,0.989085,3873.0
1,0.991366,0.997795,0.99457,7710.0
accuracy,0.992748,0.992748,0.992748,0.992748
macro avg,0.99346,0.990248,0.991828,11583.0
weighted avg,0.992766,0.992748,0.992736,11583.0


In [28]:
type = 'M'

if type == 'L':
        type = 0
elif type == 'M':
    type = 1
elif type == 'H':
    type = 2

type = float(type)
type

1.0

In [30]:
best_model.predict([[0.175738,0.477421,0.823187,0.363062,0.352309]])

array([1])

In [31]:
processed_df.iloc[57907]

Type                           0.950308
Machine failure                1.000000
Power                      64588.442649
Tool wear [min]                0.815518
Air temperature [C]            0.440791
Process temperature [C]        0.492600
TYPE_OF_FAILURE                5.000000
Name: 57907, dtype: float64

Random Forest Classifier is th best performing model.

## Train Test Split

In [32]:
X = processed_df.drop(['Machine failure', 'TYPE_OF_FAILURE'], axis=1)
y = processed_df['TYPE_OF_FAILURE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Predicting Type of Failure

In [33]:
lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

models = [lr, svc, dt, rf]
scores = []

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average='macro') * 100
    rec = recall_score(y_test, y_pred, average='macro') * 100
    f1 = f1_score(y_test, y_pred, average='macro') * 100
    scores.append([acc, prec, rec, f1])

In [34]:
scores_df = pd.DataFrame(columns=['Model'], data=['Logistic Regression', 'SVC', 'Decision Tree', 'Random Forest'])
scores_df = pd.concat([scores_df, pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1'])], axis=1)
scores_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,16.68825,2.781375,16.666667,4.76719
1,SVC,51.851852,43.731689,51.730842,44.625722
2,Decision Tree,98.592765,98.587688,98.600501,98.59123
3,Random Forest,99.222999,99.222782,99.230443,99.221912


Random Forest Classifier is the best performing model.