### This notebook is to do some basic high level EDA 

In [44]:
#Importing standard libraries
import os
import sys
import joblib
import pandas as pd
import numpy as np
import sweetviz as sv

from pathlib import Path
from imblearn.over_sampling import SMOTE,ADASYN
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator,TransformerMixin

In [2]:
Path().resolve().parent

WindowsPath('C:/Users/asus/Documents/Projects/Predictive_maintenance')

In [3]:
#importing data
data_path=Path().resolve().parent/'data/extracted'
raw_data_path=data_path/'ai4i2020.csv'
raw_df=pd.read_csv(raw_data_path)
raw_df.head(4)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0


### 

### Data exploration

In [4]:
#checking the shape of data
raw_df.shape

(10000, 14)

* Rows: 10000
* Columns: 14

In [5]:
#Missing values?
raw_df.isna().sum()

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

* There aren't any missing values in any of the columns

### Data preparation

In [6]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [7]:
# T1
numeric_cols=['Air temperature [K]','Process temperature [K]','Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']
categorical_cols=['Type','Machine failure']
numeric_cols+categorical_cols

['Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'Type',
 'Machine failure']

In [8]:
#Let's generate an eda report using sweetviz
report_path=Path().resolve().parent/'reports'
report=sv.analyze(raw_df[numeric_cols+categorical_cols])
report.show_html(report_path/'01_immrm_eda_report.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:04 -> (00:00 left)


Report C:\Users\asus\Documents\Projects\Predictive_maintenance\reports\01_immrm_eda_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


##### Observations from report:
* This is highly imbalanced dataset
* Machine failure: 0-> 9661 and 1-> 339
 

` We need to balance the dataset using some oversampling technique`
* SMOTE
* ADASYN

##### Implementing smote technique

In [9]:
#Before oversampling
Counter(raw_df['Machine failure'])

Counter({0: 9661, 1: 339})

In [10]:
X,Y=raw_df[numeric_cols+categorical_cols].drop('Machine failure',axis=1),raw_df['Machine failure']


In [11]:
# T2

#Normalizing the numerical data
#removing our target variable from this array
categorical_cols=['Type']
scale=StandardScaler()
scaled_df=scale.fit_transform(raw_df[numeric_cols])
num_std=pd.DataFrame(scaled_df,columns=numeric_cols)

#encoding the columns
encoder=OneHotEncoder()
encoded_df=encoder.fit_transform(raw_df[categorical_cols]).toarray()
encoded_df_features=encoder.get_feature_names_out(categorical_cols)

cat_std=pd.DataFrame(encoded_df,columns=encoded_df_features)

#concatenating both 
final_std_df=pd.concat([num_std,cat_std,Y],axis=1)
final_std_df


Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M,Machine failure
0,-0.952389,-0.947360,0.068185,0.282200,-1.695984,0.0,0.0,1.0,0
1,-0.902393,-0.879959,-0.729472,0.633308,-1.648852,0.0,1.0,0.0,0
2,-0.952389,-1.014761,-0.227450,0.944290,-1.617430,0.0,1.0,0.0,0
3,-0.902393,-0.947360,-0.590021,-0.048845,-1.586009,0.0,1.0,0.0,0
4,-0.902393,-0.879959,-0.729472,0.001313,-1.554588,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...
9995,-0.602417,-1.082162,0.363820,-1.052012,-1.476034,0.0,0.0,1.0,0
9996,-0.552421,-1.082162,0.520005,-0.821283,-1.428902,1.0,0.0,0.0,0
9997,-0.502425,-0.947360,0.592519,-0.660777,-1.350349,0.0,0.0,1.0,0
9998,-0.502425,-0.879959,-0.729472,0.854005,-1.303217,1.0,0.0,0.0,0


In [12]:
#generating a report post standardization
report_post_processing=sv.analyze(final_std_df)
report_post_processing.show_html(report_path/'02_immrm_pre_processing.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:04 -> (00:00 left)


Report C:\Users\asus\Documents\Projects\Predictive_maintenance\reports\02_immrm_pre_processing.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [15]:
#Generating synthetic points for balancing out the classes

#using SMOTE
smote=SMOTE(random_state=42)
X_smote,Y_smote=smote.fit_resample(final_std_df.drop('Machine failure',axis=1),final_std_df['Machine failure'])



In [16]:
Counter(Y_smote)

Counter({0: 9661, 1: 9661})

In [17]:
#using ADASYN
ada=ADASYN(random_state=42)
X_ada,Y_ada=ada.fit_resample(final_std_df.drop('Machine failure',axis=1),final_std_df['Machine failure'])
Counter(Y_ada)

Counter({0: 9661, 1: 9657})

* So we now have balanced data 

#### Modelling

In [26]:
X_smote_train,X_smote_test,Y_smote_train,Y_smote_test=train_test_split(X_smote,Y_smote,test_size=0.7,random_state=42)

X_ada_train,X_ada_test,Y_ada_train,Y_ada_test=train_test_split(X_ada,Y_ada,test_size=0.7,random_state=42)
#splitting done

In [24]:
#KNN Classifier

#on smote data
knn_smote=KNeighborsClassifier()
model_knn=knn_smote.fit(X_smote_train,Y_smote_train)
Y_smote_knn_train_pred=model_knn.predict(X_smote_train)
Y_smote_knn_test_pred=model_knn.predict(X_smote_test)

In [25]:
print("Train accuracy of KNN model: {0}".format(accuracy_score(Y_smote_train,Y_smote_knn_train_pred)))
print("Test accuracy of KNN model: {0}".format(accuracy_score(Y_smote_test,Y_smote_knn_test_pred)))


Train accuracy of KNN model: 0.966183574879227
Test accuracy of KNN model: 0.9488392725121987


In [27]:
#on adasyn data
knn_ada=KNeighborsClassifier()
model_knn_ada=knn_ada.fit(X_ada_train,Y_ada_train)
Y_ada_knn_train_pred=model_knn_ada.predict(X_ada_train)
Y_ada_knn_test_pred=model_knn_ada.predict(X_ada_test)



In [28]:
print("Train accuracy of KNN model: {0}".format(accuracy_score(Y_ada_train,Y_ada_knn_train_pred)))
print("Test accuracy of KNN model: {0}".format(accuracy_score(Y_ada_test,Y_ada_knn_test_pred)))


Train accuracy of KNN model: 0.9611734253666955
Test accuracy of KNN model: 0.9432818161650521


In [29]:
#Logistic regressioin
#on smote data
log_reg_smote=LogisticRegression(random_state=42)
model_log=log_reg_smote.fit(X_smote_train,Y_smote_train)
Y_smote_log_train_pred=model_log.predict(X_smote_train)
Y_smote_log_test_pred=model_knn.predict(X_smote_test)

In [32]:
print("Train accuracy of LOG REG model: {0}".format(accuracy_score(Y_smote_train,Y_smote_log_train_pred)))
print("Test accuracy of LOG REG model: {0}".format(accuracy_score(Y_smote_test,Y_smote_log_test_pred)))


Train accuracy of LOG REG model: 0.8188405797101449
Test accuracy of LOG REG model: 0.9488392725121987


In [34]:
#on adasyn data
log_ada=LogisticRegression(random_state=42)
model_log_ada=log_ada.fit(X_ada_train,Y_ada_train)
Y_ada_log_train_pred=model_log_ada.predict(X_ada_train)
Y_ada_log_test_pred=model_log_ada.predict(X_ada_test)

In [35]:
print("Train accuracy of LOG REG model: {0}".format(accuracy_score(Y_ada_train,Y_ada_log_train_pred)))
print("Test accuracy of LOG REG model: {0}".format(accuracy_score(Y_ada_test,Y_ada_log_test_pred)))

Train accuracy of LOG REG model: 0.7955133735979293
Test accuracy of LOG REG model: 0.7925756119204319


##### Observation:
* So here we're clear that KNN algorithm works far better compared to logistic regression.

In [42]:
# Let's make a Transformation pipeline
class CustomTransformer(BaseEstimator,TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        numeric_cols=['Air temperature [K]','Process temperature [K]','Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']
        categorical_cols=['Type']
    def fit(self,X,y=None):
        return self
    def transform(self,X,Y):
        #Normalizing the numerical data
        #removing our target variable from this array
        X=X.copy()
        scale=StandardScaler()
        scaled_df=scale.fit_transform(X[numeric_cols])
        num_std=pd.DataFrame(scaled_df,columns=numeric_cols)

        #encoding the columns
        encoder=OneHotEncoder()
        encoded_df=encoder.fit_transform(X[categorical_cols]).toarray()
        encoded_df_features=encoder.get_feature_names_out(categorical_cols)

        cat_std=pd.DataFrame(encoded_df,columns=encoded_df_features)

        #concatenating both 
        final_std_df=pd.concat([num_std,cat_std,Y],axis=1)
        return final_std_df
        
        
        

In [43]:
#testing the transformer
transformer=CustomTransformer()
data=transformer.transform(raw_df.drop('Machine failure',axis=1),raw_df['Machine failure'])
data

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M,Machine failure
0,-0.952389,-0.947360,0.068185,0.282200,-1.695984,0.0,0.0,1.0,0
1,-0.902393,-0.879959,-0.729472,0.633308,-1.648852,0.0,1.0,0.0,0
2,-0.952389,-1.014761,-0.227450,0.944290,-1.617430,0.0,1.0,0.0,0
3,-0.902393,-0.947360,-0.590021,-0.048845,-1.586009,0.0,1.0,0.0,0
4,-0.902393,-0.879959,-0.729472,0.001313,-1.554588,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...
9995,-0.602417,-1.082162,0.363820,-1.052012,-1.476034,0.0,0.0,1.0,0
9996,-0.552421,-1.082162,0.520005,-0.821283,-1.428902,1.0,0.0,0.0,0
9997,-0.502425,-0.947360,0.592519,-0.660777,-1.350349,0.0,0.0,1.0,0
9998,-0.502425,-0.879959,-0.729472,0.854005,-1.303217,1.0,0.0,0.0,0


In [45]:
root=Path().resolve().parent/'models'
model_transformer=root/'transformer.joblib'
joblib.dump(transformer,model_transformer)

['C:\\Users\\asus\\Documents\\Projects\\Predictive_maintenance\\models\\transformer.joblib']