# Data Preprocessing

1. Check for missing values: Check if there are any missing values in the dataset and decide on how to handle them. If there are a lot of missing values, you may consider dropping those rows or imputing them with appropriate values.

2. Check for duplicates: Check if there are any duplicate rows in the dataset and remove them if necessary.

3. Data type conversion: Check if the data types of the columns are appropriate. For example, the 'UDI' column should be of integer data type, while the 'Product ID' column should be categorical.

4. Check for outliers: Check if there are any outliers in the dataset, especially in the continuous variables such as 'air temperature', 'process temperature', 'rotational speed', 'torque', and 'tool wear'. You may consider removing or adjusting the outliers depending on the context.

5. Feature engineering: Create new features that may be relevant for predictive maintenance. For example, you may create a new feature that combines the 'air temperature' and 'process temperature' to represent the temperature difference, which may be an important indicator of machine failure.

6. Label encoding: Convert the categorical variable 'Product ID' to numerical values using label encoding or one-hot encoding, depending on the algorithm you plan to use.

7. Feature scaling: Normalize or standardize the continuous variables to ensure that they have similar ranges. This will help the machine learning algorithm to converge faster.

8. Balance the dataset: Check if the dataset is balanced in terms of the 'machine failure' label. If there are a lot more non-failure instances than failure instances, you may consider oversampling or undersampling to balance the dataset.

9. Save the preprocessed dataset: Save the preprocessed dataset in a suitable format, such as CSV or Parquet, for future use.

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print('Libraries imported')

Libraries imported


In [24]:
# Load the raw dataset from the project's data folder and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')
df.head(n=5)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


## Missing values

In [25]:
# Percentage of missing entries per column (the mean of a boolean mask is the fraction of True values).
df.isna().mean() * 100

UDI                        0.0
Product ID                 0.0
Type                       0.0
Air temperature [K]        0.0
Process temperature [K]    0.0
Rotational speed [rpm]     0.0
Torque [Nm]                0.0
Tool wear [min]            0.0
Machine failure            0.0
TWF                        0.0
HDF                        0.0
PWF                        0.0
OSF                        0.0
RNF                        0.0
dtype: float64

No missing values: every column is 0% missing.

## Check for duplicates

In [26]:
# Number of rows whose 'Product ID' repeats an earlier row's value.
df.duplicated(subset='Product ID').sum()

0

In [27]:
# Number of rows whose 'UDI' repeats an earlier row's value.
df.duplicated(subset='UDI').sum()

0

No duplicate 'Product ID' or 'UDI' values found.

## Data type conversion

In [28]:
# Print each column's dtype and non-null count to check whether any type conversion is needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

## Feature engineering

### creating 'type of failure' feature

In [29]:
def type_of_failure(row_name):
    """Write the failure-mode label for one row into the global ``df``.

    The flag columns are checked in the fixed order TWF, HDF, PWF, OSF, RNF and
    the first one equal to 1 becomes the row's ``'type_of_failure'`` value.
    Rows with no raised flag are left untouched.

    Parameters
    ----------
    row_name : index label
        Label of the row in ``df`` to inspect and update.

    Returns
    -------
    None
        The result is stored in ``df`` as a side effect.
    """
    for failure_mode in ('TWF', 'HDF', 'PWF', 'OSF', 'RNF'):
        if df.loc[row_name, failure_mode] == 1:
            df.loc[row_name, 'type_of_failure'] = failure_mode
            # First raised flag wins; later flags are ignored.
            return

# Start every row as 'no failure'; type_of_failure() then overwrites only the rows
# that have a raised failure flag. Pre-filling replaces the old NaN back-fill, which
# relied on `np.NaN` (removed in NumPy 2.0) and on a chained
# `df[col].replace(..., inplace=True)` (does not write back under pandas
# Copy-on-Write), and it also guarantees the column exists when no row has failed.
df['type_of_failure'] = 'no failure'

# Plain loop rather than df.apply: the helper mutates df, and pandas advises against
# mutating the frame from inside a function passed to apply.
for row_name in df.index:
    type_of_failure(row_name)

# The individual flag columns are now summarised by 'type_of_failure'.
df = df.drop(columns=['TWF', 'HDF', 'PWF', 'OSF', 'RNF'])
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,no failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,no failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,no failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,no failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,no failure


In [30]:
# Remove the two identifier columns from df before further preprocessing.
identifier_cols = ['UDI', 'Product ID']
df.drop(columns=identifier_cols, inplace=True)
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure
0,M,298.1,308.6,1551,42.8,0,0,no failure
1,L,298.2,308.7,1408,46.3,3,0,no failure
2,L,298.1,308.5,1498,49.4,5,0,no failure
3,L,298.2,308.6,1433,39.5,7,0,no failure
4,L,298.2,308.7,1408,40.0,9,0,no failure


### converting Kelvin to Celsius

In [31]:
# Map each Kelvin column to the Celsius column that replaces it (C = K - 273.15).
kelvin_to_celsius = {
    'Air temperature [K]': 'Air temperature [c]',
    'Process temperature [K]': 'Process temperature [c]',
}
for kelvin_col, celsius_col in kelvin_to_celsius.items():
    df[celsius_col] = df[kelvin_col].sub(273.15)

# The Kelvin originals are redundant once the Celsius columns exist.
df.drop(columns=list(kelvin_to_celsius), inplace=True)
df.head()

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure,Air temperature [c],Process temperature [c]
0,M,1551,42.8,0,0,no failure,24.95,35.45
1,L,1408,46.3,3,0,no failure,25.05,35.55
2,L,1498,49.4,5,0,no failure,24.95,35.35
3,L,1433,39.5,7,0,no failure,25.05,35.45
4,L,1408,40.0,9,0,no failure,25.05,35.55


## Categorical Encoding

### Ordinal Encoding

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order, so the codes are L -> 0, M -> 1, H -> 2.
type_order = ['L', 'M', 'H']
encoder = OrdinalEncoder(categories=[type_order])

# The encoder expects 2-D input, hence the double brackets around 'Type'.
type_codes = encoder.fit_transform(df[['Type']])
df['Type'] = type_codes
df.head()

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure,Air temperature [c],Process temperature [c]
0,1.0,1551,42.8,0,0,no failure,24.95,35.45
1,0.0,1408,46.3,3,0,no failure,25.05,35.55
2,0.0,1498,49.4,5,0,no failure,24.95,35.35
3,0.0,1433,39.5,7,0,no failure,25.05,35.45
4,0.0,1408,40.0,9,0,no failure,25.05,35.55


In [33]:
# Count rows per encoded 'Type' value (0.0 = 'L', 1.0 = 'M', 2.0 = 'H', per the encoder's category order).
df['Type'].value_counts()

Type
0.0    6000
1.0    2997
2.0    1003
Name: count, dtype: int64

### Label Encoding

In [34]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it to the same column.
encoder = LabelEncoder()
encoder.fit(df['type_of_failure'])

failure_codes = encoder.transform(df['type_of_failure'])
df['type_of_failure'] = failure_codes
df.head()

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure,Air temperature [c],Process temperature [c]
0,1.0,1551,42.8,0,0,5,24.95,35.45
1,0.0,1408,46.3,3,0,5,25.05,35.55
2,0.0,1498,49.4,5,0,5,24.95,35.35
3,0.0,1433,39.5,7,0,5,25.05,35.45
4,0.0,1408,40.0,9,0,5,25.05,35.55


In [35]:
# Show only the rows flagged as a machine failure.
df[df['Machine failure'].eq(1)]

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,type_of_failure,Air temperature [c],Process temperature [c]
50,0.0,2861,4.6,143,1,2,25.75,35.95
69,0.0,1410,65.7,191,1,2,25.75,35.85
77,0.0,1455,41.3,208,1,4,25.65,35.75
160,0.0,1282,60.7,216,1,1,25.25,35.05
161,0.0,1412,52.3,218,1,1,25.15,34.95
...,...,...,...,...,...,...,...,...
9758,0.0,2271,16.2,218,1,4,25.45,36.65
9764,0.0,1294,66.7,12,1,2,25.35,36.35
9822,0.0,1360,60.9,187,1,1,25.35,36.25
9830,0.0,1337,56.1,206,1,1,25.15,36.15


In [36]:
# Spot-check the row at position 69, one of the 'Machine failure' == 1 rows, to see its encoded values.
df.iloc[69]

Type                          0.00
Rotational speed [rpm]     1410.00
Torque [Nm]                  65.70
Tool wear [min]             191.00
Machine failure               1.00
type_of_failure               2.00
Air temperature [c]          25.75
Process temperature [c]      35.85
Name: 69, dtype: float64

In [45]:
# classes_ holds the original labels in encoded order: position i is the label for integer code i.
encoder.classes_

array(['HDF', 'OSF', 'PWF', 'RNF', 'TWF', 'no failure'], dtype=object)

In [37]:
# Decode every integer code (0 through 5) back to its failure-type label.
classes = list(range(6))
encoder.inverse_transform(classes)

array(['HDF', 'OSF', 'PWF', 'RNF', 'TWF', 'no failure'], dtype=object)

In [38]:
# Row count per encoded failure type, to see how imbalanced the classes are.
df['type_of_failure'].value_counts()

type_of_failure
5    9652
0     115
2      91
1      78
4      46
3      18
Name: count, dtype: int64

In [39]:
# Inspect the row at position 6000 — presumably a spot-check of a row with 'Type' code 2.0 ('H').
df.iloc[6000]

Type                          2.00
Rotational speed [rpm]     1431.00
Torque [Nm]                  43.20
Tool wear [min]               0.00
Machine failure               0.00
type_of_failure               5.00
Air temperature [c]          27.25
Process temperature [c]      37.05
Name: 6000, dtype: float64

## Feature Scaling

In [40]:
from sklearn.preprocessing import MinMaxScaler

# Continuous features to rescale; the encoded 'Type', 'Machine failure' and
# 'type_of_failure' columns are passed through unchanged.
scale_cols = ['Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Air temperature [c]', 'Process temperature [c]']

scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on the full dataset. If a train/test split
# happens downstream, fit it on the training split only to avoid leakage — confirm.
# Carry over df's index so the concat below aligns row-for-row even when df does
# not have a default 0..n-1 index (e.g. after rows were dropped); a fresh
# RangeIndex would silently misalign and introduce NaNs.
scaled_values = pd.DataFrame(
    scaler.fit_transform(df[scale_cols]),
    columns=scale_cols,
    index=df.index,
)

# Non-mutating drop: df keeps its unscaled columns, so this cell can be re-run.
df_scaled = pd.concat([df.drop(columns=scale_cols), scaled_values], axis=1)
df_scaled.head()

Unnamed: 0,Type,Machine failure,type_of_failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c]
0,1.0,0,5,0.222934,0.535714,0.0,0.304348,0.358025
1,0.0,0,5,0.139697,0.583791,0.011858,0.315217,0.37037
2,0.0,0,5,0.192084,0.626374,0.019763,0.304348,0.345679
3,0.0,0,5,0.154249,0.490385,0.027668,0.315217,0.358025
4,0.0,0,5,0.139697,0.497253,0.035573,0.315217,0.37037


In [41]:
# Spot-check the row at position 69 again, now with the continuous features scaled.
df_scaled.iloc[69]

Type                       0.000000
Machine failure            1.000000
type_of_failure            2.000000
Rotational speed [rpm]     0.140861
Torque [Nm]                0.850275
Tool wear [min]            0.754941
Air temperature [c]        0.391304
Process temperature [c]    0.407407
Name: 69, dtype: float64

## Oversampling

In [42]:
from imblearn.over_sampling import SMOTENC

# Fixed seed so the synthetic rows (and therefore the saved dataset) are reproducible.
RANDOM_STATE = 42

X = df_scaled.drop(columns='type_of_failure')
y = df_scaled['type_of_failure']

# 'Type' and 'Machine failure' are discrete codes. Plain SMOTE interpolates every
# column, which would produce meaningless fractional values for them, so they are
# declared categorical and SMOTENC keeps them on their original values.
categorical_cols = ['Type', 'Machine failure']
categorical_idx = [X.columns.get_loc(col) for col in categorical_cols]

smote = SMOTENC(
    categorical_features=categorical_idx,
    sampling_strategy='auto',
    random_state=RANDOM_STATE,
)

# NOTE(review): oversampling happens before any train/test split. If this file is
# split later, synthetic neighbours of test rows can end up in training — confirm
# that resampling is meant to be applied to the whole dataset.
X_resampled, y_resampled = smote.fit_resample(X, y)

df_sampled = pd.concat([X_resampled, y_resampled], axis=1)
df_sampled.head()

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,1.0,0,0.222934,0.535714,0.0,0.304348,0.358025,5
1,0.0,0,0.139697,0.583791,0.011858,0.315217,0.37037,5
2,0.0,0,0.192084,0.626374,0.019763,0.304348,0.345679,5
3,0.0,0,0.154249,0.490385,0.027668,0.315217,0.358025,5
4,0.0,0,0.139697,0.497253,0.035573,0.315217,0.37037,5


In [43]:
# Row count per failure-type class after oversampling, to confirm the classes are now balanced.
df_sampled.value_counts('type_of_failure')

type_of_failure
0    9652
1    9652
2    9652
3    9652
4    9652
5    9652
Name: count, dtype: int64

In [44]:
# Persist the resampled dataset as CSV, without writing the index column.
df_sampled.to_csv(path_or_buf='../data/processed/data_processed.csv', index=False)