In [73]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset

In [2]:
df_raw = pd.read_csv(r"C:\Users\mo13\OneDrive\Documents\ML course\project\capstone data.csv")

In [3]:
df_raw.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,M,298.1,308.6,1551,42.8,0,0,,,,,,,,,
1,L,298.2,308.7,1408,46.3,3,0,,,,,,,,,
2,L,298.1,308.5,1498,49.4,5,0,,,,,,,,,
3,L,298.2,308.6,1433,39.5,7,0,,,,,,,,,
4,L,298.2,308.7,1408,40.0,9,0,,,,,,,,,


Because we are using Lightgbm later, we need to replace the brackets "[]" in the features names with "()", because Lightgbm does not handle Json characters

In [4]:
df_raw.columns = df_raw.columns.str.replace(r'\[', '(', regex=True).str.replace(r'\]', ')', regex=True)

We also need to replace "L", "M" and "H" with numbers so that we can fit the dataframe to the model. we will set the following:   
"L" as 1, "M" as 2, "H" as 3

In [5]:
df_raw['Type'].unique()

array(['M', 'L', 'H'], dtype=object)

In [6]:
df_raw['Type'] = df_raw['Type'].replace({'L':1, 'M':2, 'H':3})

Now let's get our dataframe ready!

In [7]:
df = df_raw.drop(['Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9','Unnamed: 10','Unnamed: 11','Unnamed: 12', 'Unnamed: 13',
                 'Unnamed: 14', 'Unnamed: 15'], axis=1)


In [8]:
df.head()

Unnamed: 0,Type,Air temperature (K),Process temperature (K),Rotational speed (rpm),Torque (Nm),Tool wear (min),Machine failure
0,2,298.1,308.6,1551,42.8,0,0
1,1,298.2,308.7,1408,46.3,3,0
2,1,298.1,308.5,1498,49.4,5,0
3,1,298.2,308.6,1433,39.5,7,0
4,1,298.2,308.7,1408,40.0,9,0


In [9]:
df.loc[df['Machine failure'] ==1]

Unnamed: 0,Type,Air temperature (K),Process temperature (K),Rotational speed (rpm),Torque (Nm),Tool wear (min),Machine failure
50,1,298.9,309.1,2861,4.6,143,1
69,1,298.9,309.0,1410,65.7,191,1
77,1,298.8,308.9,1455,41.3,208,1
160,1,298.4,308.2,1282,60.7,216,1
161,1,298.3,308.1,1412,52.3,218,1
...,...,...,...,...,...,...,...
9758,1,298.6,309.8,2271,16.2,218,1
9764,1,298.5,309.5,1294,66.7,12,1
9822,1,298.5,309.4,1360,60.9,187,1
9830,1,298.3,309.3,1337,56.1,206,1


Since we have 339 "1"s out of 10000 samples, the data is imbalanced.

Now let's separate the inputs and the output

In [10]:
x = df.iloc[:,:-1]
y = df['Machine failure']

# Dividing the data into Training, Validation and Test sets

We'll divide the data in the following proportions:

Training set: 60% of the data,
Validation set: 20% of the data,
Test set: 20% of the data

In [33]:
def split(x,y):
    trainsize = int(len(df)*0.6)
    trainplusvalsize = int(len(df)*0.2)
    
    x_train=x[:trainsize]
    x_val=x[trainsize:trainsize + trainplusvalsize]
    x_test=x[trainsize + trainplusvalsize:]
    
    y_train=y[:trainsize]
    y_val=y[trainsize:trainsize + trainplusvalsize]
    y_test=y[trainsize + trainplusvalsize:]
    return x_train, x_val, x_test, y_train, y_val,  y_test

In [34]:
x_train, x_val, x_test, y_train, y_val,  y_test = split(x,y)

# Model 1: using Lightgbm 

Since the data is imbalanced, we will apply Lightgbm() classifier which gives more weight to the minority class.

In [12]:
lgb_model = lgb.LGBMClassifier(class_weight='balanced')  
# the "class_weight" = 'balanced'" is to adjust class weights meaning that: Classes with fewer samples will receive higher weights.
# and classes with more samples will receive lower weights.

lgb_model.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 255, number of negative: 5745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 928
[LightGBM] [Info] Number of data points in the train set: 6000, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [13]:
pr = lgb_model.predict(x_train)

the function "get_report(real, predicted)" below calculates the precision, recall and f1-score for each class

In [14]:
def get_report(real, predicted):      
    return pd.DataFrame(classification_report(real, predicted, output_dict=True)).transpose().drop(index=['macro avg', 'weighted avg'])

In [15]:
get_report(y_train, pr)

Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.999478,0.999739,5745.0
1,0.988372,1.0,0.994152,255.0
accuracy,0.9995,0.9995,0.9995,0.9995


In [16]:
predicted_val = lgb_model.predict(x_val)
get_report(y_val, predicted_val)

Unnamed: 0,precision,recall,f1-score,support
0,0.990816,0.99335,0.992082,1955.0
1,0.675,0.6,0.635294,45.0
accuracy,0.9845,0.9845,0.9845,0.9845


In [17]:
predicted_test = lgb_model.predict(x_test)
get_report(y_test, predicted_test)

Unnamed: 0,precision,recall,f1-score,support
0,0.993881,0.993881,0.993881,1961.0
1,0.692308,0.692308,0.692308,39.0
accuracy,0.988,0.988,0.988,0.988


the model on test set shows an acceptable f1-score (even better than f1-score on validation set) for an unbalanced data like this one, which is close to 70% f1-score

# Model 2: Using Decision Trees Classifier

First, since we have imbalanced data, we will apply oversampling to the minority class using SMOTE()

In [23]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)   

# x_resampled is resampled x_train
# y_resampled is resampled y_train

Now, let's build the decision trees model

In [26]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_resampled, y_resampled)

predicted_val_dt = dt_model.predict(x_val)    # applying the model to the validation set
get_report(y_val, predicted_val_dt)

Unnamed: 0,precision,recall,f1-score,support
0,0.99085,0.941688,0.965644,1955.0
1,0.197183,0.622222,0.299465,45.0
accuracy,0.9345,0.9345,0.9345,0.9345


the model is applied to the validation set, and based on its classification report above, the F1 score is low (with acceptable recall but very low precision)

# Model 3: Using Random Forest Classifier

First, we will try to build the random forest model using the oversampled data (above)

In [28]:
rf_model = RandomForestClassifier()
rf_model.fit(x_resampled, y_resampled)

predicted_val_rf = rf_model.predict(x_val) # applying the model to the validation set
get_report(y_val, predicted_val_rf)

Unnamed: 0,precision,recall,f1-score,support
0,0.995722,0.95243,0.973595,1955.0
1,0.284615,0.822222,0.422857,45.0
accuracy,0.9495,0.9495,0.9495,0.9495


Now, let's try building the model with the original data

In [29]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
predicted_val_rf = rf_model.predict(x_val) # applying the model to the validation set
get_report(y_val, predicted_val_rf)

Unnamed: 0,precision,recall,f1-score,support
0,0.985844,0.997442,0.991609,1955.0
1,0.772727,0.377778,0.507463,45.0
accuracy,0.9835,0.9835,0.9835,0.9835


when building the model using the oversampled data, we get a good recall for the minority class but low precision
However, when building the model using the original data, we get a good precision but very low recall. Thus, we get a better F1 score for the model when using the original data rather than oversampled data, but still low (50.7%) 

# Model 4: Neural Networks

Since NN could be sensitive to imbalanced data, we will build it using the oversampled data

Now, we will use RobustScaler() as our standerdized choice because it could be better for imbalanced data

In [94]:
robust = RobustScaler()
X_scaled = robust.fit_transform(x_resampled)

In [95]:
xs_train, xs_val, xs_test, ys_train, ys_val, ys_test = split(X_scaled,y_resampled)   

# xs_train is scaled x_resampled (which is resampled x_train) 
# xs_val is scaled x_val
# xs_test is scaled x_test

let's convert the data above to tensor

In [96]:
x_train_tensor = torch.FloatTensor(xs_train)
y_train_tensor = torch.FloatTensor(ys_train.to_numpy()).view(-1,1)  # Reshape for binary output

x_val_tensor = torch.FloatTensor(xs_val)
y_val_tensor = torch.FloatTensor(ys_val.to_numpy()).view(-1,1)

x_test_tensor = torch.FloatTensor(xs_test)
y_test_tensor = torch.FloatTensor(ys_test.to_numpy()).view(-1,1)

In [97]:
numeber_of_features = x_train_tensor.shape[1]

Let's build the NN

In [104]:
class Net(nn.Module):
    def __init__(self, n_classes=100):
        super(Net, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(numeber_of_features, 180),
            nn.ReLU(),
            nn.Linear(180, 60),
        
            nn.Linear(60, 1),
            nn.Sigmoid(),
        )


    def forward(self, x):
         return self.model(x)

nn_model = Net()

The optimizer used below is BCEWithLogitsLoss() becuase it is suitable for binary outputs

In [105]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(nn_model.parameters(), lr=0.001, momentum=0.9)

Now, we will train the NN

In [106]:
n_epochs = 10

for epoch in range(n_epochs):
    train_loss = 0.0
    
    optimizer.zero_grad()
    output = nn_model(x_train_tensor)
            
    loss = criterion(output, y_train_tensor)
    loss.backward()
            
    optimizer.step()
            
    train_loss += loss.item()*x_train_tensor.size(0)
            
    train_loss = train_loss/len(TensorDataset(x_train_tensor, y_train_tensor))  
    print(f'Epoch {epoch + 1}/{n_epochs}, Loss: {train_loss:.4f}')

Epoch 1/10, Loss: 0.9425
Epoch 2/10, Loss: 0.9423
Epoch 3/10, Loss: 0.9421
Epoch 4/10, Loss: 0.9417
Epoch 5/10, Loss: 0.9412
Epoch 6/10, Loss: 0.9407
Epoch 7/10, Loss: 0.9400
Epoch 8/10, Loss: 0.9393
Epoch 9/10, Loss: 0.9385
Epoch 10/10, Loss: 0.9377


Let's apply our model to the validation set

In [107]:
nn_model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    y_pred_prob = nn_model(x_val_tensor)
    y_pred_nn = (y_pred_prob > 0.5).float()
y_pred_nn

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [0.],
        [1.]])

In [108]:
y_val_numpy = y_val_tensor.numpy()  # Convert it to numpy array to get the classification report
y_pred_numpy = y_pred_nn.numpy()    # Convert it to numpy array to get the classification report

In [109]:
get_report(y_test_numpy, y_pred_numpy)

Unnamed: 0,precision,recall,f1-score,support
0.0,0.982456,0.342682,0.508129,1961.0
1.0,0.020517,0.692308,0.039852,39.0
accuracy,0.3495,0.3495,0.3495,0.3495


the recall of this model is acceptable but the precision is really poor, so the F1 score is really poor for this model

# Discussion

we trained four models on our imbalanced data, and applied the models on the validation set. We found out that the best model in this case is Lightgbm since it gave F1 score of 63.5% compared to the other models which were poor

We then tested the lightgbm model on the test set and we got even a better F1 score which was 69%

Therefore, I will choose model 1 (i.e. Lightgbm model)