# 1. Loading the csv file

In [1]:
!unzip /content/train.csv.zip -d ./

Archive:  /content/train.csv.zip
  inflating: ./train.csv             


# 2. Removing the columns "id" and "Product ID"

In [2]:
import pandas as pd

# Load the training CSV and discard the two identifier columns in one step.
train_df = pd.read_csv('/content/train.csv').drop(columns=['id', 'Product ID'])
train_df

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
136424,M,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,H,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,L,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,L,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


# 3. Defining the target variable as "machine failure"

In [3]:

# Split the rows by the value of the target column and report the size of each group.
failure_flag = train_df['Machine failure']

machine_failure_0 = train_df[failure_flag == 0]
machine_failure_1 = train_df[failure_flag == 1]

print("Machine failure group '0' : %s " % len(machine_failure_0))
print("Machine failure group '1' : %s " % len(machine_failure_1))

Machine failure group '0' : 134281 
Machine failure group '1' : 2148 


# 4. Converting the categorical variable into a numerical variable


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Replace the string-valued 'Type' column with integer codes.
# LabelEncoder numbers the classes in sorted order, which for this data
# gives H -> 0, L -> 1, M -> 2.
label_encoder = LabelEncoder().fit(train_df['Type'])
train_df['Type'] = label_encoder.transform(train_df['Type'])

train_df

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
136424,2,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,0,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,1,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,1,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


# 5. Normalize the data

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column to the [0, 1] range and rebuild a DataFrame that
# carries the original column names.
# NOTE(review): the scaler is fitted on the full dataset (target column
# included) before the train/test split in section 6 — confirm this is intended.
s = MinMaxScaler()
scaled_data = s.fit(train_df).transform(train_df)
scaled_df = pd.DataFrame(data=scaled_data, columns=train_df.columns)
scaled_df


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0.5,0.582418,0.4750,0.243402,0.443681,0.553360,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.802198,0.7875,0.339003,0.347527,0.790514,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,0.439560,0.3375,0.365982,0.311813,0.098814,0.0,0.0,0.0,0.0,0.0,0.0
3,0.5,0.626374,0.6375,0.201173,0.556319,0.778656,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.296703,0.4000,0.269795,0.434066,0.134387,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
136424,1.0,0.527473,0.7000,0.204692,0.462912,0.830040,0.0,0.0,0.0,0.0,0.0,0.0
136425,0.0,0.241758,0.3375,0.156012,0.622253,0.007905,0.0,0.0,0.0,0.0,0.0,0.0
136426,0.5,0.571429,0.7500,0.201173,0.476648,0.845850,0.0,0.0,0.0,0.0,0.0,0.0
136427,0.5,0.703297,0.6375,0.156012,0.583791,0.166008,0.0,0.0,0.0,0.0,0.0,0.0


# 6. Split the data into Train & Test

In [30]:
from sklearn.model_selection import train_test_split
# Separate the target column from the features, then hold out 20% of the
# rows for testing (fixed seed so the split is repeatable).
y = scaled_df['Machine failure']
x = scaled_df.drop(columns=['Machine failure'])

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Training the logistic regression model

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Collected metric rows; later sections keep appending to this list.
results = []

# Logistic regression on every feature column.
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)

print("coef : " , logreg.coef_)
print("intercept :", logreg.intercept_[0])

y_train_pred = logreg.predict(xtrain)
y_test_pred = logreg.predict(xtest)

# One row per split: accuracy, precision and recall for class 1, plus the
# confusion matrix.
for split_name, y_true, y_pred in (
    ("Train", ytrain, y_train_pred),
    ("Test", ytest, y_test_pred),
):
    results.append((
        "LogisticRegression", split_name, "All Features",
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, pos_label=1, zero_division=0),
        recall_score(y_true, y_pred, pos_label=1, zero_division=0),
        confusion_matrix(y_true, y_pred),
    ))

result_columns = ["Model", "Train/Test", "Configuration", "Accuracy", "Precision", "Recall","Confusion Matrix"]
results_df = pd.DataFrame(results, columns=result_columns)
display(results_df)


coef :  [[-0.42724932  2.6515951  -1.54361874  1.91234209  5.99637625  1.25467066
   8.80666     8.08065176  9.17788897  7.74249689 -0.01684327]]
intercept : -9.936131198261497


Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,LogisticRegression,Train,All Features,0.996188,0.99239,0.762573,"[[107423, 10], [406, 1304]]"
1,LogisticRegression,Test,All Features,0.996152,0.99115,0.767123,"[[26845, 3], [102, 336]]"


# 8. Feature engineering, preprocessing, split and classification report



In [38]:
# Remove the five binary columns TWF, HDF, PWF, OSF and RNF, then repeat the
# logistic-regression experiment on the remaining features.
binary_columns = ['TWF', 'HDF','PWF', 'OSF', 'RNF']
df_8 = scaled_df.drop(columns=binary_columns)

y1 = df_8['Machine failure']
x1 = df_8.drop(columns=['Machine failure'])

xtrain8, xtest8, ytrain8, ytest8 = train_test_split(
    x1,
    y1,
    test_size=0.2,
    random_state=42,
)

logreg_8 = LogisticRegression()
logreg_8.fit(xtrain8, ytrain8)

print("coef : " , logreg_8.coef_)
print("intercept :", logreg_8.intercept_[0])

y_train_pred8 = logreg_8.predict(xtrain8)
y_test_pred8 = logreg_8.predict(xtest8)

# One row per split, appended to the shared `results` list from section 7.
for split_name, y_true, y_pred in (
    ("Train", ytrain8, y_train_pred8),
    ("Test", ytest8, y_test_pred8),
):
    results.append((
        "LogisticRegression", split_name, "Without Binary Vars",
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, pos_label=1, zero_division=0),
        recall_score(y_true, y_pred, pos_label=1, zero_division=0),
        confusion_matrix(y_true, y_pred),
    ))

result_columns = ["Model", "Train/Test", "Configuration", "Accuracy", "Precision", "Recall","Confusion Matrix"]
results_df = pd.DataFrame(results, columns=result_columns)
display(results_df)


coef :  [[-0.21341729  5.65686931 -4.23340376  4.87281891 11.28798545  1.94640648]]
intercept : -12.978196185960767


Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,LogisticRegression,Train,All Features,0.996188,0.99239,0.762573,"[[107423, 10], [406, 1304]]"
1,LogisticRegression,Test,All Features,0.996152,0.99115,0.767123,"[[26845, 3], [102, 336]]"
2,LogisticRegression,Train,Without Binary Vars,0.984617,0.77193,0.025731,"[[107420, 13], [1666, 44]]"
3,LogisticRegression,Test,Without Binary Vars,0.984241,0.833333,0.022831,"[[26846, 2], [428, 10]]"


# 9. Using additional models to improve the results: KNN, Decision Tree

In [39]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


def evaluate_model(results, model, x_train, y_train, x_test, y_test, description):
    """Fit ``model`` and append its train and test metrics to ``results``.

    The metrics are computed against the ``y_train`` / ``y_test`` arguments
    passed to this call, so the function gives correct numbers for any split
    and does not depend on notebook-level variables.

    Args:
        results: List that receives two tuples (train row first, then test
            row). Each tuple is (model class name, "Train" or "Test",
            description, accuracy, precision, recall, confusion matrix).
        model: Estimator exposing ``fit(x, y)`` and ``predict(x)``. It is
            fitted in place on the training data.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        description: Configuration label stored in each row.

    Returns:
        The same ``results`` list, after the two rows have been appended.
    """
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    model_name = model.__class__.__name__
    splits = (
        ("Train", y_train, y_train_pred),
        ("Test", y_test, y_test_pred),
    )
    for split_name, y_true, y_pred in splits:
        # Precision and recall are reported for the positive class (label 1);
        # zero_division=0 yields 0 when that class is never predicted or present.
        results.append((model_name, split_name, description,
           accuracy_score(y_true, y_pred),
           precision_score(y_true, y_pred, pos_label=1, zero_division=0),
           recall_score(y_true, y_pred, pos_label=1, zero_division=0),
           confusion_matrix(y_true, y_pred)))

    return results


# k-nearest-neighbours for several neighbourhood sizes, on the features
# without the binary columns.
for neighbors in [3, 5, 11, 13]:
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    evaluate_model(results, knn, xtrain8, ytrain8, xtest8, ytest8, f"N={neighbors}")

# Decision trees for several depth limits (None = no limit).
# random_state is fixed so that re-running the notebook rebuilds the same
# trees; the seed matches the one used for train_test_split.
for depth in [3, 5, 11, None]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    evaluate_model(results, dt, xtrain8, ytrain8, xtest8, ytest8, f"depth={depth}")

results_df = pd.DataFrame(results, columns=["Model", "Train/Test", "Configuration", "Accuracy", "Precision", "Recall","Confusion Matrix"])
display(results_df)

Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,LogisticRegression,Train,All Features,0.996188,0.99239,0.762573,"[[107423, 10], [406, 1304]]"
1,LogisticRegression,Test,All Features,0.996152,0.99115,0.767123,"[[26845, 3], [102, 336]]"
2,LogisticRegression,Train,Without Binary Vars,0.984617,0.77193,0.025731,"[[107420, 13], [1666, 44]]"
3,LogisticRegression,Test,Without Binary Vars,0.984241,0.833333,0.022831,"[[26846, 2], [428, 10]]"
4,KNeighborsClassifier,Train,N=3,0.989051,0.785161,0.41462,"[[107239, 194], [1001, 709]]"
5,KNeighborsClassifier,Test,N=3,0.983984,0.502646,0.216895,"[[26754, 94], [343, 95]]"
6,KNeighborsClassifier,Train,N=5,0.988062,0.786217,0.326901,"[[107281, 152], [1151, 559]]"
7,KNeighborsClassifier,Test,N=5,0.984717,0.573427,0.187215,"[[26787, 61], [356, 82]]"
8,KNeighborsClassifier,Train,N=11,0.986147,0.733491,0.181871,"[[107320, 113], [1399, 311]]"
9,KNeighborsClassifier,Test,N=11,0.985084,0.659794,0.146119,"[[26815, 33], [374, 64]]"


# 10. The final results of all the models from section 8 and 9

In [40]:
# Exclude the rows whose configuration is "All Features" and show the rest.
is_all_features = results_df["Configuration"] == "All Features"
results_df = results_df[~is_all_features]
display(results_df)

Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
2,LogisticRegression,Train,Without Binary Vars,0.984617,0.77193,0.025731,"[[107420, 13], [1666, 44]]"
3,LogisticRegression,Test,Without Binary Vars,0.984241,0.833333,0.022831,"[[26846, 2], [428, 10]]"
4,KNeighborsClassifier,Train,N=3,0.989051,0.785161,0.41462,"[[107239, 194], [1001, 709]]"
5,KNeighborsClassifier,Test,N=3,0.983984,0.502646,0.216895,"[[26754, 94], [343, 95]]"
6,KNeighborsClassifier,Train,N=5,0.988062,0.786217,0.326901,"[[107281, 152], [1151, 559]]"
7,KNeighborsClassifier,Test,N=5,0.984717,0.573427,0.187215,"[[26787, 61], [356, 82]]"
8,KNeighborsClassifier,Train,N=11,0.986147,0.733491,0.181871,"[[107320, 113], [1399, 311]]"
9,KNeighborsClassifier,Test,N=11,0.985084,0.659794,0.146119,"[[26815, 33], [374, 64]]"
10,KNeighborsClassifier,Train,N=13,0.986064,0.756098,0.163158,"[[107343, 90], [1431, 279]]"
11,KNeighborsClassifier,Test,N=13,0.985304,0.698925,0.148402,"[[26820, 28], [373, 65]]"


# 11. Top performing models

In [41]:
# Rank the test-set rows by recall (highest first) and keep the top three.
is_test_row = results_df["Train/Test"] == "Test"
best_df = (
    results_df[is_test_row]
    .sort_values(by="Recall", ascending=False)
    .head(3)
    .reset_index(drop=True)
)
best_df

Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,DecisionTreeClassifier,Test,depth=None,0.978121,0.33049,0.353881,"[[26534, 314], [283, 155]]"
1,DecisionTreeClassifier,Test,depth=11,0.984937,0.548043,0.351598,"[[26721, 127], [284, 154]]"
2,DecisionTreeClassifier,Test,depth=5,0.985524,0.613757,0.26484,"[[26775, 73], [322, 116]]"


# 12. Removing the Most Significant Coefficient and Reapplying the Top-Performing Model

In [42]:
import numpy as np


print("coef : " , logreg_8.coef_)

# Rank the section-8 features by the absolute value of their
# logistic-regression coefficient; the first row is the feature to remove.
coefficients = pd.DataFrame({
    'Feature': x1.columns,
    'Coefficient': np.abs(logreg_8.coef_[0])
})

coefficients = coefficients.sort_values(by='Coefficient', ascending=False).reset_index(drop=True)
most_import = coefficients.Feature[0]

df_12 = df_8.drop(columns=[most_import])

x2 = df_12.drop(['Machine failure'], axis=1)
y2 = df_12['Machine failure']

xtrain12, xtest12, ytrain12, ytest12 = train_test_split(x2, y2, test_size=0.2, random_state=42)

# best models

results_of_the_best_models = []

# Same depth settings as the three best test-recall models from section 11.
# random_state is fixed so the trees are reproducible across runs.
num1 = DecisionTreeClassifier(max_depth=None, random_state=42)
num2 = DecisionTreeClassifier(max_depth=11, random_state=42)
num3 = DecisionTreeClassifier(max_depth=5, random_state=42)

# A list (not a set) keeps the evaluation order deterministic.
for model in [num1, num2, num3]:
    evaluate_model(results_of_the_best_models, model, xtrain12, ytrain12, xtest12, ytest12, f"depth={model.max_depth}")

# Fit a fresh logistic regression instead of refitting logreg_8 in place.
# logreg_8 must keep its section-8 coefficients: they are printed and ranked
# at the top of this cell, and overwriting them would make a re-run fail
# (coefficient count would no longer match x1.columns).
logreg_12 = LogisticRegression()
evaluate_model(results_of_the_best_models, logreg_12, xtrain12, ytrain12, xtest12, ytest12, "Without Binary Vars")


print("Before removing the most importent parameter : ")
display(best_df)


print("After removing the most importent parameter : ")
results_of_the_best_models_df = pd.DataFrame(results_of_the_best_models, columns=["Model", "Train/Test", "Configuration", "Accuracy", "Precision", "Recall","Confusion Matrix"])


# Test rows only, best recall first.
df_filtered = results_of_the_best_models_df[results_of_the_best_models_df["Train/Test"] == "Test"].sort_values(by="Recall", ascending=False).reset_index(drop=True)
display(df_filtered)



coef :  [[-0.21341729  5.65686931 -4.23340376  4.87281891 11.28798545  1.94640648]]
Before removing the most importent parameter : 


Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,DecisionTreeClassifier,Test,depth=None,0.978121,0.33049,0.353881,"[[26534, 314], [283, 155]]"
1,DecisionTreeClassifier,Test,depth=11,0.984937,0.548043,0.351598,"[[26721, 127], [284, 154]]"
2,DecisionTreeClassifier,Test,depth=5,0.985524,0.613757,0.26484,"[[26775, 73], [322, 116]]"


After removing the most importent parameter : 


Unnamed: 0,Model,Train/Test,Configuration,Accuracy,Precision,Recall,Confusion Matrix
0,DecisionTreeClassifier,Test,depth=None,0.976691,0.290254,0.312785,"[[26513, 335], [301, 137]]"
1,DecisionTreeClassifier,Test,depth=11,0.983948,0.5,0.262557,"[[26733, 115], [323, 115]]"
2,DecisionTreeClassifier,Test,depth=5,0.984534,0.542105,0.23516,"[[26761, 87], [335, 103]]"
3,LogisticRegression,Test,Without Binary Vars,0.983948,0.0,0.0,"[[26848, 0], [438, 0]]"
