In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [17]:
df_test.drop('id', axis=1, inplace=True)
df_train.drop('id', axis=1, inplace=True)
df_test.drop('Product ID', axis=1, inplace=True)
df_train.drop('Product ID', axis=1, inplace=True)

In [18]:
df_train['Type'].replace(to_replace='L', value=0, inplace=True)
df_train['Type'].replace(to_replace='M', value=1, inplace=True)
df_train['Type'].replace(to_replace='H', value=2, inplace=True)
df_test['Type'].replace(to_replace='L', value=0, inplace=True)
df_test['Type'].replace(to_replace='M', value=1, inplace=True)
df_test['Type'].replace(to_replace='H', value=2, inplace=True)

In [19]:
df_train

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,0,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,0,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,1,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
136424,1,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,2,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,0,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,0,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [23]:
X_train = df_train.drop('Machine failure', axis=1)
y_train = df_train['Machine failure']

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [27]:
model = RandomForestClassifier()
scaler = StandardScaler()

In [28]:
pipe = Pipeline([('scaler', scaler), ('rfc', model)])

In [29]:
parameters = {
    'rfc__n_estimators' : list(range(70,74)),
    'rfc__criterion' : ["gini", "entropy", "log_loss"],
    'rfc__max_depth' : [2,3,4],
    'rfc__max_features' : ["sqrt", "log2", None],
}

In [30]:
full_model = GridSearchCV(pipe, param_grid=parameters, cv=3, verbose=2, scoring='accuracy')

In [31]:
full_model.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=70; total time=   1.8s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=70; total time=   2.0s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=70; total time=   2.1s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=71; total time=   2.1s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=71; total time=   2.1s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=71; total time=   2.1s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=72; total time=   2.0s
[CV] END rfc__criterion=gini, rfc__max_depth=2, rfc__max_features=sqrt, rfc__n_estimators=72; total time=   2.1s
[CV] END rfc__criterion=gini, rfc

In [32]:
full_model.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': 4,
 'rfc__max_features': None,
 'rfc__n_estimators': 71}

In [33]:
y_pred = full_model.predict(X_test)

In [34]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     40451
           1       0.75      1.00      0.86       478

    accuracy                           1.00     40929
   macro avg       0.88      1.00      0.93     40929
weighted avg       1.00      1.00      1.00     40929



In [36]:
df_test

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,0,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,0,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,0,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,1,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,1,303.4,312.3,1515,41.3,114,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
90949,0,302.3,311.4,1484,40.4,15,0,0,0,0,0
90950,0,297.9,309.8,1542,33.8,31,0,0,0,0,0
90951,0,295.6,306.2,1501,41.4,187,0,0,0,0,0
90952,0,298.1,307.8,1534,40.3,69,0,0,0,0,0


In [37]:
X_test = df_test

In [38]:
y_pred = full_model.predict(X_test)

In [39]:
sub = pd.read_csv('sample_submission.csv')

In [40]:
sub

Unnamed: 0,id,Machine failure
0,136429,0.5
1,136430,0.5
2,136431,0.5
3,136432,0.5
4,136433,0.5
...,...,...
90949,227378,0.5
90950,227379,0.5
90951,227380,0.5
90952,227381,0.5


In [41]:
sub['Machine failure'] = y_pred

In [42]:
sub

Unnamed: 0,id,Machine failure
0,136429,0
1,136430,0
2,136431,0
3,136432,0
4,136433,0
...,...,...
90949,227378,0
90950,227379,0
90951,227380,0
90952,227381,0


In [44]:
sub.to_csv('sub.csv', index_label=False, index=False)

In [45]:
import pickle

In [46]:
with open('mymodel.pkl', 'wb') as file:
    pickle.dump(full_model, file)

In [47]:
with open('mymodel.pkl', 'rb') as file:
    model = pickle.load(file)

In [48]:
model