In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from evidently import ColumnMapping
from evidently.test_suite import TestSuite
from evidently.test_preset import DataStabilityTestPreset

from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset, ClassificationPreset
import warnings
warnings.filterwarnings('ignore')
print("Hello world")

Hello world


In [2]:
# Reference data (used for training and the reference side of the reports)
# and current data (the side the reports compare against), in that order.
ref_data, cur_data = map(
    pd.read_csv,
    ("../data/processed/preprocessed.csv", "../data/processed/test.csv"),
)

In [3]:
# Class balance of the reference target, taken straight from the loaded frame.
ref_data.loc[:, 'Machine failure'].value_counts()

Machine failure
1    38617
0    19295
Name: count, dtype: int64

In [4]:
# Targets: the binary 'Machine failure' column of each frame.
ref_y = ref_data["Machine failure"]
cur_y = cur_data["Machine failure"]

# Features: everything except the two label columns.
ref_X = ref_data.drop(columns=["Machine failure", "type_of_failure"])
cur_X = cur_data.drop(columns=["Machine failure", "type_of_failure"])

In [5]:
# Hold out 20% of the reference data; the fixed seed keeps the split stable
# between runs.
ref_X_train, ref_X_test, ref_y_train, ref_y_test = train_test_split(
    ref_X,
    ref_y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Inspect the held-out reference labels produced by the train/test split.
ref_y_test

16971    1
53134    1
46497    0
9584     0
1566     0
        ..
32887    1
35872    1
4623     0
6231     0
40380    0
Name: Machine failure, Length: 11583, dtype: int64

In [7]:
# Inspect the current-data feature frame (both label columns already dropped).
cur_X

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c]
0,0.0,0.131497,0.747326,0.749753,0.621506,0.653219
1,0.0,0.254873,0.386470,0.870363,0.554348,0.664579
2,0.0,0.159291,0.550137,0.823704,0.710039,0.646151
3,2.0,0.175786,0.542582,0.241107,0.315217,0.382716
4,0.0,0.201615,0.447002,0.848406,0.873713,0.761418
...,...,...,...,...,...,...
11578,0.0,0.840343,0.112007,0.431981,0.308726,0.286695
11579,0.0,0.411326,0.267356,0.888804,0.465331,0.433503
11580,1.0,0.412889,0.268972,0.874461,0.467391,0.466927
11581,0.0,0.296275,0.310440,0.577075,0.206522,0.333333


In [8]:
# Class balance of the reference target. ref_y is the 'Machine failure' column
# of ref_data, so this repeats the count taken right after loading.
ref_y.value_counts()

Machine failure
1    38617
0    19295
Name: count, dtype: int64

In [9]:
# Seed the forest so the fitted model, and therefore both reports, are
# reproducible across kernel restarts (42 matches the train/test split seed).
rf = RandomForestClassifier(random_state=42)
# Fit on the DataFrame / Series rather than `.values` so the model records the
# feature names: every later predict / predict_proba call passes a DataFrame,
# and scikit-learn can then verify that the columns match instead of emitting
# a feature-name warning that the global warnings filter would hide.
rf.fit(ref_X_train, ref_y_train)



## Using hard class predictions (`rf.predict`)

In [10]:
# Hard class predictions for both data sets, each wrapped in a one-column
# frame so it can be concatenated next to the features and target.
ref_pred = pd.DataFrame({"Prediction": rf.predict(ref_X_test)})
cur_pred = pd.DataFrame({"Prediction": rf.predict(cur_X)})


In [11]:
# The split shuffled the original row labels; give features and target a fresh
# 0..n-1 index so they line up row by row with the prediction frame.
ref_X_test = ref_X_test.reset_index(drop=True)
ref_y_test = ref_y_test.reset_index(drop=True)

# Features | target | prediction, side by side.
ref_merged = pd.concat([ref_X_test, ref_y_test, ref_pred], axis=1)
ref_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Machine failure,Prediction
0,0.0,0.108726,0.607490,0.640992,0.752546,0.613613,1,1
1,2.0,0.122930,0.613913,0.964756,0.576635,0.461772,1,1
2,0.0,0.195818,0.573143,0.572728,0.364513,0.510378,0,0
3,0.0,0.210128,0.515110,0.067194,0.434783,0.592593,0,0
4,1.0,0.136787,0.563187,0.584980,0.293478,0.296296,0,0
...,...,...,...,...,...,...,...,...
11578,1.0,0.819950,0.122083,0.334225,0.568599,0.537018,1,1
11579,0.0,0.095867,0.851640,0.000000,0.682132,0.629166,1,1
11580,0.0,0.207800,0.506868,0.707510,0.836957,0.666667,0,0
11581,1.0,0.190338,0.508242,0.584980,0.652174,0.654321,0,0


In [12]:
# Same layout as the reference frame: positional index on every piece, then
# features | target | prediction concatenated column-wise.
cur_X = cur_X.reset_index(drop=True)
cur_y = cur_y.reset_index(drop=True)

cur_merged = pd.concat([cur_X, cur_y, cur_pred], axis=1)
cur_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Machine failure,Prediction
0,0.0,0.131497,0.747326,0.749753,0.621506,0.653219,1,1
1,0.0,0.254873,0.386470,0.870363,0.554348,0.664579,1,1
2,0.0,0.159291,0.550137,0.823704,0.710039,0.646151,1,1
3,2.0,0.175786,0.542582,0.241107,0.315217,0.382716,0,0
4,0.0,0.201615,0.447002,0.848406,0.873713,0.761418,1,1
...,...,...,...,...,...,...,...,...
11578,0.0,0.840343,0.112007,0.431981,0.308726,0.286695,1,1
11579,0.0,0.411326,0.267356,0.888804,0.465331,0.433503,1,1
11580,1.0,0.412889,0.268972,0.874461,0.467391,0.466927,1,1
11581,0.0,0.296275,0.310440,0.577075,0.206522,0.333333,0,0


In [13]:
# Column mapping for the label-based report: which column holds the true
# class, which holds the predicted class, and display names for the classes.
cm = ColumnMapping(
    target="Machine failure",
    prediction="Prediction",
    target_names=["No Failure", "Machine Failure"],
)

In [14]:
# Evidently classification report built from hard class predictions,
# comparing the reference test split with the current data.
classification_performance_report = Report(metrics=[
    ClassificationPreset(),
])
classification_performance_report.run(reference_data=ref_merged, current_data=cur_merged, column_mapping = cm)
# Saved under its own file name: the probability-based report generated later
# in this notebook writes to classification_performance_report.html, so using
# that path here would let the later cell silently overwrite this report.
classification_performance_report.save_html("../reports/classification_performance_report_labels.html")

## Using predicted class probabilities (`rf.predict_proba`)

In [15]:
# Per-class probabilities for both data sets, one column per class.
# assumes rf.classes_ is ordered [0, 1], so the first probability column is
# "no failure" and the second is "failure" — TODO confirm.
ref_probas = pd.DataFrame(
    rf.predict_proba(ref_X_test),
    columns=['No Failure', 'Machine Failure'],
)
cur_probas = pd.DataFrame(
    rf.predict_proba(cur_X),
    columns=['No Failure', 'Machine Failure'],
)

In [None]:
# This cell used to be an exact, never-executed copy of the previous one and
# recomputed both probability frames for no benefit. It now validates the
# frames that already exist before they are merged with the features.
for probas, features in ((ref_probas, ref_X_test), (cur_probas, cur_X)):
    assert list(probas.columns) == ['No Failure', 'Machine Failure']
    # One probability row per sample, otherwise the column-wise concat below
    # would introduce NaN rows.
    assert len(probas) == len(features)
    # The two class probabilities of every row must add up to 1.
    assert (probas.sum(axis=1) - 1).abs().lt(1e-6).all()

In [16]:
# Inspect the per-class probabilities predicted for the reference test split.
ref_probas

Unnamed: 0,No Failure,Machine Failure
0,0.00,1.00
1,0.05,0.95
2,1.00,0.00
3,1.00,0.00
4,1.00,0.00
...,...,...
11578,0.00,1.00
11579,0.01,0.99
11580,0.99,0.01
11581,1.00,0.00


In [17]:
# Count how often each distinct (No Failure, Machine Failure) probability pair
# occurs in the reference predictions.
ref_probas.value_counts()

No Failure  Machine Failure
0.00        1.00               5995
1.00        0.00               3064
0.01        0.99                937
0.99        0.01                359
0.02        0.98                322
                               ... 
0.68        0.32                  1
0.46        0.54                  1
0.55        0.45                  1
0.48        0.52                  1
0.67        0.33                  1
Name: count, Length: 99, dtype: int64

In [18]:
# Build the report inputs WITHOUT mutating ref_X_test / cur_X. Adding the label
# column in place would leave a non-feature column on the very frames that
# rf.predict / rf.predict_proba are called with, so re-running any prediction
# cell after this one would fail. `assign` returns a new frame instead.
# NOTE(review): 'Attrition' looks like a name carried over from another
# project; it is kept because the column-mapping cell refers to it.
ref_merged = pd.concat(
    [
        ref_X_test.reset_index(drop=True).assign(
            Attrition=['No Failure' if x == 0 else 'Machine Failure' for x in ref_y_test]
        ),
        ref_probas,
    ],
    axis=1,
)

cur_merged = pd.concat(
    [
        cur_X.reset_index(drop=True).assign(
            Attrition=['No Failure' if x == 0 else 'Machine Failure' for x in cur_y]
        ),
        cur_probas,
    ],
    axis=1,
)

In [19]:
# Inspect the reference frame handed to the report: features, the string label
# column, and the two class-probability columns.
ref_merged

Unnamed: 0,Type,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],Attrition,No Failure,Machine Failure
0,0.0,0.108726,0.607490,0.640992,0.752546,0.613613,Machine Failure,0.00,1.00
1,2.0,0.122930,0.613913,0.964756,0.576635,0.461772,Machine Failure,0.05,0.95
2,0.0,0.195818,0.573143,0.572728,0.364513,0.510378,No Failure,1.00,0.00
3,0.0,0.210128,0.515110,0.067194,0.434783,0.592593,No Failure,1.00,0.00
4,1.0,0.136787,0.563187,0.584980,0.293478,0.296296,No Failure,1.00,0.00
...,...,...,...,...,...,...,...,...,...
11578,1.0,0.819950,0.122083,0.334225,0.568599,0.537018,Machine Failure,0.00,1.00
11579,0.0,0.095867,0.851640,0.000000,0.682132,0.629166,Machine Failure,0.01,0.99
11580,0.0,0.207800,0.506868,0.707510,0.836957,0.666667,No Failure,0.99,0.01
11581,1.0,0.190338,0.508242,0.584980,0.652174,0.654321,No Failure,1.00,0.00


In [20]:
# Column mapping for the probability-based report: the target is the string
# label column and the prediction is the pair of per-class probability columns
# (their names must equal the label values).
cm = ColumnMapping()
cm.target = 'Attrition'
cm.prediction = ['No Failure', 'Machine Failure']
# The event being detected is the failure, so that is the positive class.
# Previously this was 'No Failure', which made precision / recall / ROC
# describe the "no failure" class and disagreed with the label-based report,
# where the 0/1 target relies on the default positive label of 1 (failure).
cm.pos_label = 'Machine Failure'

In [21]:
# Classification report driven by the column mapping that points at the
# per-class probability columns.
classification_performance_report = Report(metrics=[ClassificationPreset()])
classification_performance_report.run(
    reference_data=ref_merged,
    current_data=cur_merged,
    column_mapping=cm,
)
classification_performance_report.save_html("../reports/classification_performance_report.html")