In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_pickle("data.pkl")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149855 entries, 0 to 149854
Data columns (total 86 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              149855 non-null  int64  
 1   ID                 149855 non-null  int64  
 2   S15                149855 non-null  float64
 3   S17                149855 non-null  float64
 4   S13                149855 non-null  float64
 5   S5                 149855 non-null  float64
 6   S16                149855 non-null  float64
 7   S19                149855 non-null  float64
 8   S18                149855 non-null  float64
 9   EQUIPMENT_FAILURE  149855 non-null  int64  
 10  S8                 149855 non-null  float64
 11  AGE_OF_EQUIPMENT   149855 non-null  int64  
 12  TIME_SINCE_START   149855 non-null  float64
 13  too_soon           149855 non-null  int32  
 14  S15_mean           149855 non-null  float64
 15  S17_mean           149855 non-null  float64
 16  S1

In [4]:
X = df[[x for x in df.columns if x not in ['FAILURE_TARGET','EQUIPMENT_FAILURE', 'TIME_TO_FAILURE']]]
y = df[['FAILURE_TARGET']]

In [5]:
try:
    X = X.to_numpy()
    y = y.to_numpy().ravel()
except:
    pass


In [6]:
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)
#Now since we want the valid and test size to be equal (10% each of overall data).
test_size = 0.5
X_val, X_test, y_val, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
def test(y_true,y_pred):
    print(accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print('tn',tn, 'fp',fp, 'fn',fn, 'tp',tp)
    print('f1', f1_score(y_true, y_pred))


In [9]:
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc.fit(X_train_scaled,y_train)
y_pred = svc.predict(X_val_scaled)
test(y_val,y_pred)

0.968968968968969
[[14428     8]
 [  457    92]]
tn 14428 fp 8 fn 457 tp 92
f1 0.2835130970724192


In [10]:
from sklearn.utils import resample
X_oversampled, y_oversampled = resample(X_train_scaled[y_train == 1],
                                        y_train[y_train == 1],
                                        replace=True,
                                        n_samples=X_train_scaled[y_train == 0].shape[0])
X_balanced = np.vstack((X_train_scaled[y_train == 0], X_oversampled))
y_balanced = np.hstack((y_train[y_train == 0], y_oversampled))



In [11]:
svc_balanced = SVC(gamma='auto')
svc_balanced.fit(X_balanced[::2],y_balanced[::2])
y_pred_balanced = svc_balanced.predict(X_val_scaled)
test(y_val,y_pred_balanced)

0.8894227560894228
[[12791  1645]
 [   12   537]]
tn 12791 fp 1645 fn 12 tp 537
f1 0.3932625411937019


In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_balanced,y_balanced)
y_pred_rfc = rfc.predict(X_val_scaled)
test(y_val,y_pred_rfc)

0.9960627293960628
[[14415    21]
 [   38   511]]
tn 14415 fp 21 fn 38 tp 511
f1 0.9454209065679926


In [13]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(40, 20))
nn.fit(X_balanced,y_balanced)
y_pred_nn = nn.predict(X_val_scaled)
test(y_val,y_pred_nn)

0.9803803803803803
[[14170   266]
 [   28   521]]
tn 14170 fp 266 fn 28 tp 521
f1 0.7799401197604791


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [14]:
y_pred_nn = nn.predict(X_test_scaled)
test(y_test,y_pred_nn)

0.9804484185239557
[[14126   263]
 [   30   567]]
tn 14126 fp 263 fn 30 tp 567
f1 0.7946741415557114


In [15]:
y_pred_rfc = rfc.predict(X_test_scaled)
test(y_test,y_pred_rfc)

0.9966635526491392
[[14378    11]
 [   39   558]]
tn 14378 fp 11 fn 39 tp 558
f1 0.9571183533447685
