# Loading data

In [25]:
# Importing relevant packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [26]:
# Loading the data
df = pd.read_pickle("data.pkl")

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149855 entries, 0 to 149854
Data columns (total 86 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              149855 non-null  int64  
 1   ID                 149855 non-null  int64  
 2   S15                149855 non-null  float64
 3   S17                149855 non-null  float64
 4   S13                149855 non-null  float64
 5   S5                 149855 non-null  float64
 6   S16                149855 non-null  float64
 7   S19                149855 non-null  float64
 8   S18                149855 non-null  float64
 9   EQUIPMENT_FAILURE  149855 non-null  int64  
 10  S8                 149855 non-null  float64
 11  AGE_OF_EQUIPMENT   149855 non-null  int64  
 12  TIME_SINCE_START   149855 non-null  float64
 13  too_soon           149855 non-null  int32  
 14  S15_mean           149855 non-null  float64
 15  S17_mean           149855 non-null  float64
 16  S1

In [4]:
X = df[[x for x in df.columns if x not in ['FAILURE_TARGET','EQUIPMENT_FAILURE', 'TIME_TO_FAILURE']]]
y = df[['FAILURE_TARGET']]

In [5]:
try:
    X = X.to_numpy()
    y = y.to_numpy().ravel()
except:
    pass


In [6]:
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)
#Now since we want the valid and test size to be equal (10% each of overall data).
test_size = 0.5
X_val, X_test, y_val, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [7]:
# Normalizing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
def test(y_true,y_pred):
    print(accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print('tn',tn, 'fp',fp, 'fn',fn, 'tp',tp)
    print('f1', f1_score(y_true, y_pred))


# Sampling Data

In [10]:
from sklearn.utils import resample
X_oversampled, y_oversampled = resample(X_train_scaled[y_train == 1],
                                        y_train[y_train == 1],
                                        replace=True,
                                        n_samples=X_train_scaled[y_train == 0].shape[0])
X_balanced = np.vstack((X_train_scaled[y_train == 0], X_oversampled))
y_balanced = np.hstack((y_train[y_train == 0], y_oversampled))



# Training Random Forest Classifier

In [12]:
# Running RFC
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_balanced,y_balanced)
y_pred_rfc = rfc.predict(X_val_scaled)
test(y_val,y_pred_rfc)

0.9961294627961295
[[14460    13]
 [   45   467]]
tn 14460 fp 13 fn 45 tp 467
f1 0.9415322580645161
