In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the equipment-failure table; the first row of the CSV supplies the column names.
csv_path = "equipment_failure_data_1.csv"
data = pd.read_csv(csv_path, header=0)

In [3]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149855 entries, 0 to 149854
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ID                  149855 non-null  int64  
 1   DATE                149855 non-null  object 
 2   REGION_CLUSTER      149855 non-null  object 
 3   MAINTENANCE_VENDOR  149855 non-null  object 
 4   MANUFACTURER        149855 non-null  object 
 5   WELL_GROUP          149855 non-null  int64  
 6   S15                 149855 non-null  float64
 7   S17                 149855 non-null  float64
 8   S13                 149855 non-null  float64
 9   S5                  149855 non-null  float64
 10  S16                 149855 non-null  float64
 11  S19                 149855 non-null  float64
 12  S18                 149855 non-null  float64
 13  EQUIPMENT_FAILURE   149855 non-null  int64  
 14  S8                  149855 non-null  float64
 15  AGE_OF_EQUIPMENT    149855 non-nul

In [4]:
# Split the raw table into the feature matrix X and the label vector y.
#
# The DataFrame -> ndarray conversion is guarded by an explicit type check
# instead of a bare `try/except: pass`. Re-running this cell when `data` is
# already an ndarray is still a no-op, but a genuine failure inside
# `to_numpy()` is no longer silently swallowed.
if isinstance(data, pd.DataFrame):
    data = data.to_numpy()

# Positional columns 6..12 are the seven float columns
# S15, S17, S13, S5, S16, S19, S18; column 13 is EQUIPMENT_FAILURE.
X = data[:, 6:13].astype('float')
y = data[:, 13].astype('int')

print(X.shape)
print(y[0])

(149855, 7)
0


In [5]:
# Hold-out split: 80% train, 10% validation, 10% test.
#
# A fixed seed makes the partition reproducible across kernel restarts.
# Stratifying on the label keeps the rare positive class (EQUIPMENT_FAILURE == 1)
# present in the same proportion in every partition.
RANDOM_STATE = 42
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=RANDOM_STATE
)

# Validation and test should each be 10% of the overall data, so the
# remaining 20% is split in half: test_size=0.5 of the remainder.
test_size = 0.5
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=test_size, stratify=y_rem, random_state=RANDOM_STATE
)

In [6]:
# Standardise the features. The scaler is fitted on the training split only;
# validation and test are transformed with that same fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(split) for split in (X_val, X_test))


In [7]:
from sklearn.svm import SVC

# Baseline classifier: fit on the scaled training split, then predict the
# validation split. `fit` returns the estimator itself, so the calls chain.
svc = SVC(gamma='auto').fit(X_train_scaled, y_train)
y_pred = svc.predict(X_val_scaled)

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the validation predictions.
#
# `labels=[0, 1]` pins the confusion matrix to 2x2 even if only one class
# occurs in both y_val and y_pred; without it the matrix collapses to 1x1
# and the four-way unpack below raises ValueError.
# The matrix is computed once and reused for printing and unpacking.
confusion = confusion_matrix(y_val, y_pred, labels=[0, 1])
print(accuracy_score(y_val, y_pred))
print(confusion)
tn, fp, fn, tp = confusion.ravel()
print(tn, fp, fn, tp)

0.9986653319986654
[[14965     0]
 [   20     0]]
14965 0 20 0


In [17]:
from sklearn.utils import resample
X_oversampled, y_oversampled = resample(X_train_scaled[y_train == 1],
                                        y_train[y_train == 1],
                                        replace=True,
                                        n_samples=X_train_scaled[y_train == 0].shape[0])
X_balanced = np.vstack((X_train[y_train == 0], X_oversampled))
y_balanced = np.hstack((y_train[y_train == 0], y_oversampled))
svc = SVC(gamma='auto')
svc.fit(X_balanced,y_balanced)
y_pred = svc.predict(X_val_scaled)


In [18]:
# Score the current validation predictions.
#
# `labels=[0, 1]` pins the confusion matrix to 2x2 even if only one class
# occurs in both y_val and y_pred; without it the matrix collapses to 1x1
# and the four-way unpack below raises ValueError.
# The matrix is computed once and reused for printing and unpacking.
confusion = confusion_matrix(y_val, y_pred, labels=[0, 1])
print(accuracy_score(y_val, y_pred))
print(confusion)
tn, fp, fn, tp = confusion.ravel()
print(tn, fp, fn, tp)

0.8999666332999666
[[13467  1498]
 [    1    19]]
13467 1498 1 19


In [23]:
# Sanity-check the balanced training set: feature-matrix shape, label-vector
# shape, and the distinct label values present.
for summary in (X_balanced.shape, y_balanced.shape, np.unique(y_balanced)):
    print(summary)

(239438, 7)
(239438,)
[0 1]
