In [1]:
#Import

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv('../Data/final.csv')

### Oversampling the data with smote:

In [3]:
# https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-variants

In [4]:
#Import
from imblearn.over_sampling import BorderlineSMOTE

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
#First we need to create X and y:
X = df.drop(['SN','status'], axis=1)
y = df.status

In [6]:
X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

In [7]:
len(X_resampled)

4628

In [8]:
len(y_resampled)

4628

In [9]:
unique_elements, counts_elements = np.unique(y_resampled, return_counts=True)
print("Frequency of PASS/FAIL values:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of PASS/FAIL values:
[['F' 'P']
 [2314 2314]]


In [10]:
#We create a new dataframe with the resampled data:
df_oversample = pd.DataFrame(data=X_resampled, columns=X.columns)

In [11]:
#Adding the Y_resampled:
df_oversample['status'] = y_resampled

In [12]:
df_oversample.head()

Unnamed: 0,A_Indep_Front,A_RollSpeed_Front,A_R2R2W_Front,KM_Front,KM_DEV_Front,A_Indep_Rear,A_RollSpeed_Rear,A_R2R2W_Rear,KM_Rear,KM_DEV_Rear,TR_PWM_SLOP_FRONT,TR_PWM_OFFSET_FRONT,R2_Front,TR_AccuENC_RATIO_FRONT,TR_PWM_SLOP_REAR,TR_PWM_OFFSET_REAR,R2_Rear,TR_AccuENC_RATIO_REAR,status
0,0.390513,0.000601,128.379043,0.04195,0.037106,0.468861,0.000664,89.051406,0.042561,0.053677,1.370314,-37.246261,0.999998,9.075283,1.372229,-45.169482,0.999979,9.176037,P
1,0.348259,0.000509,129.046143,0.041847,0.052308,0.338558,0.000449,86.57981,0.041813,0.041948,1.374144,-28.468581,0.999986,9.075158,1.370026,-40.57786,0.999983,9.174968,P
2,0.407983,0.00062,53.861514,0.041439,0.031839,0.447917,0.001068,72.697784,0.041715,0.033923,1.335473,-38.879725,0.999897,9.076344,1.3771,-38.177445,0.999997,9.179953,P
3,0.386216,0.000631,76.627232,0.041407,0.034478,0.3844,0.000903,79.220271,0.042135,0.032906,1.331818,-32.805212,0.999993,9.075598,1.379523,-58.649253,0.999995,9.156949,P
4,0.373776,0.000921,94.511528,0.041983,0.037544,0.451047,0.000969,81.454951,0.041807,0.01589,1.337583,-35.964199,0.999978,9.075733,1.358422,-40.033938,0.999999,9.178558,P


In [13]:
df_oversample.shape

(4628, 19)

### Separating train set into X and y

In [15]:
X_over = df_oversample.drop('status', axis=1)
y_over = df_oversample.status

### Splitting into train and test

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

### Running a grid search for parameter selection for a Random Forest Classifier model

In [28]:
param_grid = {
    'n_estimators': [10,20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2, 3, 5]
}

In [29]:
RFC = RandomForestClassifier(n_jobs=-1)

In [30]:
GS = GridSearchCV(RFC, param_grid, cv=5, verbose = 3)

In [None]:
GS.fit(X_train, y_train)

In [None]:
GS.best_params_

In [21]:
RFC = RandomForestClassifier(max_depth = 5, max_features='auto', n_estimators=1000)

In [22]:
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
#Prediction of the test dataset:
y_train_pred = RFC.predict(X_train)
y_pred = RFC.predict(X_test)

In [24]:
#We check scores with train:
f1 = f1_score(y_train, y_train_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_train, y_train_pred)
conf = confusion_matrix(y_train, y_train_pred)

print ('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TRAIN MODEL METRICS:
The F1 score is: 0.7716767520010936
The accuracy is: 0.7722852512155591
Confusion matrix:


array([[1535,  335],
       [ 508, 1324]])

In [25]:
#We check scores with test:
f1 = f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TEST MODEL METRICS:
The F1 score is: 0.7260995417804449
The accuracy is: 0.7267818574514039
Confusion matrix:


array([[352,  92],
       [161, 321]])

### Check the metrics with only the selected data:

In [26]:
y_pred_real = RFC.predict(X)

In [27]:
#We check scores:
f1 = f1_score(y, y_pred_real, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y, y_pred_real)
conf = confusion_matrix(y, y_pred_real)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TEST MODEL METRICS:
The F1 score is: 0.7527168108714317
The accuracy is: 0.7083958020989505
Confusion matrix:


array([[ 245,  109],
       [ 669, 1645]])

### Final score: 0.78% accuracy of the model.