In [1]:
#Import

import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv('../Data/final.csv')

### Oversampling the data with smote:

In [3]:
# https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-variants

In [4]:
#Import
from imblearn.over_sampling import BorderlineSMOTE

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
#First we need to create X and y:
X = df.drop(['SN','status'], axis=1)
y = df.status

In [6]:
X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

In [7]:
len(X_resampled)

4628

In [8]:
len(y_resampled)

4628

In [9]:
unique_elements, counts_elements = np.unique(y_resampled, return_counts=True)
print("Frequency of PASS/FAIL values:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of PASS/FAIL values:
[['F' 'P']
 [2314 2314]]


In [10]:
#We create a new dataframe with the resampled data:
df_oversample = pd.DataFrame(data=X_resampled, columns=X.columns)

In [11]:
#Adding the Y_resampled:
df_oversample['status'] = y_resampled

### Separating train set into X and y

In [12]:
X_over = df_oversample.drop('status', axis=1)
y_over = df_oversample.status

In [13]:
df_oversample.isnull().sum()

A_Indep_Front             0
A_RollSpeed_Front         0
A_R2R2W_Front             0
KM_Front                  0
KM_DEV_Front              0
A_Indep_Rear              0
A_RollSpeed_Rear          0
A_R2R2W_Rear              0
KM_Rear                   0
KM_DEV_Rear               0
TR_PWM_SLOP_FRONT         0
TR_PWM_OFFSET_FRONT       0
R2_Front                  0
TR_AccuENC_RATIO_FRONT    0
TR_PWM_SLOP_REAR          0
TR_PWM_OFFSET_REAR        0
R2_Rear                   0
TR_AccuENC_RATIO_REAR     0
status                    0
dtype: int64

In [14]:
print(len(X_over))
print(len(y_over))

4628
4628


### Splitting into train and test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

### Using KNN

In [17]:
transformer = Normalizer().fit(X_train)
norm_X = transformer.transform(X_train)

In [18]:
transformer = Normalizer().fit(X_test)
norm_X_test = transformer.transform(X_test)

In [19]:
KNN = KNeighborsClassifier()

In [58]:
param_grid = {'n_neighbors': [5,7,9,11,15],
              'weights': ['uniform'], #We only use uniform because 'distance' overfit a lot.
              'leaf_size': [10, 20, 30, 50]}

In [59]:
GS = GridSearchCV(KNN, param_grid, cv=5, verbose=3)

In [60]:
GS.fit(norm_X, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] leaf_size=20, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=5, weights=uniform, score=0.765, total=   0.0s
[CV] leaf_size=20, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=5, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=20, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=5, weights=uniform, score=0.753, total=   0.0s
[CV] leaf_size=20, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=5, weights=uniform, score=0.772, total=   0.0s
[CV] leaf_size=20, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=5, weights=uniform, score=0.739, total=   0.0s
[CV] leaf_size=20, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=7, weights=uniform, score=0.744, total=   0.0s
[CV] leaf_size=20, n_neighbors=7, weigh

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  leaf_size=20, n_neighbors=7, weights=uniform, score=0.717, total=   0.0s
[CV] leaf_size=20, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=7, weights=uniform, score=0.743, total=   0.0s
[CV] leaf_size=20, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=7, weights=uniform, score=0.759, total=   0.0s
[CV] leaf_size=20, n_neighbors=7, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=7, weights=uniform, score=0.728, total=   0.0s
[CV] leaf_size=20, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=9, weights=uniform, score=0.738, total=   0.0s
[CV] leaf_size=20, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=9, weights=uniform, score=0.713, total=   0.0s
[CV] leaf_size=20, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=20, n_neighbors=9, weights=uniform, score=0.719, total=   0.0s
[CV] leaf_size=20, n_

[CV]  leaf_size=50, n_neighbors=9, weights=uniform, score=0.719, total=   0.0s
[CV] leaf_size=50, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=50, n_neighbors=9, weights=uniform, score=0.749, total=   0.0s
[CV] leaf_size=50, n_neighbors=9, weights=uniform ....................
[CV]  leaf_size=50, n_neighbors=9, weights=uniform, score=0.724, total=   0.0s
[CV] leaf_size=50, n_neighbors=11, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=11, weights=uniform, score=0.725, total=   0.0s
[CV] leaf_size=50, n_neighbors=11, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=11, weights=uniform, score=0.707, total=   0.0s
[CV] leaf_size=50, n_neighbors=11, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=11, weights=uniform, score=0.715, total=   0.0s
[CV] leaf_size=50, n_neighbors=11, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=11, weights=uniform, score=0.743, total=   0.0s
[CV] leaf_size=50

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    2.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=100,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=21, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'leaf_size': [20, 30, 50],
                         'n_neighbors': [5, 7, 9, 11, 15],
                         'weights': ['uniform']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [61]:
#We check the best params of the search grid:
GS.best_params_

{'leaf_size': 20, 'n_neighbors': 5, 'weights': 'uniform'}

In [82]:
#Run the model with the selected parameters:
KNN = KNeighborsClassifier(leaf_size=100, n_neighbors=9, weights='uniform')

In [83]:
KNN.fit(norm_X, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=100, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [84]:
#Prediction of the test dataset:
y_train_pred = KNN.predict(norm_X)
y_pred = KNN.predict(norm_X_test)

In [85]:
#We check scores with train:
f1 = f1_score(y_train, y_train_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_train, y_train_pred)
conf = confusion_matrix(y_train, y_train_pred)

print ('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TRAIN MODEL METRICS:
The F1 score is: 0.8000239329124325
The accuracy is: 0.8014586709886548
Confusion matrix:


array([[1650,  220],
       [ 515, 1317]])

In [86]:
#We check scores with test:
f1 = f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TEST MODEL METRICS:
The F1 score is: 0.7119062929635427
The accuracy is: 0.7159827213822895
Confusion matrix:


array([[378,  66],
       [197, 285]])

### Check the metrics with only the selected data:

In [87]:
y_pred_real = KNN.predict(X)

In [88]:
#We check scores:
f1 = f1_score(y, y_pred_real, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y, y_pred_real)
conf = confusion_matrix(y, y_pred_real)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TEST MODEL METRICS:
The F1 score is: 0.7490140448768295
The accuracy is: 0.7023988005997002
Confusion matrix:


array([[ 272,   82],
       [ 712, 1602]])