In [1]:
import pandas as pd
import numpy as np

### RW

In [2]:
rw = pd.read_csv('../Data/RW.csv')

In [3]:
#For this dataset we have to drop the following columns:
drop_col = ['day','time','datetime','A_R2R2W_RollSpeed_Front','A_R2R2W_RollSpeed_Rear']
rw.drop(drop_col, axis = 1, inplace = True)

### TR

In [4]:
tr = pd.read_csv('../Data/TR.csv')

In [5]:
#For this dataset we have to drop the following columns:
drop_col = ['day','time','datetime']
tr.drop(drop_col, axis = 1, inplace = True)

### BT

In [6]:
bt = pd.read_csv('../Data/BT.csv')
bt2 = pd.read_csv('../Data/BT2.csv')

In [7]:
#For this dataset we only have to keep the pass column:
bt = bt[['SN','status']]
bt2 = bt2[['SN','status']]

In [8]:
#We concat both dataframes and check duplicates:

bt_full = pd.concat([bt, bt2])

In [9]:
bt_full.drop_duplicates(keep=False,inplace=True) 

In [10]:
bt_full.shape

(13871, 2)

In [11]:
bt_full.head(20)

Unnamed: 0,SN,status
0,DRW301500191,F
1,DRW301500161,P
2,DRW301500171,P
3,DRW301500221,P
4,DRW301500231,P
5,DRW301500251,P
6,DRW301500261,P
7,DRW301500291,P
8,DRW301500301,P
9,DRW301500311,P


In [12]:
#There are 30 SN with status P and F. We delete de F ones.
f_bt = list(bt_full[bt_full['status']=='F'].SN.unique())
p_sn = list(bt_full[bt_full['status']=='P'].SN.unique())
drop_sn = list(set(f_bt).intersection(p_sn))

#Now, we drop these rows:
bt_1 = bt_full.loc[~bt_full['SN'].isin(drop_sn)]
bt_2 = bt_full.loc[bt_full['SN'].isin(drop_sn)]
bt_2 = bt_2[bt_2['status']=='P']

bt_full = pd.concat([bt_1,bt_2])

In [13]:
len(bt_full.SN.unique())

13841

### Merge all dataframes together:

In [14]:
#SN list:

rw_sn = list(rw.SN.unique())
tr_sn = list(tr.SN.unique())
bt_sn = list(bt_full.SN.unique())
print(len(rw_sn))
print(len(tr_sn))
print(len(bt_sn))

3995
3996
13841


In [15]:
#We need to keep the SN common in all lists:
final_sn = list(set(rw_sn).intersection(bt_sn))
final_sn = list(set(final_sn).intersection(tr_sn))

In [16]:
#The final numer of data is:
print(len(final_sn))

2668


In [17]:
#We filter all dataframes by SN and merge them:
rw_final = rw.loc[rw['SN'].isin(final_sn)]
tr_final = tr.loc[tr['SN'].isin(final_sn)]
bt_final = bt_full.loc[bt_full['SN'].isin(final_sn)]

#Merge:
model_bt = pd.merge(rw_final, tr_final, on='SN')
model_bt = pd.merge(model_bt, bt_final, on='SN')

In [18]:
model_bt.shape

(2668, 20)

In [19]:
model_bt.head()

Unnamed: 0,SN,A_Indep_Front,A_RollSpeed_Front,A_R2R2W_Front,KM_Front,KM_DEV_Front,A_Indep_Rear,A_RollSpeed_Rear,A_R2R2W_Rear,KM_Rear,KM_DEV_Rear,TR_PWM_SLOP_FRONT,TR_PWM_OFFSET_FRONT,R2_Front,TR_AccuENC_RATIO_FRONT,TR_PWM_SLOP_REAR,TR_PWM_OFFSET_REAR,R2_Rear,TR_AccuENC_RATIO_REAR,status
0,DRW091810461,0.390513,0.000601,128.379043,0.04195,0.037106,0.468861,0.000664,89.051406,0.042561,0.053677,1.370314,-37.246261,0.999998,9.075283,1.372229,-45.169482,0.999979,9.176037,P
1,DRW091810471,0.348259,0.000509,129.046143,0.041847,0.052308,0.338558,0.000449,86.57981,0.041813,0.041948,1.374144,-28.468581,0.999986,9.075158,1.370026,-40.57786,0.999983,9.174968,P
2,DRW391710071,0.407983,0.00062,53.861514,0.041439,0.031839,0.447917,0.001068,72.697784,0.041715,0.033923,1.335473,-38.879725,0.999897,9.076344,1.3771,-38.177445,0.999997,9.179953,P
3,DRW391700001,0.386216,0.000631,76.627232,0.041407,0.034478,0.3844,0.000903,79.220271,0.042135,0.032906,1.331818,-32.805212,0.999993,9.075598,1.379523,-58.649253,0.999995,9.156949,P
4,DRW391710021,0.373776,0.000921,94.511528,0.041983,0.037544,0.451047,0.000969,81.454951,0.041807,0.01589,1.337583,-35.964199,0.999978,9.075733,1.358422,-40.033938,0.999999,9.178558,P


In [20]:
model_bt.to_csv('../Data/final.csv', index=False)

### Target variable EDA

In [21]:
#We check the number of each "Status" (Pass or Fail)
model_bt.status.value_counts()

P    2314
F     354
Name: status, dtype: int64

In [22]:
P = 2314/(2314+354)
P

0.8673163418290855

In [23]:
F = 354/(2314+354)
F

0.13268365817091454

In [24]:
#There is a 86,7% of PASS results and a 13.3 FAIL results. We need to balance the classes.

### Oversampling with SMOTE

In [26]:
# https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-variants

In [25]:
#Import
from imblearn.over_sampling import SMOTENC

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [29]:
#Import
from imblearn.over_sampling import BorderlineSMOTE

In [31]:
#First we need to create X and y:
X = model_bt.drop(['SN','status'], axis=1)
y = model_bt.status

In [32]:
X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

In [41]:
len(X_resampled)

4628

In [33]:
len(y_resampled)

4628

In [40]:
unique_elements, counts_elements = np.unique(y_resampled, return_counts=True)
print("Frequency of PASS/FAIL values:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of PASS/FAIL values:
[['F' 'P']
 [2314 2314]]


In [52]:
#We create a new dataframe with the resampled data:
model_bt_oversample = pd.DataFrame(data=X_resampled, columns=X.columns)

In [54]:
#Adding the Y_resampled:
model_bt_oversample['status'] = y_resampled

In [55]:
model_bt_oversample.head()

Unnamed: 0,A_Indep_Front,A_RollSpeed_Front,A_R2R2W_Front,KM_Front,KM_DEV_Front,A_Indep_Rear,A_RollSpeed_Rear,A_R2R2W_Rear,KM_Rear,KM_DEV_Rear,TR_PWM_SLOP_FRONT,TR_PWM_OFFSET_FRONT,R2_Front,TR_AccuENC_RATIO_FRONT,TR_PWM_SLOP_REAR,TR_PWM_OFFSET_REAR,R2_Rear,TR_AccuENC_RATIO_REAR,status
0,0.390513,0.000601,128.379043,0.04195,0.037106,0.468861,0.000664,89.051406,0.042561,0.053677,1.370314,-37.246261,0.999998,9.075283,1.372229,-45.169482,0.999979,9.176037,P
1,0.348259,0.000509,129.046143,0.041847,0.052308,0.338558,0.000449,86.57981,0.041813,0.041948,1.374144,-28.468581,0.999986,9.075158,1.370026,-40.57786,0.999983,9.174968,P
2,0.407983,0.00062,53.861514,0.041439,0.031839,0.447917,0.001068,72.697784,0.041715,0.033923,1.335473,-38.879725,0.999897,9.076344,1.3771,-38.177445,0.999997,9.179953,P
3,0.386216,0.000631,76.627232,0.041407,0.034478,0.3844,0.000903,79.220271,0.042135,0.032906,1.331818,-32.805212,0.999993,9.075598,1.379523,-58.649253,0.999995,9.156949,P
4,0.373776,0.000921,94.511528,0.041983,0.037544,0.451047,0.000969,81.454951,0.041807,0.01589,1.337583,-35.964199,0.999978,9.075733,1.358422,-40.033938,0.999999,9.178558,P


In [56]:
model_bt_oversample.shape

(4628, 19)

In [57]:
model_bt_oversample.to_csv('../Data/final_oversample.csv', index=False)