## Exploring The Data

In [1]:
import pandas as pd

FILEPATH = r"archive\uci-secom.csv"
UCI_SECOM_dataset = pd.read_csv(FILEPATH)
UCI_SECOM_dataset

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.3630,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2008-10-16 15:13:00,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,...,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720,-1
1563,2008-10-16 20:49:00,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,...,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720,-1
1564,2008-10-17 05:26:00,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,...,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231,-1
1565,2008-10-17 06:01:00,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,...,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941,-1


In [2]:
UCI_SECOM_dataset.isna().sum()

Time          0
0             6
1             7
2            14
3            14
             ..
586           1
587           1
588           1
589           1
Pass/Fail     0
Length: 592, dtype: int64

In [3]:
# Since im deleting time, no need to convert string to number
UCI_SECOM_dataset.dtypes.value_counts()

float64    590
str          1
int64        1
Name: count, dtype: int64

## Splitting The Dataset

In [4]:
# Since it's already sorted by time, we can separate it easily
# column_sum = len(UCI_SECOM_dataset.columns)

index_sum = len(UCI_SECOM_dataset)
split = round(index_sum * 0.8)
training_split = UCI_SECOM_dataset[:split]
testing_split = UCI_SECOM_dataset[split:]

## Pipeline

In [5]:
training_data = training_split.drop(columns="Pass/Fail")
training_label = training_split["Pass/Fail"]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTranformator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        data = X.copy()
        data.drop(columns=["Time"], inplace=True)
        return data

In [7]:
# Problem 1: many missing values -> imputers
# Problem 2: Need to be standardized (data varies too large) -> std
# Problem 3: Time might cause temporal leakeage -> remove from data
# After this put PCA (later after using without to check accuracy)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipeline = Pipeline([
    ('custom', CustomTranformator()),
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('pca', PCA())
])
final_dataset = pipeline.fit_transform(training_data, training_label)

In [8]:
# PCA?
# columns = list(training_data.columns)
# columns.remove('Time')

final_dataset_dataframe = pd.DataFrame(
    final_dataset
)
final_dataset_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,-1.680284,-2.095135,3.279701,-2.059105,-1.963834,-1.550592,2.691773,-1.512271,-1.439701,-1.085717,...,3.920181e-17,-3.079081e-16,-2.954128e-16,-5.417676e-16,-3.667263e-17,6.264401e-16,-1.184468e-15,-2.891083e-18,2.191672e-16,3.916192e-17
1,-2.281144,-0.268489,2.351333,-1.555019,-2.583460,-0.565508,3.134543,-3.315182,-3.106655,-2.103316,...,4.819883e-16,1.037168e-16,-7.451324e-17,4.103900e-16,-1.891884e-16,1.014561e-16,1.056081e-16,1.603536e-16,-1.208594e-16,2.849355e-18
2,0.856315,-1.118508,1.403522,-0.892967,-0.914171,1.885987,0.109128,-3.847920,-0.802710,-2.146926,...,2.016517e-16,-2.804179e-17,-2.140786e-16,1.035086e-16,-1.185928e-16,7.686553e-17,-1.230957e-16,-3.722604e-17,-5.814886e-17,-3.694903e-17
3,1.863786,-4.533767,4.435949,-4.859907,-5.588165,-7.148467,-8.347313,-9.800727,-2.378493,-17.892664,...,-6.409624e-17,9.201048e-18,-6.583685e-17,8.210322e-17,1.933648e-17,8.226886e-17,3.213570e-17,-5.152200e-18,9.510854e-17,3.853103e-17
4,1.533506,-2.370757,3.429716,-2.443270,-1.693298,1.818304,-7.302256,-2.852151,-5.722627,2.760529,...,3.473357e-17,-1.979384e-16,2.518626e-16,-1.104934e-16,1.410225e-16,1.415521e-16,-9.227100e-17,-8.482790e-17,-1.918118e-16,-4.766382e-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249,1.559021,3.659582,-1.627361,1.186056,-1.676532,6.810497,1.146897,-5.707554,2.218482,1.228356,...,1.558347e-17,-6.546893e-17,2.350207e-16,-5.119125e-17,3.169902e-17,1.805569e-16,2.192586e-17,1.527027e-18,1.031910e-16,1.288604e-17
1250,0.304290,2.542348,-3.528388,2.691257,-1.236288,-1.088409,-5.540596,-3.522396,-0.340936,-1.370167,...,3.276705e-16,-4.815134e-17,-7.880062e-17,5.138131e-17,4.259139e-16,3.229058e-16,4.373754e-17,1.417066e-16,2.037645e-16,1.086776e-16
1251,-0.769923,1.764115,-2.000270,2.108429,-1.031194,4.968919,-2.852413,-6.184184,1.233666,2.435943,...,2.700722e-18,1.101160e-16,-1.229983e-16,2.524532e-16,-2.101248e-16,-5.916849e-17,3.584217e-16,-7.840479e-17,7.059607e-17,-6.486442e-17
1252,1.321503,2.452466,-4.327941,3.532730,0.981838,-2.852135,-10.326377,-1.695838,0.166382,0.580836,...,-7.147813e-18,-6.672255e-17,-1.209946e-16,-2.336284e-17,-1.717833e-16,6.352120e-17,7.368269e-17,4.694382e-17,-1.526393e-16,5.573090e-18


## Training Model

In [9]:
# Ensemble model afterwards
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

lr = LogisticRegression()
rf = RandomForestClassifier()
svc = SVC()

In [11]:
from sklearn.model_selection import cross_val_score

lr_scores = cross_val_score(lr, final_dataset_dataframe, training_label, scoring="accuracy")
print("Scores: ", lr_scores)
rf_scores = cross_val_score(rf, final_dataset_dataframe, training_label, scoring="accuracy")
print("Scores: ", rf_scores)
svc_scores = cross_val_score(svc, final_dataset_dataframe, training_label, scoring="accuracy")
print("Scores: ", svc_scores)

# Scores lr:  [0.49800797 0.73705179 0.84462151 0.87250996 0.9       ]
# Scores rf:  [0.8685259  0.92031873 0.92828685 0.92828685 0.932     ]
# Scores svc:  [0.93227092 0.93227092 0.92828685 0.92828685 0.932     ]

Scores:  [0.5059761  0.73705179 0.84462151 0.87250996 0.904     ]
Scores:  [0.93227092 0.93227092 0.92828685 0.92828685 0.932     ]
Scores:  [0.93227092 0.93227092 0.92828685 0.92828685 0.932     ]


## Grid Search