## SECOM

### Libraries

In [67]:
import pandas as pd 
import requests as rq
import numpy as np
import datetime as dt
#from sklearn import datasets
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats as stats

#train/test
from sklearn.model_selection import train_test_split

#scaling
from sklearn.preprocessing import MinMaxScaler

# kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

# MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

#Boruta
!pip install boruta
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### **Label File**

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
r = rq.get(url, allow_redirects=True)

open('secomlabels.csv', 'wb').write(r.content)

40638

In [3]:
headers = ['result','datetime']
secom_label = pd.read_csv('secomlabels.csv', encoding='latin-1', sep=' ',  names=headers, index_col=False, header=None)
df = pd.DataFrame(secom_label)

In [4]:
df

Unnamed: 0,result,datetime
0,-1,19/07/2008 11:55:00
1,-1,19/07/2008 12:32:00
2,1,19/07/2008 13:17:00
3,-1,19/07/2008 14:43:00
4,-1,19/07/2008 15:22:00
...,...,...
1562,-1,16/10/2008 15:13:00
1563,-1,16/10/2008 20:49:00
1564,-1,17/10/2008 05:26:00
1565,-1,17/10/2008 06:01:00


In [5]:
df_new = df
df_new['date'] = pd.to_datetime(df['datetime']).dt.date
df_new['time'] = pd.to_datetime(df['datetime']).dt.time
df_new.drop('datetime', inplace=True, axis=1)
print(df)

      result        date      time
0         -1  2008-07-19  11:55:00
1         -1  2008-07-19  12:32:00
2          1  2008-07-19  13:17:00
3         -1  2008-07-19  14:43:00
4         -1  2008-07-19  15:22:00
...      ...         ...       ...
1562      -1  2008-10-16  15:13:00
1563      -1  2008-10-16  20:49:00
1564      -1  2008-10-17  05:26:00
1565      -1  2008-10-17  06:01:00
1566      -1  2008-10-17  06:07:00

[1567 rows x 3 columns]


In [6]:
df_new[df_new.duplicated(['date','time'], keep=False)]

Unnamed: 0,result,date,time
31,-1,2008-07-27,22:28:00
32,-1,2008-07-27,22:28:00
114,-1,2008-05-08,07:12:00
115,1,2008-05-08,07:12:00
285,-1,2008-08-19,05:11:00
...,...,...,...
1518,-1,2008-10-15,02:40:00
1545,-1,2008-10-16,02:16:00
1546,-1,2008-10-16,02:16:00
1551,-1,2008-10-16,04:02:00


### **Data File**

In [7]:
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
r1 = rq.get(url1, allow_redirects=True)

open('secomdata.csv', 'wb').write(r1.content)

5389983

In [8]:
header_data = ['Feature{}'.format(x) for x in range(1,591)]
secom_data = pd.read_csv('secomdata.csv', encoding='latin-1',sep=' ', names=header_data, index_col=False, header=None)
df_data = pd.DataFrame(secom_data)
df_data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
0,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,-0.0045,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,-0.0061,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,-0.0072,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


### **Merged Dataset**

In [None]:
merged_df = pd.merge(df, df_data, left_index=True, right_index=True)
merged_df

In [None]:
feature_df = merged_df.drop(['result','date','time'], axis=1)
feature_df

In [None]:
# correlations 

matrix = feature_df.corr()
matrix

### **Function Definitons**

In [10]:
# KNN imputer

def knn_impute(X):
  impute_knn = KNNImputer(n_neighbors=5)
  return impute_knn.fit_transform(X)


In [11]:
# MICE imputation

def mice_impute(X):
  lr = LinearRegression()
  impute_mice = IterativeImputer(estimator = lr, verbose = 2, max_iter = 5, tol = 1e-10, imputation_order = "roman")
  return impute_mice.fit_transform(X)

In [12]:
# Remove features with 55% missing values

def remove_na(X):
    threshold = int(0.55*X.shape[0])
    df4 = pd.DataFrame(X.isna().sum())
    rem_col = list(df4[df4[0]>=threshold].index)
    return X.drop(columns = rem_col, axis = 1)
  

In [13]:
# select columns having constant values 

def remove_constant_value_features(X):
    return [e for e in X.columns if X[e].nunique() == 1]

### **Train test Split**

In [14]:
df_non_faulty = merged_df[merged_df['result']== -1]
df_faulty = merged_df[merged_df['result']== 1]

In [15]:
faulty_train, faulty_test = train_test_split(df_non_faulty, test_size = 0.2)
nfaulty_train, nfaulty_test = train_test_split(df_faulty, test_size = 0.2)

test_data = pd.concat([faulty_test, nfaulty_test])
train_data = pd.concat([faulty_train, nfaulty_test])

In [16]:
test_data['result'].value_counts()
test_data.shape

(314, 593)

In [17]:
train_data['result'].value_counts()
train_data

(1191, 593)

In [None]:
train_data

### **Train data**

In [70]:
#remove 55% 

remove_na(train_data)

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature577,Feature578,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
1011,-1,2008-09-22,13:03:00,2957.70,2509.62,2226.3111,2252.1538,1.2295,100.0,92.6178,...,1.3646,20.3592,0.5022,0.0092,0.0026,1.8253,0.0167,0.0132,0.0039,79.1086
248,-1,2008-08-18,05:25:00,3056.95,2549.73,2195.7666,1015.3046,1.3663,100.0,101.4600,...,45.4394,27.3500,0.5041,0.0145,0.0041,2.8682,0.0123,0.0270,0.0079,220.0378
732,-1,2008-04-09,16:56:00,2992.37,2484.27,2210.9778,1572.4698,1.0204,100.0,106.2089,...,1.7311,8.5390,0.4941,0.0106,0.0029,2.1462,0.0335,0.0084,0.0030,25.1494
270,-1,2008-08-18,15:33:00,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.0,100.4478,...,1.8497,18.4482,0.4999,0.0095,0.0028,1.9072,0.0274,0.0142,0.0046,51.9067
747,-1,2008-07-09,19:31:00,3057.65,2540.35,2203.4556,1441.1445,0.8264,100.0,104.6767,...,1.7010,24.3097,0.5010,0.0144,0.0035,2.8667,0.0419,0.0098,0.0032,23.3852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,1,2008-07-28,06:45:00,2958.09,2542.24,2222.6778,1547.6125,1.4431,100.0,110.5644,...,1.3166,15.6351,0.5022,0.0105,0.0028,2.0882,0.0150,0.0151,0.0047,100.7279
1238,1,2008-02-10,09:10:00,3060.00,2571.41,2199.6556,1140.3983,1.3369,100.0,103.0967,...,2.1201,13.5037,0.5013,0.0095,0.0022,1.8910,0.0193,0.0072,0.0026,37.6251
1062,1,2008-09-24,15:05:00,3244.74,2422.00,2208.5222,1838.7054,1.1571,100.0,95.2056,...,2.1334,16.2693,0.5027,0.0184,0.0038,3.6629,0.0189,0.0059,0.0017,31.0252
57,1,2008-07-30,12:29:00,2935.94,2586.05,2164.4111,1206.6031,0.9799,100.0,100.5189,...,1.7412,14.0012,0.5048,0.0138,0.0040,2.7309,0.0201,0.0220,0.0065,109.4273


In [71]:
newdff = remove_constant_value_features(train_data)
len(newdff)

116

In [72]:
# Remove featues with constant values

constant_value_colummns = remove_constant_value_features(train_data)

new_df_columns = [e for e in train_data.columns if e not in constant_value_colummns]
new_df_train = train_data[new_df_columns]
new_df_train




Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature7,Feature8,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
1011,-1,2008-09-22,13:03:00,2957.70,2509.62,2226.3111,2252.1538,1.2295,92.6178,0.1239,...,,,0.5022,0.0092,0.0026,1.8253,0.0167,0.0132,0.0039,79.1086
248,-1,2008-08-18,05:25:00,3056.95,2549.73,2195.7666,1015.3046,1.3663,101.4600,0.1207,...,0.0079,220.0378,0.5041,0.0145,0.0041,2.8682,0.0123,0.0270,0.0079,220.0378
732,-1,2008-04-09,16:56:00,2992.37,2484.27,2210.9778,1572.4698,1.0204,106.2089,0.1222,...,,,0.4941,0.0106,0.0029,2.1462,0.0335,0.0084,0.0030,25.1494
270,-1,2008-08-18,15:33:00,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.4478,0.1222,...,0.0046,51.9067,0.4999,0.0095,0.0028,1.9072,0.0274,0.0142,0.0046,51.9067
747,-1,2008-07-09,19:31:00,3057.65,2540.35,2203.4556,1441.1445,0.8264,104.6767,0.1232,...,,,0.5010,0.0144,0.0035,2.8667,0.0419,0.0098,0.0032,23.3852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,1,2008-07-28,06:45:00,2958.09,2542.24,2222.6778,1547.6125,1.4431,110.5644,0.1211,...,0.0047,100.7279,0.5022,0.0105,0.0028,2.0882,0.0150,0.0151,0.0047,100.7279
1238,1,2008-02-10,09:10:00,3060.00,2571.41,2199.6556,1140.3983,1.3369,103.0967,0.1227,...,0.0026,37.6251,0.5013,0.0095,0.0022,1.8910,0.0193,0.0072,0.0026,37.6251
1062,1,2008-09-24,15:05:00,3244.74,2422.00,2208.5222,1838.7054,1.1571,95.2056,0.1249,...,,,0.5027,0.0184,0.0038,3.6629,0.0189,0.0059,0.0017,31.0252
57,1,2008-07-30,12:29:00,2935.94,2586.05,2164.4111,1206.6031,0.9799,100.5189,0.1220,...,0.0065,109.4273,0.5048,0.0138,0.0040,2.7309,0.0201,0.0220,0.0065,109.4273


In [None]:
train_data_df = new_df_train.drop(['result','date','time'], axis = 1)
train_data_df

#Use train_data_df for further operations

**imputations on train data**

In [76]:
X_train = knn_impute(train_data_df)

### boruta

In [77]:
y_train = train_data['result'].values
y_train

array([-1, -1, -1, ...,  1,  1,  1])

In [78]:
model = RandomForestClassifier(
   n_jobs = -1, 
   max_depth = 5,
   n_estimators=500,
   random_state=1
)

feat_selector = BorutaPy(
    verbose=2,
    estimator=model,
    n_estimators='auto',
    max_iter=100,
    random_state=42,
)



In [91]:
feat_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	3
Rejected: 	471
Iteration: 	16 / 100
Confirmed: 	0
Tentative: 	3


BorutaPy(estimator=RandomForestClassifier(max_depth=5, n_estimators=48,
                                          n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7FB385CD3AF0),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FB385CD3AF0, verbose=2)

In [97]:
feat_selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [96]:
feat_selector.ranking_

array([123, 429,   8, 212,  12, 182, 343, 387, 210,  75, 278, 461,  20,
       184,  62, 136, 457, 330, 404,  79,  85, 251, 423,  77, 196,  33,
       292, 332, 236, 393, 364, 195, 379, 261, 390, 230,  18, 396,  87,
       219, 217, 101, 126, 133,  98, 121, 339,   2, 380, 294, 111, 312,
       168, 298,  73, 319,  16,  20,  54,   1,  59, 139,  35,  24,  74,
         3, 370, 402, 469, 378,  23,  31, 408, 203, 298, 212, 141,  30,
       348, 276, 329, 264, 244, 397, 432, 382, 360, 328, 411, 438, 459,
       464, 406, 401, 302,   4, 194, 374, 441, 322, 402, 349, 333, 393,
       174, 172, 180, 114, 346, 324,  93, 236, 201, 247, 119, 283, 161,
       160, 186, 135, 203, 283,  57, 358, 451, 217, 264, 317, 434, 453,
       364, 117, 316, 279,  86, 447, 250, 300,   6, 149, 286, 393, 353,
       235,  31, 381, 280, 145, 414, 339,  60, 210, 413, 172, 146,  50,
       142,  26, 207,  39, 203,  47, 303, 282,  67, 368, 247, 215, 288,
       435, 312, 367, 435, 191, 223, 341,  61, 178,  52, 131,  5

In [None]:
X_filtered = feat_selector.transform(X_train)
X_filtered

In [None]:
feature_ranks = list(zip(feature_names, 
                         feat_selector.ranking_, 
                         feat_selector.support_))