<a href="https://colab.research.google.com/github/Himanshu-Dharma/secom_mpmd/blob/master/Secom_Presentation2_Himanshu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SECOM

### Libraries

In [None]:
import pandas as pd 
import requests as rq
import numpy as np
import datetime as dt
#from sklearn import datasets
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats as stats

#train/test
from sklearn.model_selection import train_test_split

#scaling
from sklearn.preprocessing import MinMaxScaler

# kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

# MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

#Boruta
!pip install boruta
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

from sklearn.metrics import accuracy_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### **Label File**

In [None]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
r = rq.get(url, allow_redirects=True)

open('secomlabels.csv', 'wb').write(r.content)

40638

In [None]:
headers = ['result','datetime']
secom_label = pd.read_csv('secomlabels.csv', encoding='latin-1', sep=' ',  names=headers, index_col=False, header=None)
df = pd.DataFrame(secom_label)

In [None]:
df

Unnamed: 0,result,datetime
0,-1,19/07/2008 11:55:00
1,-1,19/07/2008 12:32:00
2,1,19/07/2008 13:17:00
3,-1,19/07/2008 14:43:00
4,-1,19/07/2008 15:22:00
...,...,...
1562,-1,16/10/2008 15:13:00
1563,-1,16/10/2008 20:49:00
1564,-1,17/10/2008 05:26:00
1565,-1,17/10/2008 06:01:00


In [None]:
df_new = df
df_new['date'] = pd.to_datetime(df['datetime']).dt.date
df_new['time'] = pd.to_datetime(df['datetime']).dt.time
df_new.drop('datetime', inplace=True, axis=1)
print(df)

      result        date      time
0         -1  2008-07-19  11:55:00
1         -1  2008-07-19  12:32:00
2          1  2008-07-19  13:17:00
3         -1  2008-07-19  14:43:00
4         -1  2008-07-19  15:22:00
...      ...         ...       ...
1562      -1  2008-10-16  15:13:00
1563      -1  2008-10-16  20:49:00
1564      -1  2008-10-17  05:26:00
1565      -1  2008-10-17  06:01:00
1566      -1  2008-10-17  06:07:00

[1567 rows x 3 columns]


In [None]:
df_new[df_new.duplicated(['date','time'], keep=False)]

Unnamed: 0,result,date,time
31,-1,2008-07-27,22:28:00
32,-1,2008-07-27,22:28:00
114,-1,2008-05-08,07:12:00
115,1,2008-05-08,07:12:00
285,-1,2008-08-19,05:11:00
...,...,...,...
1518,-1,2008-10-15,02:40:00
1545,-1,2008-10-16,02:16:00
1546,-1,2008-10-16,02:16:00
1551,-1,2008-10-16,04:02:00


### **Data File**

In [None]:
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
r1 = rq.get(url1, allow_redirects=True)

open('secomdata.csv', 'wb').write(r1.content)

5389983

In [None]:
header_data = ['Feature{}'.format(x) for x in range(1,591)]
secom_data = pd.read_csv('secomdata.csv', encoding='latin-1',sep=' ', names=header_data, index_col=False, header=None)
df_data = pd.DataFrame(secom_data)
df_data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
0,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,-0.0045,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,-0.0061,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,-0.0072,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


### **Merged Dataset**

In [None]:
merged_df = pd.merge(df, df_data, left_index=True, right_index=True)
merged_df

In [None]:
feature_df = merged_df.drop(['result','date','time'], axis=1)
feature_df

In [None]:
# correlations 

matrix = feature_df.corr()
matrix

### **Function Definitons**

In [None]:
# KNN imputer

def knn_impute(X):
  impute_knn = KNNImputer(n_neighbors=5)
  return impute_knn.fit_transform(X)


In [None]:
# MICE imputation

def mice_impute(X):
  lr = LinearRegression()
  impute_mice = IterativeImputer(estimator = lr, verbose = 2, max_iter = 5, tol = 1e-10, imputation_order = "roman")
  return impute_mice.fit_transform(X)

In [None]:
# Remove features with 55% missing values

def remove_na(X):
    threshold = int(0.55*X.shape[0])
    df4 = pd.DataFrame(X.isna().sum())
    rem_col = list(df4[df4[0]>=threshold].index)
    return X.drop(columns = rem_col, axis = 1)
  

In [None]:
# select columns having constant values 

def remove_constant_value_features(X):
    return [e for e in X.columns if X[e].nunique() == 1]

### **Train test Split**

In [None]:
df_non_faulty = merged_df[merged_df['result']== -1]
df_faulty = merged_df[merged_df['result']== 1]

In [None]:
faulty_train, faulty_test = train_test_split(df_non_faulty, test_size = 0.2)
nfaulty_train, nfaulty_test = train_test_split(df_faulty, test_size = 0.2)

test_data = pd.concat([faulty_test, nfaulty_test])
train_data = pd.concat([faulty_train, nfaulty_test])

In [None]:
test_data['result'].value_counts()
test_data.shape

(314, 593)

In [None]:
train_data['result'].value_counts()
train_data

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
26,-1,2008-07-27,11:10:00,3067.35,2456.33,2257.1667,1437.9565,1.4918,100.0,106.3400,...,,,0.5006,0.0083,0.0022,1.6593,0.0288,0.0361,0.0101,125.0600
496,-1,2008-08-28,21:03:00,2964.74,2480.51,2163.5889,1448.3869,1.7014,100.0,104.8333,...,,,0.4962,0.0187,0.0044,3.7650,0.0255,0.0260,0.0071,102.1652
1262,-1,2008-03-10,02:41:00,2982.67,2541.55,2173.4889,1145.7970,0.9402,100.0,104.0556,...,,,0.4949,0.0146,0.0033,2.9448,0.0137,0.0326,0.0108,237.4625
910,-1,2008-09-19,11:54:00,2986.28,2483.71,2200.9889,1054.5240,1.3830,100.0,100.1800,...,0.0038,39.6224,0.4969,0.0114,0.0028,2.2966,0.0368,0.0146,0.0038,39.6224
522,-1,2008-08-29,07:24:00,2960.22,2502.25,2183.3111,1588.5090,1.6269,100.0,102.8467,...,,,0.4994,0.0107,0.0029,2.1456,0.0331,0.0210,0.0060,63.2615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,1,2008-08-22,19:14:00,2977.43,2297.30,2218.0555,1517.4371,0.8579,100.0,105.8133,...,,,0.4970,0.0174,0.0041,3.4919,0.0005,0.0115,0.0037,0.0000
576,1,2008-08-30,09:45:00,2990.72,2425.46,2155.6333,1070.0439,0.8024,100.0,101.4333,...,0.0039,76.6094,0.5088,0.0275,0.0062,5.4146,0.0182,0.0139,0.0039,76.6094
50,1,2008-07-29,18:08:00,2942.31,2446.74,2172.9778,1222.6067,1.3658,100.0,101.8400,...,,,0.5026,0.0098,0.0026,1.9519,0.0296,0.0165,0.0055,55.8324
186,1,2008-10-08,15:59:00,2936.64,2509.65,2221.9444,1551.6947,1.5296,100.0,99.2678,...,0.0054,118.2289,0.4987,0.0139,0.0041,2.7935,0.0158,0.0186,0.0054,118.2289


In [None]:
train_data

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
26,-1,2008-07-27,11:10:00,3067.35,2456.33,2257.1667,1437.9565,1.4918,100.0,106.3400,...,,,0.5006,0.0083,0.0022,1.6593,0.0288,0.0361,0.0101,125.0600
496,-1,2008-08-28,21:03:00,2964.74,2480.51,2163.5889,1448.3869,1.7014,100.0,104.8333,...,,,0.4962,0.0187,0.0044,3.7650,0.0255,0.0260,0.0071,102.1652
1262,-1,2008-03-10,02:41:00,2982.67,2541.55,2173.4889,1145.7970,0.9402,100.0,104.0556,...,,,0.4949,0.0146,0.0033,2.9448,0.0137,0.0326,0.0108,237.4625
910,-1,2008-09-19,11:54:00,2986.28,2483.71,2200.9889,1054.5240,1.3830,100.0,100.1800,...,0.0038,39.6224,0.4969,0.0114,0.0028,2.2966,0.0368,0.0146,0.0038,39.6224
522,-1,2008-08-29,07:24:00,2960.22,2502.25,2183.3111,1588.5090,1.6269,100.0,102.8467,...,,,0.4994,0.0107,0.0029,2.1456,0.0331,0.0210,0.0060,63.2615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,1,2008-08-22,19:14:00,2977.43,2297.30,2218.0555,1517.4371,0.8579,100.0,105.8133,...,,,0.4970,0.0174,0.0041,3.4919,0.0005,0.0115,0.0037,0.0000
576,1,2008-08-30,09:45:00,2990.72,2425.46,2155.6333,1070.0439,0.8024,100.0,101.4333,...,0.0039,76.6094,0.5088,0.0275,0.0062,5.4146,0.0182,0.0139,0.0039,76.6094
50,1,2008-07-29,18:08:00,2942.31,2446.74,2172.9778,1222.6067,1.3658,100.0,101.8400,...,,,0.5026,0.0098,0.0026,1.9519,0.0296,0.0165,0.0055,55.8324
186,1,2008-10-08,15:59:00,2936.64,2509.65,2221.9444,1551.6947,1.5296,100.0,99.2678,...,0.0054,118.2289,0.4987,0.0139,0.0041,2.7935,0.0158,0.0186,0.0054,118.2289


### **Train data**

In [None]:
#remove 55% 

remove_na(train_data)

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature577,Feature578,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
1011,-1,2008-09-22,13:03:00,2957.70,2509.62,2226.3111,2252.1538,1.2295,100.0,92.6178,...,1.3646,20.3592,0.5022,0.0092,0.0026,1.8253,0.0167,0.0132,0.0039,79.1086
248,-1,2008-08-18,05:25:00,3056.95,2549.73,2195.7666,1015.3046,1.3663,100.0,101.4600,...,45.4394,27.3500,0.5041,0.0145,0.0041,2.8682,0.0123,0.0270,0.0079,220.0378
732,-1,2008-04-09,16:56:00,2992.37,2484.27,2210.9778,1572.4698,1.0204,100.0,106.2089,...,1.7311,8.5390,0.4941,0.0106,0.0029,2.1462,0.0335,0.0084,0.0030,25.1494
270,-1,2008-08-18,15:33:00,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.0,100.4478,...,1.8497,18.4482,0.4999,0.0095,0.0028,1.9072,0.0274,0.0142,0.0046,51.9067
747,-1,2008-07-09,19:31:00,3057.65,2540.35,2203.4556,1441.1445,0.8264,100.0,104.6767,...,1.7010,24.3097,0.5010,0.0144,0.0035,2.8667,0.0419,0.0098,0.0032,23.3852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,1,2008-07-28,06:45:00,2958.09,2542.24,2222.6778,1547.6125,1.4431,100.0,110.5644,...,1.3166,15.6351,0.5022,0.0105,0.0028,2.0882,0.0150,0.0151,0.0047,100.7279
1238,1,2008-02-10,09:10:00,3060.00,2571.41,2199.6556,1140.3983,1.3369,100.0,103.0967,...,2.1201,13.5037,0.5013,0.0095,0.0022,1.8910,0.0193,0.0072,0.0026,37.6251
1062,1,2008-09-24,15:05:00,3244.74,2422.00,2208.5222,1838.7054,1.1571,100.0,95.2056,...,2.1334,16.2693,0.5027,0.0184,0.0038,3.6629,0.0189,0.0059,0.0017,31.0252
57,1,2008-07-30,12:29:00,2935.94,2586.05,2164.4111,1206.6031,0.9799,100.0,100.5189,...,1.7412,14.0012,0.5048,0.0138,0.0040,2.7309,0.0201,0.0220,0.0065,109.4273


In [None]:
newdff = remove_constant_value_features(train_data)
len(newdff)

116

In [None]:
# Remove featues with constant values

constant_value_colummns = remove_constant_value_features(train_data)

new_df_columns = [e for e in train_data.columns if e not in constant_value_colummns]
new_df_train = train_data[new_df_columns]
new_df_train




Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature7,Feature8,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
1011,-1,2008-09-22,13:03:00,2957.70,2509.62,2226.3111,2252.1538,1.2295,92.6178,0.1239,...,,,0.5022,0.0092,0.0026,1.8253,0.0167,0.0132,0.0039,79.1086
248,-1,2008-08-18,05:25:00,3056.95,2549.73,2195.7666,1015.3046,1.3663,101.4600,0.1207,...,0.0079,220.0378,0.5041,0.0145,0.0041,2.8682,0.0123,0.0270,0.0079,220.0378
732,-1,2008-04-09,16:56:00,2992.37,2484.27,2210.9778,1572.4698,1.0204,106.2089,0.1222,...,,,0.4941,0.0106,0.0029,2.1462,0.0335,0.0084,0.0030,25.1494
270,-1,2008-08-18,15:33:00,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.4478,0.1222,...,0.0046,51.9067,0.4999,0.0095,0.0028,1.9072,0.0274,0.0142,0.0046,51.9067
747,-1,2008-07-09,19:31:00,3057.65,2540.35,2203.4556,1441.1445,0.8264,104.6767,0.1232,...,,,0.5010,0.0144,0.0035,2.8667,0.0419,0.0098,0.0032,23.3852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,1,2008-07-28,06:45:00,2958.09,2542.24,2222.6778,1547.6125,1.4431,110.5644,0.1211,...,0.0047,100.7279,0.5022,0.0105,0.0028,2.0882,0.0150,0.0151,0.0047,100.7279
1238,1,2008-02-10,09:10:00,3060.00,2571.41,2199.6556,1140.3983,1.3369,103.0967,0.1227,...,0.0026,37.6251,0.5013,0.0095,0.0022,1.8910,0.0193,0.0072,0.0026,37.6251
1062,1,2008-09-24,15:05:00,3244.74,2422.00,2208.5222,1838.7054,1.1571,95.2056,0.1249,...,,,0.5027,0.0184,0.0038,3.6629,0.0189,0.0059,0.0017,31.0252
57,1,2008-07-30,12:29:00,2935.94,2586.05,2164.4111,1206.6031,0.9799,100.5189,0.1220,...,0.0065,109.4273,0.5048,0.0138,0.0040,2.7309,0.0201,0.0220,0.0065,109.4273


In [None]:
train_data_df = new_df_train.drop(['result','date','time'], axis = 1)
train_data_df

#Use train_data_df for further operations

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature7,Feature8,Feature9,Feature10,Feature11,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
1011,2957.70,2509.62,2226.3111,2252.1538,1.2295,92.6178,0.1239,1.3561,0.0045,-0.0011,...,,,0.5022,0.0092,0.0026,1.8253,0.0167,0.0132,0.0039,79.1086
248,3056.95,2549.73,2195.7666,1015.3046,1.3663,101.4600,0.1207,1.4493,0.0176,-0.0025,...,0.0079,220.0378,0.5041,0.0145,0.0041,2.8682,0.0123,0.0270,0.0079,220.0378
732,2992.37,2484.27,2210.9778,1572.4698,1.0204,106.2089,0.1222,1.5031,0.0145,-0.0005,...,,,0.4941,0.0106,0.0029,2.1462,0.0335,0.0084,0.0030,25.1494
270,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.4478,0.1222,1.4305,0.0001,-0.0054,...,0.0046,51.9067,0.4999,0.0095,0.0028,1.9072,0.0274,0.0142,0.0046,51.9067
747,3057.65,2540.35,2203.4556,1441.1445,0.8264,104.6767,0.1232,1.5527,-0.0216,0.0013,...,,,0.5010,0.0144,0.0035,2.8667,0.0419,0.0098,0.0032,23.3852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2958.09,2542.24,2222.6778,1547.6125,1.4431,110.5644,0.1211,1.4780,-0.0131,0.0079,...,0.0047,100.7279,0.5022,0.0105,0.0028,2.0882,0.0150,0.0151,0.0047,100.7279
1238,3060.00,2571.41,2199.6556,1140.3983,1.3369,103.0967,0.1227,1.4300,0.0129,-0.0088,...,0.0026,37.6251,0.5013,0.0095,0.0022,1.8910,0.0193,0.0072,0.0026,37.6251
1062,3244.74,2422.00,2208.5222,1838.7054,1.1571,95.2056,0.1249,1.5575,0.0049,-0.0207,...,,,0.5027,0.0184,0.0038,3.6629,0.0189,0.0059,0.0017,31.0252
57,2935.94,2586.05,2164.4111,1206.6031,0.9799,100.5189,0.1220,1.4786,0.0049,0.0116,...,0.0065,109.4273,0.5048,0.0138,0.0040,2.7309,0.0201,0.0220,0.0065,109.4273


**imputations on train data**

In [None]:
X_train = knn_impute(train_data_df)

### Test data

In [None]:
test_data

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
721,-1,2008-04-09,08:01:00,3000.68,2432.18,2198.2667,986.5558,0.8652,100.0,105.0589,...,,,0.4972,0.0117,0.0032,2.3527,0.0121,0.0121,0.0040,100.3091
237,-1,2008-08-17,22:21:00,2908.04,2494.37,2185.9333,1659.6962,1.6290,100.0,98.6822,...,0.0025,30.7439,0.5037,0.0115,0.0031,2.2928,0.0261,0.0080,0.0025,30.7439
1352,-1,2008-06-10,07:35:00,2985.31,2584.24,2209.6667,1586.6088,1.6782,100.0,89.7222,...,,,0.4987,0.0114,0.0033,2.2939,-0.0060,0.0078,0.0024,129.6218
1406,-1,2008-07-10,18:39:00,3094.77,2454.33,2215.8111,1389.3065,2.3183,100.0,98.4500,...,,,0.5003,0.0159,0.0039,3.1780,0.0458,0.0277,0.0100,60.5430
1099,-1,2008-09-26,05:32:00,3160.54,2304.83,2233.6334,1962.0026,1.2120,100.0,99.0133,...,,,0.4981,0.0142,0.0035,2.8606,0.0179,0.0079,0.0024,44.1194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,1,2008-08-22,19:14:00,2977.43,2297.30,2218.0555,1517.4371,0.8579,100.0,105.8133,...,,,0.4970,0.0174,0.0041,3.4919,0.0005,0.0115,0.0037,0.0000
576,1,2008-08-30,09:45:00,2990.72,2425.46,2155.6333,1070.0439,0.8024,100.0,101.4333,...,0.0039,76.6094,0.5088,0.0275,0.0062,5.4146,0.0182,0.0139,0.0039,76.6094
50,1,2008-07-29,18:08:00,2942.31,2446.74,2172.9778,1222.6067,1.3658,100.0,101.8400,...,,,0.5026,0.0098,0.0026,1.9519,0.0296,0.0165,0.0055,55.8324
186,1,2008-10-08,15:59:00,2936.64,2509.65,2221.9444,1551.6947,1.5296,100.0,99.2678,...,0.0054,118.2289,0.4987,0.0139,0.0041,2.7935,0.0158,0.0186,0.0054,118.2289


In [None]:
test_data
y_test = test_data['result'].values
X_test1 = test_data.drop(['result','time','date'], axis = 1)

In [None]:
X_test_remove_na = remove_na(test_data)
X_test_remove_na

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature577,Feature578,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
721,-1,2008-04-09,08:01:00,3000.68,2432.18,2198.2667,986.5558,0.8652,100.0,105.0589,...,1.3108,8.5456,0.4972,0.0117,0.0032,2.3527,0.0121,0.0121,0.0040,100.3091
237,-1,2008-08-17,22:21:00,2908.04,2494.37,2185.9333,1659.6962,1.6290,100.0,98.6822,...,1.7429,14.9904,0.5037,0.0115,0.0031,2.2928,0.0261,0.0080,0.0025,30.7439
1352,-1,2008-06-10,07:35:00,2985.31,2584.24,2209.6667,1586.6088,1.6782,100.0,89.7222,...,1.9267,11.7201,0.4987,0.0114,0.0033,2.2939,-0.0060,0.0078,0.0024,129.6218
1406,-1,2008-07-10,18:39:00,3094.77,2454.33,2215.8111,1389.3065,2.3183,100.0,98.4500,...,1.9267,11.7201,0.5003,0.0159,0.0039,3.1780,0.0458,0.0277,0.0100,60.5430
1099,-1,2008-09-26,05:32:00,3160.54,2304.83,2233.6334,1962.0026,1.2120,100.0,99.0133,...,1.8176,12.6151,0.4981,0.0142,0.0035,2.8606,0.0179,0.0079,0.0024,44.1194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,1,2008-08-22,19:14:00,2977.43,2297.30,2218.0555,1517.4371,0.8579,100.0,105.8133,...,1.3400,11.3021,0.4970,0.0174,0.0041,3.4919,0.0005,0.0115,0.0037,0.0000
576,1,2008-08-30,09:45:00,2990.72,2425.46,2155.6333,1070.0439,0.8024,100.0,101.4333,...,1.3166,10.9822,0.5088,0.0275,0.0062,5.4146,0.0182,0.0139,0.0039,76.6094
50,1,2008-07-29,18:08:00,2942.31,2446.74,2172.9778,1222.6067,1.3658,100.0,101.8400,...,1.2361,25.4953,0.5026,0.0098,0.0026,1.9519,0.0296,0.0165,0.0055,55.8324
186,1,2008-10-08,15:59:00,2936.64,2509.65,2221.9444,1551.6947,1.5296,100.0,99.2678,...,1.7630,17.2239,0.4987,0.0139,0.0041,2.7935,0.0158,0.0186,0.0054,118.2289


In [None]:
dfff = remove_constant_value_features(X_test_remove_na)
len(dfff)

122

In [None]:
# Remove featues with constant values

constant_value_colummns = remove_constant_value_features(X_test_remove_na)

new_df_columns = [e for e in X_test_remove_na.columns if e not in constant_value_colummns]
new_df_test = X_test_remove_na[new_df_columns]
new_df_test

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature7,Feature8,...,Feature577,Feature578,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
721,-1,2008-04-09,08:01:00,3000.68,2432.18,2198.2667,986.5558,0.8652,105.0589,0.1209,...,1.3108,8.5456,0.4972,0.0117,0.0032,2.3527,0.0121,0.0121,0.0040,100.3091
237,-1,2008-08-17,22:21:00,2908.04,2494.37,2185.9333,1659.6962,1.6290,98.6822,0.1227,...,1.7429,14.9904,0.5037,0.0115,0.0031,2.2928,0.0261,0.0080,0.0025,30.7439
1352,-1,2008-06-10,07:35:00,2985.31,2584.24,2209.6667,1586.6088,1.6782,89.7222,0.1213,...,1.9267,11.7201,0.4987,0.0114,0.0033,2.2939,-0.0060,0.0078,0.0024,129.6218
1406,-1,2008-07-10,18:39:00,3094.77,2454.33,2215.8111,1389.3065,2.3183,98.4500,0.1214,...,1.9267,11.7201,0.5003,0.0159,0.0039,3.1780,0.0458,0.0277,0.0100,60.5430
1099,-1,2008-09-26,05:32:00,3160.54,2304.83,2233.6334,1962.0026,1.2120,99.0133,0.1222,...,1.8176,12.6151,0.4981,0.0142,0.0035,2.8606,0.0179,0.0079,0.0024,44.1194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,1,2008-08-22,19:14:00,2977.43,2297.30,2218.0555,1517.4371,0.8579,105.8133,0.1206,...,1.3400,11.3021,0.4970,0.0174,0.0041,3.4919,0.0005,0.0115,0.0037,0.0000
576,1,2008-08-30,09:45:00,2990.72,2425.46,2155.6333,1070.0439,0.8024,101.4333,0.1241,...,1.3166,10.9822,0.5088,0.0275,0.0062,5.4146,0.0182,0.0139,0.0039,76.6094
50,1,2008-07-29,18:08:00,2942.31,2446.74,2172.9778,1222.6067,1.3658,101.8400,0.1220,...,1.2361,25.4953,0.5026,0.0098,0.0026,1.9519,0.0296,0.0165,0.0055,55.8324
186,1,2008-10-08,15:59:00,2936.64,2509.65,2221.9444,1551.6947,1.5296,99.2678,0.1222,...,1.7630,17.2239,0.4987,0.0139,0.0041,2.7935,0.0158,0.0186,0.0054,118.2289


In [None]:
X_test = knn_impute(X_test_remove_na)

### boruta

In [None]:
y_train = train_data['result'].values
y_train

array([-1, -1, -1, ...,  1,  1,  1])

In [None]:
model = RandomForestClassifier(
   n_jobs = -1, 
   max_depth = 5,
   n_estimators=500,
   random_state=1
)

feat_selector = BorutaPy(
    verbose=2,
    estimator=model,
    n_estimators='auto',
    max_iter=100,
    random_state=42,
)



In [None]:
feat_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	474
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	463
Iteration: 	16 / 100
Confirmed: 	0
Tentat

BorutaPy(estimator=RandomForestClassifier(max_depth=5, n_estimators=93,
                                          n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7FB385CD3AF0),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FB385CD3AF0, verbose=2)

In [None]:
feat_selector.support_

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [None]:
feat_selector.ranking_

In [None]:
X_filtered = feat_selector.transform(X_train)
X_filtered

In [None]:
feature_ranks = list(zip(feature_names, 
                         feat_selector.ranking_, 
                         feat_selector.support_))

### BORUTA 2

In [None]:
rf_all_features = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
rf_all_features.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=1000, random_state=1)

In [None]:
accuracy_score(y_test, rf_all_features.predict(X_test))

ValueError: ignored