<a href="https://colab.research.google.com/github/Himanshu-Dharma/secom_mpmd/blob/master/Secom_Presentation2_Himanshu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SECOM

### Libraries

In [72]:
import pandas as pd 
import requests as rq
import numpy as np
import datetime as dt
#from sklearn import datasets
import seaborn as sn
import matplotlib.pyplot as plt
import scipy.stats as stats

#train/test
from sklearn.model_selection import train_test_split

#scaling
from sklearn.preprocessing import MinMaxScaler

# kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

# MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

### **Label File**

In [60]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'
r = rq.get(url, allow_redirects=True)

open('secomlabels.csv', 'wb').write(r.content)

40638

In [61]:
headers = ['result','datetime']
secom_label = pd.read_csv('secomlabels.csv', encoding='latin-1', sep=' ',  names=headers, index_col=False, header=None)
df = pd.DataFrame(secom_label)

In [62]:
df

Unnamed: 0,result,datetime
0,-1,19/07/2008 11:55:00
1,-1,19/07/2008 12:32:00
2,1,19/07/2008 13:17:00
3,-1,19/07/2008 14:43:00
4,-1,19/07/2008 15:22:00
...,...,...
1562,-1,16/10/2008 15:13:00
1563,-1,16/10/2008 20:49:00
1564,-1,17/10/2008 05:26:00
1565,-1,17/10/2008 06:01:00


In [63]:
df_new = df
df_new['date'] = pd.to_datetime(df['datetime']).dt.date
df_new['time'] = pd.to_datetime(df['datetime']).dt.time
df_new.drop('datetime', inplace=True, axis=1)
print(df)

      result        date      time
0         -1  2008-07-19  11:55:00
1         -1  2008-07-19  12:32:00
2          1  2008-07-19  13:17:00
3         -1  2008-07-19  14:43:00
4         -1  2008-07-19  15:22:00
...      ...         ...       ...
1562      -1  2008-10-16  15:13:00
1563      -1  2008-10-16  20:49:00
1564      -1  2008-10-17  05:26:00
1565      -1  2008-10-17  06:01:00
1566      -1  2008-10-17  06:07:00

[1567 rows x 3 columns]


In [64]:
df_new[df_new.duplicated(['date','time'], keep=False)]

Unnamed: 0,result,date,time
31,-1,2008-07-27,22:28:00
32,-1,2008-07-27,22:28:00
114,-1,2008-05-08,07:12:00
115,1,2008-05-08,07:12:00
285,-1,2008-08-19,05:11:00
...,...,...,...
1518,-1,2008-10-15,02:40:00
1545,-1,2008-10-16,02:16:00
1546,-1,2008-10-16,02:16:00
1551,-1,2008-10-16,04:02:00


### **Data File**

In [65]:
url1 = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
r1 = rq.get(url1, allow_redirects=True)

open('secomdata.csv', 'wb').write(r1.content)

5389983

In [66]:
header_data = ['Feature{}'.format(x) for x in range(1,591)]
secom_data = pd.read_csv('secomdata.csv', encoding='latin-1',sep=' ', names=header_data, index_col=False, header=None)
df_data = pd.DataFrame(secom_data)
df_data

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
0,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,-0.0045,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,-0.0061,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,-0.0072,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


### **Merged Dataset**

In [67]:
merged_df = pd.merge(df, df_data, left_index=True, right_index=True)
merged_df

Unnamed: 0,result,date,time,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,...,Feature581,Feature582,Feature583,Feature584,Feature585,Feature586,Feature587,Feature588,Feature589,Feature590
0,-1,2008-07-19,11:55:00,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,-1,2008-07-19,12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,1,2008-07-19,13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,-1,2008-07-19,14:43:00,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,-1,2008-07-19,15:22:00,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,-1,2008-10-16,15:13:00,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,-1,2008-10-16,20:49:00,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,-1,2008-10-17,05:26:00,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,-1,2008-10-17,06:01:00,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


### **Function Definitons**

In [68]:
# KNN imputer

def knn_impute(X):
  impute_knn = KNNImputer(n_neighbors=5)
  return impute_knn.fit_transform(X)


In [69]:
# MICE imputation

def mice_impute(X):
  lr = LinearRegression()
  impute_mice = IterativeImputer(estimator = lr, verbose = 2, max_iter = 5, tol = 1e-10, imputation_order = "roman")
  return impute_mice.fit_transform(X)

In [70]:
# Remove features with 55% missing values

def remove_na(X):
    threshold = .55*(len(X))
    df_drop = X.dropna(axis=1, thresh=862)
    return df_drop
  

### **Train test plit**

In [73]:
df_non_faulty = merged_df[merged_df['result']== -1]
df_faulty = merged_df[merged_df['result']== 1]

In [74]:
faulty_train, faulty_test = train_test_split(df_non_faulty, test_size = 0.2)
nfaulty_train, nfaulty_test = train_test_split(df_faulty, test_size = 0.2)

test_data = pd.concat([faulty_test, nfaulty_test])
train_data = pd.concat([faulty_train, nfaulty_test])

In [83]:
test_data['result'].value_counts()
test_data.shape

(314, 593)

In [81]:
train_data['result'].value_counts()
train_data.shape

(1191, 593)

In [82]:
train_drop_na = remove_na(train_data)
train_drop_na.shape

(1191, 561)