In [1]:
%matplotlib inline 
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import os, sys, pickle

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Loading Data

In [2]:
df_header = pd.read_csv('secom_labels.data.txt', header=None, sep=' ')
df_value = pd.read_csv('secom.data.txt', header=None, sep=' ')

df_header
df_value

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.0060,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,-0.0045,...,0.0047,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720
1563,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,-0.0061,...,,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720
1564,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,,...,0.0025,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231
1565,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,-0.0072,...,0.0075,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941


In [3]:
df_header.columns = ['Label', 'Time']  # Rename existing columns
df_header['Time'] = pd.to_datetime(df_header['Time'], format='%d/%m/%Y %H:%M:%S')  # Convert 'Time' to datetime

# Verify the changes
print("DataFrame with renamed columns and datetime conversion:")
print(df_header.head())

DataFrame with renamed columns and datetime conversion:
   Label                Time
0     -1 2008-07-19 11:55:00
1     -1 2008-07-19 12:32:00
2      1 2008-07-19 13:17:00
3     -1 2008-07-19 14:43:00
4     -1 2008-07-19 15:22:00


In [4]:
n_value_columns = df_value.shape[1]
df_value.columns = [f'Measment {i}' for i in range(n_value_columns)]

# Verify the changes
print("DataFrame df_value with new column names:")
print(df_value.head())

DataFrame df_value with new column names:
   Measment 0  Measment 1  Measment 2  Measment 3  Measment 4  Measment 5  \
0     3030.93     2564.00   2187.7333   1411.1265      1.3602       100.0   
1     3095.78     2465.14   2230.4222   1463.6606      0.8294       100.0   
2     2932.61     2559.94   2186.4111   1698.0172      1.5102       100.0   
3     2988.72     2479.90   2199.0333    909.7926      1.3204       100.0   
4     3032.24     2502.87   2233.3667   1326.5200      1.5334       100.0   

   Measment 6  Measment 7  Measment 8  Measment 9  ...  Measment 580  \
0     97.6133      0.1242      1.5005      0.0162  ...           NaN   
1    102.3433      0.1247      1.4966     -0.0005  ...        0.0060   
2     95.4878      0.1241      1.4436      0.0041  ...        0.0148   
3    104.2367      0.1217      1.4882     -0.0124  ...        0.0044   
4    100.3967      0.1235      1.5031     -0.0031  ...           NaN   

   Measment 581  Measment 582  Measment 583  Measment 584  Mea

In [5]:
merged_df = pd.concat([df_header, df_value], axis=1)
df = merged_df
print("Merged DataFrame:")
print(df.head())

Merged DataFrame:
   Label                Time  Measment 0  Measment 1  Measment 2  Measment 3  \
0     -1 2008-07-19 11:55:00     3030.93     2564.00   2187.7333   1411.1265   
1     -1 2008-07-19 12:32:00     3095.78     2465.14   2230.4222   1463.6606   
2      1 2008-07-19 13:17:00     2932.61     2559.94   2186.4111   1698.0172   
3     -1 2008-07-19 14:43:00     2988.72     2479.90   2199.0333    909.7926   
4     -1 2008-07-19 15:22:00     3032.24     2502.87   2233.3667   1326.5200   

   Measment 4  Measment 5  Measment 6  Measment 7  ...  Measment 580  \
0      1.3602       100.0     97.6133      0.1242  ...           NaN   
1      0.8294       100.0    102.3433      0.1247  ...        0.0060   
2      1.5102       100.0     95.4878      0.1241  ...        0.0148   
3      1.3204       100.0    104.2367      0.1217  ...        0.0044   
4      1.5334       100.0    100.3967      0.1235  ...           NaN   

   Measment 581  Measment 582  Measment 583  Measment 584  Measment 

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Label to Measment 589
dtypes: datetime64[ns](1), float64(590), int64(1)
memory usage: 7.1 MB


In [7]:
display(df.head())

Unnamed: 0,Label,Time,Measment 0,Measment 1,Measment 2,Measment 3,Measment 4,Measment 5,Measment 6,Measment 7,...,Measment 580,Measment 581,Measment 582,Measment 583,Measment 584,Measment 585,Measment 586,Measment 587,Measment 588,Measment 589
0,-1,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,-1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,1,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,-1,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,-1,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


# Data Preprocessing

#### Feature scaling

In [8]:
numeric_columns = df.columns[2:]
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print("Normalized DataFrame:")
df.head()

Normalized DataFrame:


Unnamed: 0,Label,Time,Measment 0,Measment 1,Measment 2,Measment 3,Measment 4,Measment 5,Measment 6,Measment 7,...,Measment 580,Measment 581,Measment 582,Measment 583,Measment 584,Measment 585,Measment 586,Measment 587,Measment 588,Measment 589
0,-1,2008-07-19 11:55:00,0.223879,0.847825,-0.43432,0.033405,-0.050354,0.0,-0.561266,0.265504,...,,,0.118661,-0.204825,-0.093178,-0.19705,,,,
1,-1,2008-07-19 12:32:00,1.105015,-0.382054,1.012583,0.152382,-0.059776,0.0,0.19733,0.321317,...,0.194011,1.260949,0.530034,0.406549,0.444564,0.384936,-0.959868,0.411722,0.250045,1.15632
2,1,2008-07-19 13:17:00,-1.112023,0.797316,-0.479135,0.683141,-0.047691,0.0,-0.902153,0.254341,...,3.020445,-0.172375,-1.262377,0.022257,0.014371,0.029823,2.990196,3.625906,3.320359,-0.179091
3,-1,2008-07-19 14:43:00,-0.34964,-0.198431,-0.051316,-1.101992,-0.05106,0.0,0.500993,-0.013563,...,-0.319886,-0.275485,-0.322096,-0.292164,-0.362049,-0.283326,-0.101862,-0.17887,-0.308194,-0.275158
4,-1,2008-07-19 15:22:00,0.241679,0.08733,1.112384,-0.158208,-0.04728,0.0,-0.114865,0.187365,...,,,-5.905014,26.858657,27.062785,26.904758,-0.101862,-0.17887,-0.308194,-0.275158


### Feature Selection

In [9]:
correlation_matrix = df.corr(method='pearson')
print("Pearson Correlation Coefficient Matrix:")
correlation_matrix

  correlation_matrix = df.corr(method='pearson')


Pearson Correlation Coefficient Matrix:


Unnamed: 0,Label,Measment 0,Measment 1,Measment 2,Measment 3,Measment 4,Measment 5,Measment 6,Measment 7,Measment 8,...,Measment 580,Measment 581,Measment 582,Measment 583,Measment 584,Measment 585,Measment 586,Measment 587,Measment 588,Measment 589
Label,1.000000,-0.025144,-0.002615,-0.000957,-0.024631,-0.013760,,0.016244,0.012993,0.028018,...,0.027367,-0.020748,0.047021,0.005981,0.005419,0.005034,0.004157,0.035392,0.031168,-0.002654
Measment 0,-0.025144,1.000000,-0.145071,0.004775,-0.007655,-0.011047,,0.002281,0.031510,-0.052731,...,-0.070137,-0.028380,0.000225,0.023469,0.019921,0.023605,0.018472,-0.025909,-0.028196,0.004177
Measment 1,-0.002615,-0.145071,1.000000,0.005802,-0.007603,-0.001641,,-0.025702,-0.012084,0.031321,...,0.073211,0.083463,0.043690,0.002905,-0.001264,0.002273,-0.009417,0.017290,0.010134,0.044834
Measment 2,-0.000957,0.004775,0.005802,1.000000,0.298935,0.095891,,-0.136225,-0.273970,0.023609,...,-0.018721,-0.010759,-0.006061,0.015711,0.018237,0.015765,-0.025548,-0.029479,-0.030943,-0.033226
Measment 3,-0.024631,-0.007655,-0.007603,0.298935,1.000000,-0.058483,,-0.685835,0.138290,-0.103656,...,-0.057051,-0.096619,0.009045,0.025461,0.024754,0.026043,0.034779,-0.039309,-0.033780,-0.081157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Measment 585,0.005034,0.023605,0.002273,0.015765,0.026043,-0.001617,,-0.039569,0.010345,0.017930,...,-0.000395,-0.000673,-0.197363,0.999890,0.995342,1.000000,0.002744,-0.002931,-0.002531,-0.003801
Measment 586,0.004157,0.018472,-0.009417,-0.025548,0.034779,-0.044016,,-0.041296,0.058171,0.010436,...,0.249926,-0.475397,-0.016726,0.002257,0.001606,0.002744,1.000000,0.167913,0.164238,-0.486559
Measment 587,0.035392,-0.025909,0.017290,-0.029479,-0.039309,-0.031145,,0.034184,-0.021472,0.022853,...,0.975470,0.396369,-0.024481,-0.002650,-0.002498,-0.002931,0.167913,1.000000,0.974276,0.390813
Measment 588,0.031168,-0.028196,0.010134,-0.030943,-0.033780,-0.026204,,0.032359,-0.020962,0.026261,...,1.000000,0.379167,-0.020712,-0.002261,-0.001957,-0.002531,0.164238,0.974276,1.000000,0.389211


In [10]:
corr_with_label = correlation_matrix['Label'].abs()
sorted_corr = corr_with_label.sort_values(ascending=False)
top_measurements = sorted_corr.index[1:11]  # Select top 10 measurements (excluding 'Label' itself)

print("Top 10 influential measurements:")
print(top_measurements)

Top 10 influential measurements:
Index(['Measment 59', 'Measment 103', 'Measment 510', 'Measment 348',
       'Measment 158', 'Measment 431', 'Measment 293', 'Measment 111',
       'Measment 434', 'Measment 430'],
      dtype='object')


#### Feature Reduction

In [11]:
top_measurements = sorted_corr.index[1:11]  # Select top 10 measurements excluding 'Label'
columns_to_keep = ['Label'] + list(top_measurements)
df_filtered = df[columns_to_keep]
print("Filtered DataFrame with influential measurements:")
print(df_filtered.head())

Filtered DataFrame with influential measurements:
   Label  Measment 59  Measment 103  Measment 510  Measment 348  Measment 158  \
0     -1    -0.491821      1.824366      0.236392     -0.270326           NaN   
1     -1    -0.225932      1.726441      2.273718     -0.885912           NaN   
2      1     2.189516      2.346633      4.910157     -1.197920           NaN   
3     -1     2.247716      1.269458      1.533871      1.348747           NaN   
4     -1    -1.600848      1.726441      2.449670     -0.008914           NaN   

   Measment 431  Measment 293  Measment 111  Measment 434  Measment 430  
0      0.296778           NaN           NaN     -0.137359      0.009590  
1     -0.552246           NaN           NaN     -0.178906     -0.221460  
2      0.187116           NaN           NaN     -0.014184     -0.224847  
3     -0.246077           NaN           NaN     -0.280308     -0.075251  
4     -0.319354           NaN           NaN     -0.097588     -0.156938  


# Splitting Dataset

In [12]:
X = df_filtered.drop('Label', axis=1)  # Features
y = df_filtered['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (1253, 10)
Shape of X_test: (314, 10)
Shape of y_train: (1253,)
Shape of y_test: (314,)


# Model Selection

In [13]:
nb_model = GaussianNB()

## Model Training

In [14]:
imputer = SimpleImputer(strategy='mean')  # You can change strategy as per your data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
nb_model.fit(X_train_imputed, y_train)

## Model Testing

In [15]:

nb_model.fit(X_test_imputed, y_test)

In [16]:
y_pred = nb_model.predict(X_test_imputed)

## Model Evaluation

In [17]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 88.54%


## Model Classification Report

In [18]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

          -1       0.93      0.95      0.94       290
           1       0.17      0.12      0.14        24

    accuracy                           0.89       314
   macro avg       0.55      0.54      0.54       314
weighted avg       0.87      0.89      0.88       314

