# 2.0 DATA UNDERSTANDING STAGE

### 2.1 libraries

##### 2.1 -1 Install required libraries 

In [1]:
# !pip install numpy
# !pip install pandas
# !pip install requests
# !pip install matplotlib

##### 2.1 -2 Import Required libraries

In [34]:
import numpy as np
import pandas as pd
import requests
import zipfile
from io import BytesIO
import matplotlib as plt
from sklearn.model_selection import train_test_split

import plotly.graph_objs as go # bar and pie charts, Plotly
import plotly
import plotly.figure_factory as ff
import plotly.offline as plof

## 2.2 Gathering data

#### 2.2 -1 importing data

In [13]:
# URL of the zipped folder
url = "https://archive.ics.uci.edu/static/public/179/secom.zip"

# download the zipped folder using request library
zip_file = requests.get(url)

# opening the zipped folder
secom_files =  zipfile.ZipFile(BytesIO(zip_file.content), 'r')

# printing the file names inside the zip
secom_files_names =  (zipfile.ZipFile(BytesIO(zip_file.content))).namelist()
print(secom_files_names)

['secom.data', 'secom.names', 'secom_labels.data']


In [14]:
# reading the relevant files 

f_file = secom_files.open('secom.data')
l_file = secom_files.open('secom_labels.data')
n_file = secom_files.open('secom.names')


#### 2.2 -2 labels data

In [15]:
#Create column names and pandas dataframes for labels data

l_column_names=['classifications', 'date_time'] #### create column labels

labels_df = pd.read_csv(l_file, sep=r'\s+', header=None, names= l_column_names)

print(f"Dimensions of Labels data: ", labels_df.shape)

print(labels_df.head(5))

Dimensions of Labels data:  (1567, 2)
   classifications            date_time
0               -1  19/07/2008 11:55:00
1               -1  19/07/2008 12:32:00
2                1  19/07/2008 13:17:00
3               -1  19/07/2008 14:43:00
4               -1  19/07/2008 15:22:00


In [16]:
labels_df.isna().sum()

classifications    0
date_time          0
dtype: int64

In [17]:
# checking the order of data time

labels_df['date_time'] = pd.to_datetime(labels_df['date_time'], format='%d/%m/%Y %H:%M:%S')

is_ascending = labels_df['date_time'].is_monotonic_increasing

if is_ascending:
    print("The timestamps column is in ascending (oldest to newest) order.")
else:
    print("The timestamps column is not in ascending (newest to oldest) order.")

The timestamps column is in ascending (oldest to newest) order.


### 2-2 -3 feature.data

In [18]:
# adding column name as "feature-##"
f_column_names = [f"feature_{i}" for i in range(1, 592)]

features_df = pd.read_csv(f_file, sep=r'\s+', header=None, names= f_column_names)
print(f"Dimensions of Features' data: ", features_df.shape)
print("---------------------------------------------------------")
print(features_df.head())

Dimensions of Features' data:  (1567, 591)
---------------------------------------------------------
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0    3030.93    2564.00  2187.7333  1411.1265     1.3602      100.0   
1    3095.78    2465.14  2230.4222  1463.6606     0.8294      100.0   
2    2932.61    2559.94  2186.4111  1698.0172     1.5102      100.0   
3    2988.72    2479.90  2199.0333   909.7926     1.3204      100.0   
4    3032.24    2502.87  2233.3667  1326.5200     1.5334      100.0   

   feature_7  feature_8  feature_9  feature_10  ...  feature_582  feature_583  \
0    97.6133     0.1242     1.5005      0.0162  ...          NaN       0.5005   
1   102.3433     0.1247     1.4966     -0.0005  ...     208.2045       0.5019   
2    95.4878     0.1241     1.4436      0.0041  ...      82.8602       0.4958   
3   104.2367     0.1217     1.4882     -0.0124  ...      73.8432       0.4990   
4   100.3967     0.1235     1.5031     -0.0031  ...          NaN   

### 2-2 -4 merging

In [50]:
# merging 

merged_df = pd.merge(labels_df, features_df, left_index=True, right_index=True)

print(f"Dimensions of merged data: ", merged_df.shape)

print(merged_df.head())


Dimensions of merged data:  (1567, 593)
   classifications           date_time  feature_1  feature_2  feature_3  \
0               -1 2008-07-19 11:55:00    3030.93    2564.00  2187.7333   
1               -1 2008-07-19 12:32:00    3095.78    2465.14  2230.4222   
2                1 2008-07-19 13:17:00    2932.61    2559.94  2186.4111   
3               -1 2008-07-19 14:43:00    2988.72    2479.90  2199.0333   
4               -1 2008-07-19 15:22:00    3032.24    2502.87  2233.3667   

   feature_4  feature_5  feature_6  feature_7  feature_8  ...  feature_582  \
0  1411.1265     1.3602      100.0    97.6133     0.1242  ...          NaN   
1  1463.6606     0.8294      100.0   102.3433     0.1247  ...     208.2045   
2  1698.0172     1.5102      100.0    95.4878     0.1241  ...      82.8602   
3   909.7926     1.3204      100.0   104.2367     0.1217  ...      73.8432   
4  1326.5200     1.5334      100.0   100.3967     0.1235  ...          NaN   

   feature_583  feature_584  feature_585

In [47]:
merged_data = merged_df.drop(columns = "date_time", axis=1)
merged_data

Unnamed: 0,classifications,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_582,feature_583,feature_584,feature_585,feature_586,feature_587,feature_588,feature_589,feature_590,feature_591
0,-1,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.3630,,,,,
1,-1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045,
2,1,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,
3,-1,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,
4,-1,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,-1,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,...,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720,
1563,-1,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,...,,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720,
1564,-1,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,,...,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231,
1565,-1,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,...,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941,


## 2-3 Data Visualisation

In [30]:
print(merged_data.isna().sum())
print("\n===============================================================\n")
print(merged_data.describe().describe())


classifications       0
feature_1             6
feature_2             7
feature_3            14
feature_4            14
                   ... 
feature_587           1
feature_588           1
feature_589           1
feature_590           1
feature_591        1567
Length: 592, dtype: int64


       classifications    feature_1    feature_2    feature_3    feature_4  \
count         8.000000     8.000000     8.000000     8.000000     8.000000   
mean        195.453844  2472.883085  2078.990367  1844.894221  1383.052958   
std         554.188900  1109.001613   891.252031   770.362628  1096.086719   
min          -1.000000    73.621787    80.407705    29.513152     0.000000   
25%          -1.000000  2447.680000  2009.062500  1933.745000   921.829760   
50%          -0.933631  2988.875000  2474.048865  2190.795859  1340.795514   
75%           0.623508  3025.002172  2509.259375  2205.313900  1562.555875   
max        1567.000000  3356.350000  2846.440000  2315.266700  3715.041700   

     

### 2-3 -1 Target value

In [35]:
# import plotly.offline as plof
# import plotly.graph_objs as go

result_fail = merged_data[merged_data['classifications'] != -1]
result_pass = merged_data[merged_data['classifications'] == -1]

print(f"The number of failure: {len(result_fail)} \nThe number of pass:    {len(result_pass)}")

trace = go.Bar(x = (len(result_fail), len(result_pass)), 
               y = ['Fail ', 'Pass '], 
               orientation = 'h', 
               opacity = 0.8, 
               marker=dict(
                       color=[ 'grey', 'green'],
                       line=dict(color='#000000',width=1.5)
                       )
               )

layout = dict(title =  'Count of target variable for faile and pass data')
                    
fig = dict(data = [trace], layout=layout)
plotly.offline.iplot(fig)


#Then, we will plot the percentage
trace = go.Pie(labels = ['Pass ', 'Fail '], 
               values = labels_df['classifications'].value_counts(), 
               textfont=dict(size=30), opacity = 0.8,
               marker=dict(colors=['green', 'grey'], 
                           line=dict(color='#000000', 
                                     width=1.5)
                           )
               )


layout = dict(title =  'Count of target variable for faile and pass data')
           
fig = dict(data = [trace], layout=layout)
plof.offline.iplot(fig)

The number of failure: 104 
The number of pass:    1463


### 2-3 -2 Other Variables / Features 

In [57]:
# missing values

merged_noNull = merged_data.dropna(axis=1)

print(merged_data.shape)
print(merged_noNull.shape) 


(1567, 592)
(1567, 53)


In [60]:
merged_noNull.head(2)

Unnamed: 0,classifications,feature_21,feature_87,feature_88,feature_89,feature_114,feature_115,feature_116,feature_117,feature_118,...,feature_527,feature_528,feature_571,feature_572,feature_573,feature_574,feature_575,feature_576,feature_577,feature_578
0,-1,1.4026,2.3895,0.969,1747.6049,0.946,0.0,748.6115,0.9908,58.4306,...,0.5064,6.6926,533.85,2.1113,8.95,0.3157,3.0624,0.1026,1.6765,14.9509
1,-1,1.3825,2.3754,0.9894,1931.6464,0.9425,0.0,731.2517,0.9902,58.668,...,0.8832,8.837,535.0164,2.4335,5.92,0.2653,2.0111,0.0772,1.1065,10.9003


In [68]:
col_name = merged_noNull.columns.tolist()
calssifications = col_name.pop(0)
col_name

['feature_21',
 'feature_87',
 'feature_88',
 'feature_89',
 'feature_114',
 'feature_115',
 'feature_116',
 'feature_117',
 'feature_118',
 'feature_120',
 'feature_121',
 'feature_157',
 'feature_222',
 'feature_223',
 'feature_224',
 'feature_249',
 'feature_250',
 'feature_251',
 'feature_252',
 'feature_253',
 'feature_255',
 'feature_256',
 'feature_292',
 'feature_360',
 'feature_361',
 'feature_362',
 'feature_387',
 'feature_388',
 'feature_389',
 'feature_390',
 'feature_391',
 'feature_393',
 'feature_394',
 'feature_430',
 'feature_494',
 'feature_495',
 'feature_496',
 'feature_521',
 'feature_522',
 'feature_523',
 'feature_524',
 'feature_525',
 'feature_527',
 'feature_528',
 'feature_571',
 'feature_572',
 'feature_573',
 'feature_574',
 'feature_575',
 'feature_576',
 'feature_577',
 'feature_578']

In [78]:
# import plotly.figure_factory as ff
# import plotly.offline as plof

def plot_distribution(data_select, size_bin) :  
    tmp1 = result_fail[data_select]
    tmp2 = result_pass[data_select]
    hist_data = [tmp1, tmp2]
    
    group_labels = ['fail', 'pass']
    colors = ['#228C22', '#0000FF']

    fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = size_bin, curve_type='kde')
    
    fig['layout'].update(title = data_select, xaxis_title='Observed Value', yaxis_title="frequency",)

    plof.iplot(fig, filename = 'Density plot for faile and pass')

In [79]:
plot_distribution('feature_21', 1)

In [70]:
# for col in col_name :
#   plot_distribution(col, .5)

#### 2.2 -3 SECOM.data / features data

In [None]:
features_df = secom_features_df.copy()
features_df["Missing Value"] = features_df.isna().sum()
features_df["Missing Value"]

# 3.0 DATA PREPARATION

#### 3.1 Merge the Features and Labels data frames

#### 3.2 splitting training and test dataset

In [None]:
# # separating the merged df into pass and fail    #### preparing for constrained data splitting 

# failed_chip = merged_df[merged_df["classifications"] == 1]
# passed_chip = merged_df[merged_df["classifications"] == -1]

# print(f"passed chips are: {len(passed_chip)} \nfailed chips are: {len(failed_chip)}")


In [None]:
# # splitting the failed into 25% and 75% 

# train_data_failed, test_data_failed = train_test_split(failed_chip, test_size=0.2, random_state=42)

# print(f"failed chips for training are: {len(train_data_failed)} \nfailed chips for testing are:  {len(test_data_failed)}")
# print(f"the percentage of the testing is {round( len(test_data_failed) / len(train_data_failed ), 4) * 100} %")

In [None]:
# splitting the passed into 25% and 75% 

train_data_passed, test_data_passed = train_test_split(passed_chip, test_size=0.2, random_state=42)

print(f"passed chips for training are: {len(train_data_passed)} \npassed chips for testing are:  {len(test_data_passed)}")
print(f"the percentage of the testing is {round( len(test_data_passed) / len(train_data_passed ), 3) * 100} %")

In [None]:
# combining the train_failed and train_passed
training_df = pd.concat([train_data_failed, train_data_passed])
### counting the num of rows to check if it's properly combined 
print(len(training_df))
print(f"check: train_failed {len(train_data_failed)} + train_passed {len(train_data_passed)} = {len(train_data_failed) + len(train_data_passed)}")

print("---------------------------------------------------------------")

# combining the testing_failed and testing_passed  
testing_df = pd.concat([test_data_failed, test_data_passed])
### counting the num of rows to check if it's properly combined 
print(len(testing_df))
print(f"check: train_failed {len(test_data_failed)} + train_passed {len(test_data_passed)} = {len(test_data_failed) + len(test_data_passed)}")



In [None]:
missing_list = secom_features_df.isnull().sum()
missing_list = missing_list.sort_values(ascending=False)
missing_count = missing_list.value_counts(dropna=False)
print(f"there are {len(missing_count)} columns that contain missing value")
plt.hist(missing_count, bins=30)
plt.title('Histogram of missing value by column')
plt.xlabel('number of missing rows')
plt.ylabel('count')
plt.show()

In [None]:
missing_df = pd.DataFrame({"counts": missing_count[:-1]})
# missing_df["num_missing_rows"] = missing_df.index
# missing_df = missing_df[["num_missing_rows", "counts"]].reset_index()
# missing_df.drop(columns=["index"])
# missing_df = missing_df[["num_missing_rows", "counts"]]
missing_df

In [None]:
missing_list.value_counts()

# Histogram 

In [None]:
import matplotlib.pyplot as plt
# Calculate volatilities (standard deviations) for each column
volatilities = secom_features_df.std()

# Plot histogram
plt.figure(figsize=(8, 6))
plt.hist(volatilities, bins=100, color='skyblue', edgecolor='black')
plt.xlabel('Standard Deviation of Features')
plt.ylabel('Frequency')
plt.title('Histogram of Volatilities')
plt.grid(False)
plt.show()