# 2.0 DATA UNDERSTANDING STAGE

### 2.1 libraries

##### 2.1 -1 Install required libraries 

In [1]:
# !pip install numpy
# !pip install pandas
# !pip install requests
# !pip install matplotlib

##### 2.1 -2 Import Required libraries

In [None]:
import numpy as np
import pandas as pd
import requests
import zipfile
from io import BytesIO
import matplotlib as plt
from sklearn.model_selection import train_test_split


## 2.2 Importing data

#### 2.2 -1 retrieving data

In [22]:
# URL of the zipped folder
url = "https://archive.ics.uci.edu/static/public/179/secom.zip"

# download the zipped folder using request library
zip_file = requests.get(url)

# opening the zipped folder
secom_files =  zipfile.ZipFile(BytesIO(secom_zip_file.content), 'r')

# printing the file names inside the zip
secom_files_names =  (zipfile.ZipFile(BytesIO(secom_zip_file.content))).namelist()
print(secom_files_names)

['secom.data', 'secom.names', 'secom_labels.data']


In [50]:
# reading the relevant files 

f_file = secom_files.open('secom.data')
l_file = secom_files.open('secom_labels.data')

#### 2.2 -2 labels data

In [51]:
#Create column names and pandas dataframes for labels data

l_column_names=['classifications', 'date_time'] #### create column labels

labels_df = pd.read_csv(l_file, sep=r'\s+', header=None, names= l_column_names)

print(f"Dimensions of Labels data: ", labels_df.shape)

print(labels_df.head(5))

Dimensions of Labels data:  (1567, 2)
   classifications            date_time
0               -1  19/07/2008 11:55:00
1               -1  19/07/2008 12:32:00
2                1  19/07/2008 13:17:00
3               -1  19/07/2008 14:43:00
4               -1  19/07/2008 15:22:00


##### 2.2 -2-1 timestamps & checking its chronological order (from oldest to newest)


In [52]:
print(labels_df.tail(5))

      classifications            date_time
1562               -1  16/10/2008 15:13:00
1563               -1  16/10/2008 20:49:00
1564               -1  17/10/2008 05:26:00
1565               -1  17/10/2008 06:01:00
1566               -1  17/10/2008 06:07:00


In [53]:
labels_df['date_time'] = pd.to_datetime(labels_df['date_time'], format='%d/%m/%Y %H:%M:%S')

is_ascending = labels_df['date_time'].is_monotonic_increasing

if is_ascending:
    print("The timestamps column is in ascending (oldest to newest) order.")
else:
    print("The timestamps column is not in ascending (newest to oldest) order.")

The timestamps column is in ascending (oldest to newest) order.


##### 2.2 -2-2 Date columns separation

In [54]:
# # separating date columns 
# t_df = labels_df.copy()
# t_df["time"] = t_df["date_time"].dt.time
# t_df["date"] = pd.to_datetime(t_df["date_time"].dt.date)
# t_df["year"] = t_df["date"].dt.year
# t_df["month"] = t_df["date"].dt.month
# t_df["day"] = t_df["date"].dt.day

# t_df

### checking the test result

In [10]:
result_fail = labels_df["classifications"].value_counts()[1]
result_pass = labels_df["classifications"].value_counts()[-1]

print(f"The number of failure: {result_fail} \nThe number of pass:    {result_pass}")

The number of failure: 104 
The number of pass:    1463


In [11]:
# # splitting the original data between test and training data

# train_data, test_data = train_test_split(labels_df, test_size=0.2, random_state=42)
# display(len(train_data))
# display(len(test_data))

1253

314

In [12]:
# test_percentage = len(test_data)/len(train_data)
# test_percentage

0.25059856344772546

#### 2.2 -3 SECOM.data / features data

In [14]:
f_column_names = [f"feature_{i}" for i in range(1, 592)]

secom_features_df = pd.read_csv(f_file, sep=r'\s+', header=None, names= f_column_names)
print(f"Dimensions of Features' data: ", secom_features_df.shape)
print("---------------------------------------------------------")
print(secom_features_df.head())

Dimensions of Features' data:  (1567, 591)
---------------------------------------------------------
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0    3030.93    2564.00  2187.7333  1411.1265     1.3602      100.0   
1    3095.78    2465.14  2230.4222  1463.6606     0.8294      100.0   
2    2932.61    2559.94  2186.4111  1698.0172     1.5102      100.0   
3    2988.72    2479.90  2199.0333   909.7926     1.3204      100.0   
4    3032.24    2502.87  2233.3667  1326.5200     1.5334      100.0   

   feature_7  feature_8  feature_9  feature_10  ...  feature_582  feature_583  \
0    97.6133     0.1242     1.5005      0.0162  ...          NaN       0.5005   
1   102.3433     0.1247     1.4966     -0.0005  ...     208.2045       0.5019   
2    95.4878     0.1241     1.4436      0.0041  ...      82.8602       0.4958   
3   104.2367     0.1217     1.4882     -0.0124  ...      73.8432       0.4990   
4   100.3967     0.1235     1.5031     -0.0031  ...          NaN   

# 3.0 DATA PREPARATION

#### 3.1 Merge the Features and Labels data frames

In [15]:
secom_merged_df = pd.merge(labels_df, secom_features_df, left_index=True, right_index=True)

print(f"Dimensions of merged data: ", secom_merged_df.shape)

print(secom_merged_df.head())

Dimensions of merged data:  (1567, 593)
   classifications           date_time  feature_1  feature_2  feature_3  \
0               -1 2008-07-19 11:55:00    3030.93    2564.00  2187.7333   
1               -1 2008-07-19 12:32:00    3095.78    2465.14  2230.4222   
2                1 2008-07-19 13:17:00    2932.61    2559.94  2186.4111   
3               -1 2008-07-19 14:43:00    2988.72    2479.90  2199.0333   
4               -1 2008-07-19 15:22:00    3032.24    2502.87  2233.3667   

   feature_4  feature_5  feature_6  feature_7  feature_8  ...  feature_582  \
0  1411.1265     1.3602      100.0    97.6133     0.1242  ...          NaN   
1  1463.6606     0.8294      100.0   102.3433     0.1247  ...     208.2045   
2  1698.0172     1.5102      100.0    95.4878     0.1241  ...      82.8602   
3   909.7926     1.3204      100.0   104.2367     0.1217  ...      73.8432   
4  1326.5200     1.5334      100.0   100.3967     0.1235  ...          NaN   

   feature_583  feature_584  feature_585

In [16]:
# merging 

merged_df = pd.merge(labels_df, secom_features_df, left_index=True, right_index=True)

print(f"Dimensions of merged data: ", merged_df.shape)

print(merged_df.head())


Dimensions of merged data:  (1567, 593)
   classifications           date_time  feature_1  feature_2  feature_3  \
0               -1 2008-07-19 11:55:00    3030.93    2564.00  2187.7333   
1               -1 2008-07-19 12:32:00    3095.78    2465.14  2230.4222   
2                1 2008-07-19 13:17:00    2932.61    2559.94  2186.4111   
3               -1 2008-07-19 14:43:00    2988.72    2479.90  2199.0333   
4               -1 2008-07-19 15:22:00    3032.24    2502.87  2233.3667   

   feature_4  feature_5  feature_6  feature_7  feature_8  ...  feature_582  \
0  1411.1265     1.3602      100.0    97.6133     0.1242  ...          NaN   
1  1463.6606     0.8294      100.0   102.3433     0.1247  ...     208.2045   
2  1698.0172     1.5102      100.0    95.4878     0.1241  ...      82.8602   
3   909.7926     1.3204      100.0   104.2367     0.1217  ...      73.8432   
4  1326.5200     1.5334      100.0   100.3967     0.1235  ...          NaN   

   feature_583  feature_584  feature_585

#### 3.2 splitting training and test dataset

In [17]:
# separating the merged df into pass and fail    #### preparing for constrained data splitting 

failed_chip = merged_df[merged_df["classifications"] == 1]
passed_chip = merged_df[merged_df["classifications"] == -1]

print(f"passed chips are: {len(passed_chip)} \nfailed chips are: {len(failed_chip)}")


passed chips are: 1463 
failed chips are: 104


In [18]:
# splitting the failed into 25% and 75% 

train_data_failed, test_data_failed = train_test_split(failed_chip, test_size=0.2, random_state=42)

print(f"failed chips for training are: {len(train_data_failed)} \nfailed chips for testing are:  {len(test_data_failed)}")
print(f"the percentage of the testing is {round( len(test_data_failed) / len(train_data_failed ), 4) * 100} %")

failed chips for training are: 83 
failed chips for testing are:  21
the percentage of the testing is 25.3 %


In [19]:
# splitting the passed into 25% and 75% 

train_data_passed, test_data_passed = train_test_split(passed_chip, test_size=0.2, random_state=42)

print(f"passed chips for training are: {len(train_data_passed)} \npassed chips for testing are:  {len(test_data_passed)}")
print(f"the percentage of the testing is {round( len(test_data_passed) / len(train_data_passed ), 3) * 100} %")

passed chips for training are: 1170 
passed chips for testing are:  293
the percentage of the testing is 25.0 %


In [20]:
# combining the train_failed and train_passed
training_df = pd.concat([train_data_failed, train_data_passed])
### counting the num of rows to check if it's properly combined 
print(len(training_df))
print(f"check: train_failed {len(train_data_failed)} + train_passed {len(train_data_passed)} = {len(train_data_failed) + len(train_data_passed)}")

print("---------------------------------------------------------------")

# combining the testing_failed and testing_passed  
testing_df = pd.concat([test_data_failed, test_data_passed])
### counting the num of rows to check if it's properly combined 
print(len(testing_df))
print(f"check: train_failed {len(test_data_failed)} + train_passed {len(test_data_passed)} = {len(test_data_failed) + len(test_data_passed)}")



1253
check: train_failed 83 + train_passed 1170 = 1253
---------------------------------------------------------------
314
check: train_failed 21 + train_passed 293 = 314
