### This notebook aims to process the other files in a similar way to the Train_0D and 0E data

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [32]:
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

### As the files are large, each file below are loaded individually to clean the dataset before restarting the kernel to run again (to clear up the memory)

In [33]:
#train_1D = pd.read_csv('train_1D_index.csv')
#train_2D = pd.read_csv('train_2D_index.csv')
#train_3D = pd.read_csv('train_3D_index.csv')
#train_4D = pd.read_csv('train_4D_index.csv')
#eval_1E = pd.read_csv('eval_1E_index.csv')
#eval_2E = pd.read_csv('eval_2E_index.csv')
#eval_3E = pd.read_csv('eval_3E_index.csv')
eval_4E = pd.read_csv('eval_4E_index.csv')

In [34]:
def data_cleaning(data):
    print("Original data size:", len(data))
    data_cut = data.iloc[53247:]
    print("Data size after initial cut:", len(data_cut))
    
    if data_cut.empty:
        return pd.DataFrame()  # Return an empty DataFrame immediately if data_cut is empty
    
    data_cut['Time_Index_Label'] =  data_cut.groupby('V_in').cumcount() // 4096 + 1
    data_cut['Time_Index_Label'] =  data_cut.apply(lambda x: f"{x['V_in']}_{int(x['Time_Index_Label'])}S", axis=1)
    
    seconds_ranges = list(range(5, 16)) + list(range(25, 36))
    Time_index_list = data_cut['Time_Index_Label'].tolist()
    suffix = tuple(f"_{i}S" for i in seconds_ranges)
    
    filtered_index = [item for item in Time_index_list if item.endswith(suffix)]
    data_clean = data_cut[data_cut['Time_Index_Label'].isin(filtered_index)]
    
    print("Data size after filtering:", len(data_clean))
    return data_clean


In [35]:
eval_4E_cut = data_cleaning(eval_4E)

Original data size: 6914047
Data size after initial cut: 6860800


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cut['Time_Index_Label'] =  data_cut.groupby('V_in').cumcount() // 4096 + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cut['Time_Index_Label'] =  data_cut.apply(lambda x: f"{x['V_in']}_{int(x['Time_Index_Label'])}S", axis=1)


Data size after filtering: 3784704


In [36]:
eval_4E_cut.shape

(3784704, 7)

In [37]:
eval_4E_cut.head()

Unnamed: 0,row_index,V_in,Measured_RPM,Vibration_1,Vibration_2,Vibration_3,Time_Index_Label
69631,69632,4.0,1028.0046,0.00222,0.008391,0.006852,4.0_5S
69632,69633,4.0,1028.0046,0.001427,0.003248,0.008587,4.0_5S
69633,69634,4.0,1028.0046,0.004427,0.003867,0.007498,4.0_5S
69634,69635,4.0,1028.0046,0.002619,0.012602,0.005075,4.0_5S
69635,69636,4.0,1028.0046,0.004669,0.006075,0.005367,4.0_5S


In [38]:
eval_4E_cut.tail()

Unnamed: 0,row_index,V_in,Measured_RPM,Vibration_1,Vibration_2,Vibration_3,Time_Index_Label
6914042,6914043,4.0,1080.4582,0.002939,-0.003955,0.002704,4.0_35S
6914043,6914044,4.0,1080.4582,-0.000345,0.002913,0.000757,4.0_35S
6914044,6914045,4.0,1080.4582,-0.003408,0.002537,-0.001725,4.0_35S
6914045,6914046,4.0,1080.4582,-0.004315,-0.002156,-0.000253,4.0_35S
6914046,6914047,4.0,1080.4582,-0.006704,-0.000309,0.002162,4.0_35S


In [39]:
eval_4E_cut.groupby("Time_Index_Label").size()

Time_Index_Label
4.0_10S    4096
4.0_11S    4096
4.0_12S    4096
4.0_13S    4096
4.0_14S    4096
4.0_15S    4096
4.0_25S    4096
4.0_26S    4096
4.0_27S    4096
4.0_28S    4096
4.0_29S    4096
4.0_30S    4096
4.0_31S    4096
4.0_32S    4096
4.0_33S    4096
4.0_34S    4096
4.0_35S    4096
4.0_5S     4096
4.0_6S     4096
4.0_7S     4096
4.0_8S     4096
4.0_9S     4096
4.1_10S    4096
4.1_11S    4096
4.1_12S    4096
4.1_13S    4096
4.1_14S    4096
4.1_15S    4096
4.1_25S    4096
4.1_26S    4096
4.1_27S    4096
4.1_28S    4096
4.1_29S    4096
4.1_30S    4096
4.1_31S    4096
4.1_32S    4096
4.1_33S    4096
4.1_34S    4096
4.1_35S    4096
4.1_5S     4096
4.1_6S     4096
4.1_7S     4096
4.1_8S     4096
4.1_9S     4096
4.2_10S    4096
4.2_11S    4096
4.2_12S    4096
4.2_13S    4096
4.2_14S    4096
4.2_15S    4096
4.2_25S    4096
4.2_26S    4096
4.2_27S    4096
4.2_28S    4096
4.2_29S    4096
4.2_30S    4096
4.2_31S    4096
4.2_32S    4096
4.2_33S    4096
4.2_34S    4096
4.2_35S    4096
4.2_5S 

In [40]:
eval_4E_cut.to_csv('eval_4E_cleaned.csv')