# Prepration of datasets

In [50]:
import os
import numpy as np
import pandas as pd
import scipy.io

## Load datafiles
- First, load the data files from `../dataset/train-dataset/` folder.
- Then select the necessary columns and convert them to pandas dataframe.

In [51]:
dataset_path = "../dataset/train-dataset/" # Define the path to the training set
file_paths = []

for file in os.listdir(dataset_path):
    # Process each file in the dataset
    file_path = os.path.join(dataset_path, file)
    data = scipy.io.loadmat(f'{file_path}')
    file_paths.append(file_path)
    
    print(list(data.keys()))

['__header__', '__version__', '__globals__', 'X097_DE_time', 'X097_FE_time', 'X097RPM']
['__header__', '__version__', '__globals__', 'X105_DE_time', 'X105_FE_time', 'X105_BA_time', 'X105RPM']
['__header__', '__version__', '__globals__', 'X118_DE_time', 'X118_FE_time', 'X118_BA_time', 'X118RPM']
['__header__', '__version__', '__globals__', 'X130_DE_time', 'X130_FE_time', 'X130_BA_time', 'X130RPM']
['__header__', '__version__', '__globals__', 'X169_DE_time', 'X169_FE_time', 'X169_BA_time', 'X169RPM']
['__header__', '__version__', '__globals__', 'X185_DE_time', 'X185_FE_time', 'X185_BA_time', 'X185RPM']
['__header__', '__version__', '__globals__', 'X197_DE_time', 'X197_FE_time', 'X197_BA_time', 'X197RPM']
['__header__', '__version__', '__globals__', 'X209_DE_time', 'X209_FE_time', 'X209_BA_time', 'X209RPM']
['__header__', '__version__', '__globals__', 'X222_DE_time', 'X222_FE_time', 'X222_BA_time', 'X222RPM']
['__header__', '__version__', '__globals__', 'X234_DE_time', 'X234_FE_time', 'X2

In [52]:
data_columns = ['X097_DE_time', 'X105_DE_time', 'X118_DE_time', 'X130_DE_time', 'X169_DE_time',
                'X185_DE_time', 'X197_DE_time', 'X209_DE_time', 'X222_DE_time', 'X234_DE_time']
                
columns_name = ['de_normal','de_7_inner','de_7_ball','de_7_outer','de_14_inner','de_14_ball','de_14_outer','de_21_inner','de_21_ball','de_21_outer']

In [53]:
save_file_path = 'files'

In [54]:
data_12k_10c = pd.DataFrame()   
for index in range(10):
    data = scipy.io.loadmat(file_paths[index])
    dataList = data[data_columns[index]].reshape(-1)
    data_12k_10c[columns_name[index]] = dataList[:119808] 

data_12k_10c.set_index('de_normal',inplace=True)

if not os.path.exists(save_file_path):
    os.makedirs(save_file_path)

data_12k_10c.to_csv(save_file_path + '/' + 'data_12k_10c.csv')
data_12k_10c

Unnamed: 0_level_0,de_7_inner,de_7_ball,de_7_outer,de_14_inner,de_14_ball,de_14_outer,de_21_inner,de_21_ball,de_21_outer
de_normal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.053197,-0.083004,-0.002761,0.008528,-0.223836,-0.467813,0.002274,1.189431,-0.007959,0.104365
0.088662,-0.195734,-0.096324,0.423550,-0.209541,0.179004,-0.104948,-0.177866,0.025340,0.017462
0.099718,0.233419,0.113705,0.012995,0.345337,0.481295,0.082010,-0.774816,0.000162,0.116547
0.058621,0.103958,0.257297,-0.265175,0.158862,-0.158212,0.094027,0.501518,0.092913,0.371164
-0.004590,-0.181115,-0.058314,0.237155,-0.206617,-0.326819,-0.160081,0.993697,-0.007797,0.356951
...,...,...,...,...,...,...,...,...,...
0.002712,0.046781,-0.239592,-0.122232,0.046781,-0.002274,-0.056920,-0.296850,0.018842,-0.024365
0.016689,-0.071309,0.013482,-1.248720,-0.090314,-0.042883,0.048475,0.060913,-0.075370,-0.044264
0.008762,-0.175917,0.217663,0.587609,0.026639,-0.027289,0.107100,0.148628,0.032162,0.042639
-0.034004,-0.290759,-0.016081,0.352890,0.097136,-0.035736,-0.033332,-0.139288,0.152364,-0.059289


## Generate Train, Validation and Test Datasets, Including Labels

In [55]:
from joblib import dump, load

In [56]:
def split_data(data, time_steps, label, overlap_ratio=0.5):
    """
        :Params
        data: 1-D array or list
        time_steps: int, the number of time steps in each sample
        label: int, the label of the sample
        overlap_ratio: float, the overlap ratio of the samples
    """
    
    stride = int(time_steps * (1 - overlap_ratio))  
    samples = (len(data) - time_steps) // stride + 1  
    clasiffy_dataFrame = pd.DataFrame(columns=[x for x in range(time_steps + 1)])  
    data_list = []
    for i in range(samples):
        start_idx = i * stride
        end_idx = start_idx + time_steps
        temp_data = data[start_idx:end_idx].tolist()
        temp_data.append(label) 
        data_list.append(temp_data)
    clasiffy_dataFrame = pd.DataFrame(data_list, columns=clasiffy_dataFrame.columns)
    return clasiffy_dataFrame

In [57]:
def normalize(x):
    """
        :params
            x: input data
    """
    y = (x - np.min(x)) / (np.max(x) - np.min(x))
    return y

In [58]:
def generate_datasets(data_file,
                      split_ratio=[0.7, 0.2, 0.1],
                      step=512,
                      ratio=0.5):
    """
        :param
        data_file: csv file
        split_ratio: train, val, test
        step: time step
        ratio: overlap ratio
        
        :return
        train_data
        val_data
        test_data
    """
    origin_data = pd.read_csv(data_file)

    dataframes = []
    label = 0
    
    for col_name, col_data in origin_data.items():
        # 1. Normalization (optional)
        # col_data = normalize(col_data)
        splited_data = split_data(col_data,
                                  step,
                                  label,
                                  ratio)
        
        label += 1
        dataframes.append(splited_data)
        
    all_data = pd.concat(dataframes, ignore_index=True)

    # randomize the sequence of the data
    all_data = all_data.sample(frac=1).reset_index(drop=True)
    
    # split the data
    train_data = all_data.iloc[:int(len(all_data) * split_ratio[0])]
    val_data = all_data.iloc[int(len(all_data) * split_ratio[0]):int(len(all_data) * (split_ratio[0] + split_ratio[1]))]
    test_data = all_data.iloc[int(len(all_data) * (split_ratio[0] + split_ratio[1])):]
    
    return train_data, val_data, test_data, all_data

In [59]:
split_rate = [0.7, 0.2, 0.1]
raw_data_file_csv = 'files/data_12k_10c.csv'

train_set, val_set, test_set, all_data = generate_datasets(raw_data_file_csv, 
                                                               split_rate)

dump(train_set, save_file_path + '/' + 'train_set') 
dump(val_set, save_file_path + '/' + 'val_set') 
dump(test_set, save_file_path + '/' + 'test_set') 

['files/test_set']

In [60]:
all_data.shape

(4670, 513)

In [61]:
import numpy as np
from joblib import dump, load

# 制作数据集和标签
def make_data_labels(dataframe):
    '''
        参数 dataframe: 数据框
        返回 x_data: 数据集     numpy.ndarray
            y_label: 对应标签值  numpy.ndarray
    '''
    # 信号值
    x_data = dataframe.iloc[:, 0:-1].values
    # 标签值
    y_label = dataframe.iloc[:, -1].values
    x_data = x_data.astype(np.float32)
    y_label = y_label.astype(np.int64)  # 指定了这些张量的数据类型为64位整数，通常用于分类任务的类别标签
    return x_data, y_label

# Load datas and generate labels
train_set = load(save_file_path + '/train_set') 
val_set = load(save_file_path + '/val_set') 
test_set = load(save_file_path + '/test_set') 

# 制作标签
train_xdata, train_ylabel = make_data_labels(train_set)
val_xdata, val_ylabel = make_data_labels(val_set)
test_xdata, test_ylabel = make_data_labels(test_set)

# 保存数据
dump(train_xdata, save_file_path + '/trainX_512_10c')
dump(val_xdata, save_file_path + '/valX_512_10c')
dump(test_xdata, save_file_path + '/testX_512_10c')
dump(train_ylabel, save_file_path + '/trainY_512_10c')
dump(val_ylabel, save_file_path + '/valY_512_10c')
dump(test_ylabel, save_file_path + '/testY_512_10c')


['files/testY_512_10c']

In [62]:
train_xdata.shape

(3269, 512)

In [63]:
assert train_xdata.shape[0] == train_ylabel.shape[0]
assert val_xdata.shape[0] == val_ylabel.shape[0]
assert test_xdata.shape[0] == test_ylabel.shape[0]

print('Compatibility check is complete!')

Compatibility check is complete!
