In [24]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from pathlib import Path
import re
import json
from tqdm import tqdm

import tensorflow as tf
from keras.models import Sequential
from keras.layers import (
    LSTM,
    Bidirectional,
    Conv1D,
    Dense,
    Flatten,
    Input,
    MaxPooling1D,
    TimeDistributed,
    Dropout
)
from keras.initializers import GlorotUniform
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [2]:
def extract_class(filename):
    match = re.search(r'benchy_\d+_(.*?)\.parquet\.gzip', filename)
    if match:
        return match.group(1)
    return None

In [3]:
def feature_extraction(dataset,label):
    p_data = np.empty((0,9))

    for sample in dataset:
        pca = PCA(n_components=1)
        pca.fit(sample)
        data = np.hstack([np.mean(sample,axis=0), np.std(sample,axis=0), pca.components_[0]])
        p_data = np.vstack((p_data,data))

    label_column = np.ones((p_data.shape[0], 1))*label
    p_data = np.hstack((p_data,label_column))
    return p_data

In [4]:
def window_using_rolling(df,window=44,step=22,columns=["accel_x","accel_y","accel_z"]):
    rolling = df[columns].rolling(window=window,step=step)
    return np.array(list(rolling)[2:])

In [5]:
sample_rate = 3200
print_resolution = 0.1
print_speed = 60
minimum_print_steps = 2
min_print_window = minimum_print_steps*print_resolution/print_speed
samples_per_window = min_print_window*sample_rate
conversion_constant = (2.0*16.0)/8192

print(f"Minimum print window: {min_print_window}")
print(f"Samples per window: {samples_per_window}")

Minimum print window: 0.0033333333333333335
Samples per window: 10.666666666666668


### Processing Data

In [10]:
folder = "TCC_data/processed_dataset/labeled_dataset_downsample_200_filter"

In [11]:
df_all = pd.DataFrame()
list_classes = set()
for file in Path(folder).glob("benchy_*"):
    list_classes.add(extract_class(file.name))
print(list_classes)

# class_index = dict()
# for i,class_name in enumerate(list_classes):
#     class_index[class_name] = int(i)

class_index = {'healthy':0, 'temp_220':1, 'temp_230':2, 'nozzle_03':3,'nozzle_02':4, 'loose_head':5}
print(class_index)

{'nozzle_03', 'temp_230', 'temp_220', 'loose_head', 'nozzle_02', 'healthy'}
{'healthy': 0, 'temp_220': 1, 'temp_230': 2, 'nozzle_03': 3, 'nozzle_02': 4, 'loose_head': 5}


Raw data

In [12]:
# full_data = np.empty((0,10))

# for file in list(Path("downsampled_200").glob("benchy_*")):
#     file_class = extract_class(file.name)
#     df = pd.read_parquet(file)
#     data = window_using_rolling(df)
#     processed_data = process_data(data,class_index[file_class])
#     full_data = np.vstack((full_data,processed_data))

Normalized data

In [48]:
accel_x = np.array([])
accel_y = np.array([])
accel_z = np.array([])
counter = dict()

for file in tqdm(list(Path(folder).glob("benchy_*"))):
    file_class = extract_class(file.name)
    df = pd.read_parquet(file)
    accel_x = np.concatenate([accel_x,df["accel_x"]*conversion_constant])
    accel_y = np.concatenate([accel_y,df["accel_y"]*conversion_constant])
    accel_z = np.concatenate([accel_z,df["accel_z"]*conversion_constant])

mean_accel_x = np.mean(accel_x)
mean_accel_y = np.mean(accel_y)
mean_accel_z = np.mean(accel_z)

std_accel_x = np.std(accel_x)
std_accel_y = np.std(accel_y)
std_accel_z = np.std(accel_z)

100%|██████████| 28/28 [00:03<00:00,  9.00it/s]


In [49]:
timesteps = 10
x_data = np.empty((0,timesteps,9))
y_data = np.empty((0,1))

counter = dict()
for file in tqdm(list(Path(folder).glob("benchy_*"))):
    df = pd.read_parquet(file)
    df["accel_x"] = (df["accel_x"]*conversion_constant-mean_accel_x)/std_accel_x
    df["accel_y"] = (df["accel_y"]*conversion_constant-mean_accel_y)/std_accel_y
    df["accel_z"] = (df["accel_z"]*conversion_constant-mean_accel_z)/std_accel_z

    data = window_using_rolling(df)
    
    unique = df["class"].unique()
    if len(unique) != 1:
        print(f"failure: {file} - {unique}")
        break

    processed_data = feature_extraction(data, class_index[unique[0]])

    processed_data2 = processed_data[processed_data.shape[0]%timesteps:,:9]
    reshaped_data = processed_data2.reshape((
        processed_data2.shape[0] // timesteps,
        timesteps,
        *processed_data2.shape[1:],
    ))

    x_data = np.vstack((x_data,reshaped_data))
    y_data = np.vstack((y_data,np.ones((reshaped_data.shape[0],1))*class_index[unique[0]]))

    d = dict()
    d.update(df["class"].value_counts())
    for key,value in d.items():
        counter[key] = value + counter.get(key,0)

100%|██████████| 28/28 [10:15<00:00, 21.99s/it]


In [50]:
np.save(Path(folder)/"processed_labeled_filtered_normalized_data",x_data,fix_imports=False)
np.save(Path(folder)/"processed_labeled_filtered_normalized_result",y_data,fix_imports=False)

### Training

In [None]:
with open(Path(folder)/'processed_labeled_filtered_normalized_data.npy', 'rb') as f:
    x_data = np.load(f)

with open(Path(folder)/'processed_labeled_filtered_normalized_result.npy', 'rb') as f:
    y_data = np.load(f)

In [52]:
y_data_categorical = to_categorical(y_data,num_classes=np.unique(y_data).shape[0])

x_train, x_test, y_train, y_test = train_test_split(x_data,y_data_categorical,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y_data_categorical)

#### Sequential Model

In [53]:
# Define the model
model = Sequential()

# Add LSTM layer with 100 units
model.add(LSTM(100, input_shape=(10, 9), return_sequences=False, kernel_initializer=GlorotUniform(), activation='relu'))

# Add Dropout layer
model.add(Dropout(0.2))  # Adjust dropout rate as needed

# Add output layer
model.add(Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["categorical_accuracy","accuracy"])

# Summary of the model
model.summary()

I0000 00:00:1723880191.856012    3160 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-08-17 04:36:31.965187: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  super().__init__(**kwargs)


In [54]:
checkpoint_callback = ModelCheckpoint(
    filepath='best_vanilla_model.keras',       # Filepath to save the model
    monitor='val_accuracy',            # Monitor validation accuracy
    save_best_only=True,               # Save only the best model
    save_weights_only=False,           # Save the full model (architecture + weights)
    mode='max',                        # Save when the monitored quantity is maximized
    verbose=1                          # Verbosity mode
)

In [55]:
# Train
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[checkpoint_callback]
    )

Epoch 1/100
[1m2239/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.5944 - categorical_accuracy: 0.5944 - loss: 0.9427
Epoch 1: val_accuracy improved from -inf to 0.66897, saving model to best_vanilla_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.5945 - categorical_accuracy: 0.5945 - loss: 0.9425 - val_accuracy: 0.6690 - val_categorical_accuracy: 0.6690 - val_loss: 0.7285
Epoch 2/100
[1m2235/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.6773 - categorical_accuracy: 0.6773 - loss: 0.7215
Epoch 2: val_accuracy improved from 0.66897 to 0.68208, saving model to best_vanilla_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.6773 - categorical_accuracy: 0.6773 - loss: 0.7215 - val_accuracy: 0.6821 - val_categorical_accuracy: 0.6821 - val_loss: 0.6936
Epoch 3/100
[1m2237/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[3

In [56]:
with open('training_history_vanilla.json', 'w') as f:
    json.dump(history.history, f)

In [57]:
y_pred = model.predict(x_train)

# Convert predictions to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
y_train_classes = np.argmax(y_train, axis=1)

# Calculate precision and recall for multi-class classification
precision = precision_score(y_train_classes, y_pred_classes, average='macro')
recall = recall_score(y_train_classes, y_pred_classes, average='macro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")

[1m   1/2802[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8:01[0m 172ms/step

2024-08-17 05:08:56.200100: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 32271120 exceeds 10% of free system memory.


[1m2802/2802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step
Precision: 0.7800748957326529
Recall: 0.7801189658416857


#### Stacked Model

In [61]:
# Define the model
model_stacked = Sequential()

# Add LSTM layer with 100 units
model_stacked.add(LSTM(50, input_shape=(10, 9), return_sequences=True, kernel_initializer=GlorotUniform(), activation='relu'))
model_stacked.add(LSTM(50, kernel_initializer=GlorotUniform(), activation='relu'))

# Add Dropout layer
model_stacked.add(Dropout(0.2))  # Adjust dropout rate as needed

# Add output layer
model_stacked.add(Dense(6, activation='softmax'))

# Compile the model_stacked
model_stacked.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["categorical_accuracy","accuracy"])

# Summary of the model_stacked
model_stacked.summary()

  super().__init__(**kwargs)


In [62]:
checkpoint_callback = ModelCheckpoint(
    filepath='best_2_stack_model.keras',# Filepath to save the model
    monitor='val_accuracy',            # Monitor validation accuracy
    save_best_only=True,               # Save only the best model
    save_weights_only=False,           # Save the full model (architecture + weights)
    mode='max',                        # Save when the monitored quantity is maximized
    verbose=1                          # Verbosity mode
)

In [63]:
# Train
history2 = model_stacked.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[checkpoint_callback]
)

Epoch 1/100
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5664 - categorical_accuracy: 0.5664 - loss: 0.9970
Epoch 1: val_accuracy improved from -inf to 0.67907, saving model to best_2_stack_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.5664 - categorical_accuracy: 0.5664 - loss: 0.9970 - val_accuracy: 0.6791 - val_categorical_accuracy: 0.6791 - val_loss: 0.7138
Epoch 2/100
[1m2241/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.6718 - categorical_accuracy: 0.6718 - loss: 0.7373
Epoch 2: val_accuracy improved from 0.67907 to 0.68108, saving model to best_2_stack_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8ms/step - accuracy: 0.6718 - categorical_accuracy: 0.6718 - loss: 0.7373 - val_accuracy: 0.6811 - val_categorical_accuracy: 0.6811 - val_loss: 0.7021
Epoch 3/100
[1m2241/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[3

In [64]:
with open('training_history_stacked.json', 'w') as f:
    json.dump(history2.history, f)

In [65]:
y_pred = model_stacked.predict(x_train)

# Convert predictions to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
y_train_classes = np.argmax(y_train, axis=1)

# Calculate precision and recall for multi-class classification
precision = precision_score(y_train_classes, y_pred_classes, average='macro')
recall = recall_score(y_train_classes, y_pred_classes, average='macro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")

2024-08-17 05:42:52.727664: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 32271120 exceeds 10% of free system memory.


[1m2802/2802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
Precision: 0.7633565676573264
Recall: 0.7404623358398638


#### Bidirectional Model

In [69]:
# Define the model
model_bi = Sequential()

# Add LSTM layer with 100 units
model_bi.add(Bidirectional(LSTM(units=100,kernel_initializer=GlorotUniform(), activation='relu'), input_shape=(10, 9)))
# model_bi.add(Bidirectional(LSTM(100, input_shape=(10, 9), return_sequences=False, kernel_initializer=GlorotUniform(), activation='relu')))

# Add Dropout layer
model_bi.add(Dropout(0.2))  # Adjust dropout rate as needed

# Add output layer
model_bi.add(Dense(6, activation='softmax'))

# Compile the model_bi
model_bi.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["categorical_accuracy","accuracy"])

# Summary of the model_bi
model_bi.summary()

In [70]:
checkpoint_callback = ModelCheckpoint(
    filepath='best_bidirectional_model.keras',# Filepath to save the model
    monitor='val_accuracy',            # Monitor validation accuracy
    save_best_only=True,               # Save only the best model
    save_weights_only=False,           # Save the full model (architecture + weights)
    mode='max',                        # Save when the monitored quantity is maximized
    verbose=1                          # Verbosity mode
)

In [71]:
# Train
history3 = model_bi.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[checkpoint_callback]
)

Epoch 1/100
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6011 - categorical_accuracy: 0.6011 - loss: 0.9100
Epoch 1: val_accuracy improved from -inf to 0.68788, saving model to best_bidirectional_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.6011 - categorical_accuracy: 0.6011 - loss: 0.9100 - val_accuracy: 0.6879 - val_categorical_accuracy: 0.6879 - val_loss: 0.6880
Epoch 2/100
[1m2236/2242[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.6802 - categorical_accuracy: 0.6802 - loss: 0.7063
Epoch 2: val_accuracy improved from 0.68788 to 0.69809, saving model to best_bidirectional_model.keras
[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.6802 - categorical_accuracy: 0.6802 - loss: 0.7063 - val_accuracy: 0.6981 - val_categorical_accuracy: 0.6981 - val_loss: 0.6765
Epoch 3/100
[1m2240/2242[0m [32m━━━━━━━━━━━━━━

In [72]:
with open('training_history_bi.json', 'w') as f:
    json.dump(history3.history, f)

In [73]:
y_pred = model_bi.predict(x_train)

# Convert predictions to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
y_train_classes = np.argmax(y_train, axis=1)

# Calculate precision and recall for multi-class classification
precision = precision_score(y_train_classes, y_pred_classes, average='macro')
recall = recall_score(y_train_classes, y_pred_classes, average='macro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")

2024-08-17 06:19:16.926449: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 32271120 exceeds 10% of free system memory.


[1m2802/2802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step
Precision: 0.8140110131722299
Recall: 0.807850527970715
