In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler

2024-02-28 15:30:28.498262: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 15:30:28.498319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 15:30:28.545017: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-28 15:30:28.638142: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tf.keras.backend.clear_session()

In [3]:
NUM_CLASESS = 6
NUM_FILES = 100
BATCH_SIZE = 2

In [4]:
def convert_parquet_to_npy(input_folder, output_folder):
    npy_output_folder = os.path.join(output_folder, 'npy_data')
    
    # Ensure the output directory exists
    os.makedirs(npy_output_folder, exist_ok=True)
    
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith('.parquet'):
                parquet_path = os.path.join(root, file)
                df = pd.read_parquet(parquet_path)
                eeg_data = df.to_numpy()
                relative_path = os.path.relpath(parquet_path, input_folder)
                
                # Create the corresponding directory structure in the npy_data folder
                output_subfolder = os.path.join(npy_output_folder, os.path.dirname(relative_path))
                os.makedirs(output_subfolder, exist_ok=True)
                np.save(os.path.join(output_subfolder, file.replace('.parquet', '.npy')), eeg_data)

def read_data(data_folder, num_files=None):
    """
    Read EEG data from .npy files in the specified data folder.

    Parameters:
    - data_folder (str): Path to the main data folder containing 'train' and 'test' subfolders.
    - num_files (int or None): Number of files to read from each subfolder. If None, all files will be read.

    Returns:
    - train (array[Tuple[np.ndarray, np.ndarray]]): List of tuples containing train EEG data.
    - test (array[Tuple[np.ndarray, np.ndarray]]): List of tuples containing test EEG data.
    - train_labels (pd.DataFrame): DataFrame containing train labels.
    - test_labels (pd.DataFrame): DataFrame containing test labels.
    """
    train_eeg_folder = os.path.join(data_folder, 'train_eegs')
    test_eeg_folder = os.path.join(data_folder, 'test_eegs')

    def read_npy_folder(folder_path, n_files=None):
        arrays = []
        files_to_read = os.listdir(folder_path)[:n_files] if n_files else os.listdir(folder_path)
        for file in files_to_read:
            if file.endswith('.npy'):
                file_path = os.path.join(folder_path, file)
                array = np.load(file_path)
                arrays.append(array)
        print(f"Read {len(arrays)} files from {folder_path}.")
        return arrays

    # Read EEG data
    train_eeg = read_npy_folder(train_eeg_folder, num_files)
    test_eeg = read_npy_folder(test_eeg_folder)

    train_labels = pd.read_csv(os.path.join(data_folder, 'train.csv'), nrows=num_files)
    test_labels = pd.read_csv(os.path.join(data_folder, 'test.csv'))

    return train_eeg, test_eeg, train_labels, test_labels

In [5]:
def preprocess_eeg(eeg_data):
    # Normalize each EEG signal independently
    normalized_data = []
    for signal in eeg_data:
        scaler = StandardScaler()
        normalized_signal = scaler.fit_transform(signal)
        normalized_data.append(normalized_signal)
    
    # Find the maximum length of EEG signals
    max_length = max(len(signal) for signal in normalized_data)
    
    # Pad each EEG signal to the maximum length with zeros
    padded_data = []
    for signal in normalized_data:
        padded_signal = np.pad(signal, ((0, max_length - len(signal)), (0, 0)), 'constant')
        padded_data.append(padded_signal)
    
    # Convert padded_data to numpy array
    padded_data = np.array(padded_data)
    
    return padded_data

            
def find_max_length(data):
    max_length = 0
    for seq in data:
        max_length = max(max_length, len(seq))
    return max_length

def create_model(input_shape_eeg, num_classes=6):
    """Create a model that can be trained for variable duration
    EEG data.

    Args:
        input_shape_eeg : shape of one EEG sample
        input_shape_spectrogram : shape of one Spectrogram sample
        num_classes : 6 for seizure, lpd, gpd, lrda, grda, other

    Returns:
        keras model
    """
    
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(units=64, input_shape=input_shape_eeg[1:], return_sequences=True),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.LSTM(units=64),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

def lr_schedule(epoch, lr):
    if epoch % 10 == 0 and epoch > 0:
        return lr * 0.9
    return lr

In [6]:
#convert_parquet_to_npy('data', 'data/npy_data')

In [7]:
train,_test,train_labels,_test_labels = read_data('data/npy_data/npy_data',num_files=NUM_FILES)


Read 100 files from data/npy_data/npy_data/train_eegs.
Read 1 files from data/npy_data/npy_data/test_eegs.


In [8]:
#max_length = find_max_length(train)
labels = pd.read_csv('train.csv', nrows=NUM_FILES)
_train = np.array(train)            # fix needed
Xtrain = preprocess_eeg(_train)
# train_data_generator = data_generator(train, labels, max_padding=max_length)
# X_train_batches = []
# y_train_batches = []
# for _ in range(BATCH_SIZE):
#     X_batch, y_batch = next(train_data_generator)
#     X_train_batches.append(X_batch)
#     y_train_batches.append(y_batch)

# X_trainC = np.concatenate(X_train_batches)
# y_trainC = np.concatenate(y_train_batches)

X_train, X_val, y_train, y_val = train_test_split(
    Xtrain, train_labels,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (100,) + inhomogeneous part.

In [None]:

y_train = np.array(y_train)
y_val = np.array(y_val)

In [None]:
label_encoder = LabelEncoder()
encoded_labels_train = label_encoder.fit_transform(y_train[:, 8])
encoded_labels_val = label_encoder.fit_transform(y_val[:, 8])

y_train = tf.keras.utils.to_categorical(encoded_labels_train, num_classes=NUM_CLASESS).astype('float32')
y_val = tf.keras.utils.to_categorical(encoded_labels_val, num_classes=NUM_CLASESS).astype('float32')

In [None]:
input_shape_eeg = (BATCH_SIZE,94000, 20)
out_shape = NUM_CLASESS

In [None]:
model = create_model(input_shape_eeg, out_shape)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
model.compile(optimizer=optimizer, loss = tf.keras.losses.KLDivergence(), metrics=['accuracy'])


In [None]:
model.fit(X_train, y_train, epochs=100, batch_size=BATCH_SIZE)

Epoch 1/100


2024-02-17 13:13:50.814098: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-02-17 13:13:54.845531: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fb2db0ace10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-17 13:13:54.845566: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-02-17 13:13:54.857089: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1708155834.947353   23950 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
 3/40 [=>............................] - ETA: 3:30 - loss: nan - accuracy: 0.0000e+00

KeyboardInterrupt: 