<a href="https://colab.research.google.com/github/Lou1108/DeepLearning/blob/main/Assignment2/meg_prediction_sofia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive so that the dataset folders under /content/drive can be read.
# NOTE: google.colab is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



What we still need to do:
*   Do the preprocessing separately and save the preprocessed data.
*   Try different models to see whether one works better with the data.
*   Run a grid search over the number of neurons in the layers.



### Assignment 2

# Imports and Variables


In [78]:
import os
import glob
import h5py
import numpy as np
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.signal import butter, filtfilt, decimate

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Input,
    Conv1D,
    Conv2D,
    DepthwiseConv2D,
    SeparableConv2D,
    MaxPooling1D,
    AveragePooling2D,
    LSTM,
    Flatten,
    Dense,
    Dropout,
    BatchNormalization,
    Activation,
)


In [87]:
# Folders holding the .h5 recordings (Google Drive must be mounted first).
TRAIN_PATH = "/content/drive/MyDrive/Deep Learning/Final Project data/Intra/train"
TEST_PATH = "/content/drive/MyDrive/Deep Learning/Final Project data/Intra/test"


# data specific
NUM_CHANNELS = 248  # size of the channel axis of each recording
# Task name -> integer class id used as the training label.
LABEL_MAP = {'rest':0, 'task_motor':1, 'task_story_math':2, 'task_working_memory':3}
# Derived from LABEL_MAP so the two can never disagree.
NUM_CLASSES = len(LABEL_MAP)
orig_fs=2034  # presumably the recording sampling rate in Hz -- TODO confirm
target_fs=250  # desired sampling rate after downsampling
# decimate() needs an integer factor: 2034 // 250 == 8, so the effective
# rate after downsampling is 2034 / 8 ~= 254 Hz rather than exactly 250 Hz.
DOWNSAMPLE_FACTOR = orig_fs // target_fs

# Model specific
# NOTE(review): the CNN training cell passes epochs=10 directly and does not
# read NUM_EPOCHS.
NUM_EPOCHS = 1
NORMALIZATION_METHOD = "minmax"  # Choose: "minmax", "zscore", or "time"

In [64]:
def load_data(file_paths):
    """Read every recording in `file_paths` and derive its label from the file name.

    The task is inferred from keywords in the file's base name. Files whose
    name matches none of the known keywords are reported and skipped.

    Args:
        file_paths: iterable of paths to HDF5 files. The first dataset found
            in each file is used as that file's data matrix.

    Returns:
        A `(data, labels)` tuple of numpy arrays: the stacked matrices and the
        matching integer class ids from `LABEL_MAP`, in input order.
    """
    # (keywords, LABEL_MAP key) pairs, checked in order; the first match wins.
    # This handles the different task naming conventions.
    keyword_to_task = (
        (('rest',), 'rest'),
        (('motor',), 'task_motor'),
        (('story', 'math'), 'task_story_math'),
        (('working', 'memory'), 'task_working_memory'),
    )

    data = []
    labels = []
    for file_path in file_paths:
        # Only the base name is inspected, so directory names cannot
        # influence the label.
        filename = os.path.basename(file_path)

        task = next(
            (name for keywords, name in keyword_to_task
             if any(keyword in filename for keyword in keywords)),
            None,
        )
        if task is None:
            # The file name matches no known task: report it and skip the file.
            print(f"Could not determine task for file: {filename}")
            continue

        with h5py.File(file_path, 'r') as f:
            # Take the first key instead of guessing the dataset name.
            # Assumes each file holds exactly one dataset -- TODO confirm.
            dataset_name = list(f.keys())[0]
            matrix = f[dataset_name][()]

        # Append matrix and label together, only after a successful read,
        # so the two lists always stay aligned.
        data.append(matrix)
        labels.append(LABEL_MAP[task])

    # Convert to numpy arrays
    return np.array(data), np.array(labels)

In [67]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# sample order (and therefore every downstream result) reproducible across runs.
train_files = sorted(glob.glob(f"{TRAIN_PATH}/*.h5"))
test_files = sorted(glob.glob(f"{TEST_PATH}/*.h5"))

# Fail early with a clear message (e.g. Drive not mounted or wrong path) instead
# of an obscure shape error further down the notebook.
if not train_files or not test_files:
    raise FileNotFoundError(
        f"No .h5 files found (train: {len(train_files)}, test: {len(test_files)}); "
        f"check TRAIN_PATH and TEST_PATH."
    )

X_train, y_train = load_data(train_files)
X_test, y_test = load_data(test_files)

# Quick sanity check of what was loaded.
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Uniqe labels: {np.unique(y_train)}")
print(f"Number of training samples: {len(X_train)}")

Shape of X_train: (32, 248, 35624)
Shape of y_train: (32,)
Shape of X_test: (8, 248, 35624)
Uniqe labels: [0 1 2 3]
Number of training samples: 32


# Load and Preprocess Data
Apply a low-pass (anti-aliasing) filter and downsample the signals from 2034 Hz towards the 250 Hz target, using an integer downsampling factor.

In [79]:
# Band-pass filter (currently disabled) ---> TODO: check it before enabling
# def bandpass_filter(data, lowcut=1.0, highcut=150.0, fs=2034, order=5):
#     nyq = 0.5 * fs
#     low = lowcut / nyq
#     high = highcut / nyq
#     b, a = butter(order, [low, high], btype='band')
#     return filtfilt(b, a, data, axis=-1)

Normalization functions

In [80]:
def min_max_scale_sample(data):
    """Min-max scale `data` using one minimum/maximum for the whole array.

    The array is flattened into a single column before fitting, so the scaler
    sees one feature: every value is rescaled with the same global min and max
    (MinMaxScaler's default output range), and the original shape is restored.

    NOTE(review): despite the name, the statistics are shared by everything in
    `data`, not computed per sample -- confirm this is intended.
    """
    original_shape = data.shape
    as_single_column = data.reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(as_single_column)
    return scaled_column.reshape(original_shape)

def z_score_normalize(data):
    """Standardize `data` along its last axis (zero mean, unit variance).

    Mean and standard deviation are taken over the last axis only, so each
    leading index is normalized independently. A small epsilon is added to the
    standard deviation so constant rows give zeros instead of dividing by zero.
    """
    eps = 1e-8
    centered = data - data.mean(axis=-1, keepdims=True)
    spread = data.std(axis=-1, keepdims=True) + eps
    return centered / spread

def time_norm(data):
    """Standardize a 3-D array with a StandardScaler fitted on its 2-D view.

    `data` is unpacked as (samples, channels, timesteps) and viewed as a
    (samples * channels, timesteps) matrix. StandardScaler standardizes each
    column, so every time-step index is normalized across all
    (sample, channel) rows. The result is returned in the original 3-D shape.

    NOTE(review): this normalizes across channels/samples per time step, not
    along time per channel -- confirm this is the intended behaviour.
    """
    # Unpacking also enforces that the input is exactly 3-dimensional.
    n_samples, n_channels, n_timesteps = data.shape

    rows_by_time = data.reshape(n_samples * n_channels, n_timesteps)
    standardized = StandardScaler().fit_transform(rows_by_time)

    # Restore the (samples, channels, timesteps) layout
    return standardized.reshape(n_samples, n_channels, n_timesteps)

In [82]:
def normalization(data):
    """Normalize `data` with the method selected by `NORMALIZATION_METHOD`.

    Supported values:
        "minmax" -> min_max_scale_sample
        "zscore" -> z_score_normalize
        "time"   -> time_norm

    Args:
        data: array to normalize; passed unchanged to the selected function.

    Returns:
        The normalized array produced by the selected function.

    Raises:
        ValueError: if `NORMALIZATION_METHOD` is not one of the supported
            values. A misspelled or unsupported setting used to return the
            data un-normalized without any warning.
    """
    if NORMALIZATION_METHOD == "minmax":
        return min_max_scale_sample(data)
    if NORMALIZATION_METHOD == "zscore":
        return z_score_normalize(data)
    if NORMALIZATION_METHOD == "time":
        return time_norm(data)
    raise ValueError(
        f"Unknown NORMALIZATION_METHOD {NORMALIZATION_METHOD!r}; "
        'expected one of "minmax", "zscore", "time".'
    )

In [83]:
# Downsample both splits along the time axis (last axis) by DOWNSAMPLE_FACTOR.
# decimate() applies an anti-aliasing filter (FIR here, zero-phase) before
# keeping every DOWNSAMPLE_FACTOR-th sample.
N_TIMESTEPS = X_train.shape[2]
X_train_ds, X_test_ds = (
    decimate(split, DOWNSAMPLE_FACTOR, axis=-1, ftype='fir', zero_phase=True)
    for split in (X_train, X_test)
)
N_TIMESTEPS_DS = X_train_ds.shape[2]

print(f"Original number of time steps: {N_TIMESTEPS}")
print(f"Downsampled number of time steps: {N_TIMESTEPS_DS}")

Original number of time steps: 35624
Downsampled number of time steps: 4453


In [84]:
# Normalize the downsampled train and test arrays with the configured method.
# Each split is passed to normalization() on its own, so each one is scaled
# with statistics computed from that split only.
X_train_norm, X_test_norm = map(normalization, (X_train_ds, X_test_ds))


Normalizing data...


In [85]:
# Keras sequence layers expect the channel dimension last, so swap the last two
# axes: (samples, channels, timesteps) -> (samples, timesteps, channels).
CHANNELS_LAST_AXES = (0, 2, 1)
X_train_final = X_train_norm.transpose(CHANNELS_LAST_AXES)
X_test_final = X_test_norm.transpose(CHANNELS_LAST_AXES)

print(f"Final shape of training data for the model: {X_train_final.shape}")

Final shape of training data for the model: (32, 4453, 248)


# Define Models

### CNN model

In [86]:
def build_cnn_model(input_shape, num_classes):
    """Build (but do not compile) a 1-D CNN classifier.

    Architecture: three convolutional blocks with 64, 128 and 256 filters,
    each being Conv1D(kernel 10, ReLU, 'same' padding) -> BatchNormalization
    -> MaxPooling1D(4), followed by Flatten, Dense(128, ReLU), Dropout(0.5)
    and a softmax output layer.

    Args:
        input_shape: shape of one input example, without the batch dimension.
        num_classes: number of units in the softmax output layer.

    Returns:
        An uncompiled Keras `Sequential` model.
    """
    layer_stack = [Input(shape=input_shape)]

    # Convolutional blocks: the filter count doubles from block to block while
    # each pooling step shrinks the sequence length by a factor of 4.
    for n_filters in (64, 128, 256):
        layer_stack.extend([
            Conv1D(filters=n_filters, kernel_size=10, activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling1D(pool_size=4),
        ])

    # Classification head: flatten the feature maps and feed dense layers.
    layer_stack.extend([
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    return Sequential(layer_stack)

In [89]:
# One input example is (time steps after downsampling, channels).
INPUT_SHAPE = (N_TIMESTEPS_DS, NUM_CHANNELS)

cnn_model = build_cnn_model(input_shape=INPUT_SHAPE, num_classes=NUM_CLASSES)

# The labels are plain integers (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
cnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn_model.summary()

Training and evaluation of CNN

In [92]:
# Train on the full training set. No validation split is used at the moment
# (validation_split=0.2 would hold out 20% of the training data).
# NOTE(review): epochs is hard-coded to 10 here; the NUM_EPOCHS constant from
# the configuration cell is not used.
history_cnn = cnn_model.fit(
    x=X_train_final,
    y=y_train,
    batch_size=16,  # smaller batch size for better generalization
    epochs=10,
)

# Evaluate on the held-out test set
loss_cnn, acc_cnn = cnn_model.evaluate(X_test_final, y_test, verbose=0)
print(f"accuracy on test set: {acc_cnn * 100:.2f}%")

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.9375 - loss: 5.1715
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9583 - loss: 1.4737
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.8958 - loss: 2.0566
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step - accuracy: 0.9583 - loss: 0.3474
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9792 - loss: 0.4446
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step - accuracy: 0.9167 - loss: 0.5780
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step - accuracy: 1.0000 - loss: 1.4901e-08
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 1.0000 - loss: 7.1569e-04
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m