# Data processing

In [1]:
import scipy.io as sio 
import h5py, os,mne
import numpy as np
import os


## Read EEG

In [2]:
# Directory containing the per-subject raw EEG .mat files (machine-specific absolute path)
path = '/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/RawEEGdata'
# Subject file used as the worked example by the cells below
file_path = os.path.join(path, 'S2.mat')
# NOTE(review): `data` is not referenced again in the visible notebook; the
# extraction helper defined below loads the file itself.
data = sio.loadmat(file_path)

In [3]:
# Sanity check: report whether the example subject file is present on disk.
# NOTE(review): the previous cell already calls loadmat on this path, so a
# missing file would fail there before this message is ever printed.
if os.path.isfile(file_path):
    print("File exists and is ready to be opened.")
else:
    print("File not found. Please check the path:", file_path)

File exists and is ready to be opened.


In [4]:
# Read a subject .mat file with SciPy and collect its per-trial fields
def extract_data_with_scipy(mat_file):
    """
    Collect EEG, stimuli and attended-ear entries from a .mat file.

    The file is read with ``scipy.io.loadmat`` and its ``trials`` variable is
    walked column by column (``trials[0, i]``).  For every trial the fields
    that are present are appended to the matching output list:

    * ``RawData`` -> ``trial['RawData'][0, 0]['EegData']`` (still wrapped)
    * ``stimuli`` -> ``trial['stimuli']`` (still wrapped)
    * ``attended_ear`` -> ``trial['attended_ear'].item()``

    Parameters:
        mat_file: path (or anything ``loadmat`` accepts) of the subject file.

    Returns:
        (eeg_data_list, stimuli_list, attended_ear_list).  All three are empty
        when the file has no ``trials`` variable.  A list only gains an entry
        for trials that carry the corresponding field.
    """
    contents = sio.loadmat(mat_file)

    eeg_data_list, stimuli_list, attended_ear_list = [], [], []

    trials = contents.get('trials', None)
    if trials is None:
        # Nothing to walk: hand back the empty containers
        return eeg_data_list, stimuli_list, attended_ear_list

    n_trials = trials.shape[1]
    for trial in (trials[0, k] for k in range(n_trials)):
        available = trial.dtype.names

        if 'RawData' in available:
            # EEG samples live one level down, inside the RawData struct
            eeg_data_list.append(trial['RawData'][0, 0]['EegData'])

        if 'stimuli' in available:
            stimuli_list.append(trial['stimuli'])

        if 'attended_ear' in available:
            attended_ear_list.append(trial['attended_ear'].item())

    return eeg_data_list, stimuli_list, attended_ear_list


In [5]:
# Example usage: pull the raw (still nested) per-trial lists out of the example subject file
eeg_data_list, stimuli_list, attended_ear_list = extract_data_with_scipy(file_path)

# Optional inspection of the extracted lists (left disabled)
# print("EEG Data: ", eeg_data_list)
# print("Stimuli: ", stimuli_list)
# print("Attended Ear: ", attended_ear_list)

In [6]:
import numpy as np

# Unwrap the nested stimuli entries into plain nested lists of filenames
def extract_filenames(stimuli):
    """
    Flatten the per-trial stimuli wrappers into nested Python lists.

    For every entry of ``stimuli`` the element at ``[0, 0]`` is iterated row
    by row, and the first element of each item in a row is kept.

    Parameters:
        stimuli: iterable of per-trial wrappers, each indexable with ``[0, 0]``.

    Returns:
        A list with one element per trial; each element is a list of rows and
        each row is a list of the extracted first elements.
    """
    return [
        [[entry[0] for entry in row] for row in wrapper[0, 0]]
        for wrapper in stimuli
    ]

# Extract and simplify filenames for the example subject.
# NOTE(review): process_data below performs the same extraction internally;
# this variable only appears again in a disabled print.
simplified_filenames = extract_filenames(stimuli_list)

# Strip the MATLAB-style nesting from the three raw per-trial lists
def process_data(stimuli_list, attended_ear_list, eeg_data_list):
    """
    Turn the raw lists from ``extract_data_with_scipy`` into plain containers.

    Parameters:
        stimuli_list: per-trial stimuli wrappers, passed to ``extract_filenames``.
        attended_ear_list: per-trial entries whose first element is the label
            (the original comment describes it as 'R' or 'L').
        eeg_data_list: per-trial wrappers whose payload sits at ``[0][0]``.

    Returns:
        (stimuli filenames as a NumPy array,
         attended-ear labels as a NumPy array,
         list with one unwrapped EEG payload per trial)
    """
    # Filenames first, as nested Python lists
    nested_names = extract_filenames(stimuli_list)

    # Keep only the leading element of every attended-ear entry
    ear_codes = [entry[0] for entry in attended_ear_list]

    # Peel the two wrapper levels off every EEG entry
    eeg_trials = []
    for wrapper in eeg_data_list:
        eeg_trials.append(wrapper[0][0])

    # Only the labels and filenames become arrays; EEG stays a list
    ear_array = np.array(ear_codes)
    name_array = np.array(nested_names)

    return name_array, ear_array, eeg_trials

# Flatten the raw lists of the example subject into filenames, ear labels and EEG trials
stimuli_filenames, attended_ear_simplified, eeg_data_list_simplified = process_data(stimuli_list, attended_ear_list, eeg_data_list)

# Optional inspection of the results (left disabled)
# print('Simplified Filenames:', simplified_filenames)

# for files in stimuli_filenames:
#     print(files)
# print('Attended Ear Simplified:', attended_ear_simplified)
# print('Number of Attended Ear:', len(attended_ear_simplified))
# print('EEG Data List Simplified:', eeg_data_list_simplified)
# print('Number of EEG Data:', len(eeg_data_list_simplified))


In [7]:
# Run extraction and flattening back to back for `file_path`.
# NOTE(review): this repeats the two calls already made in the earlier cells
# with the same inputs.
eeg_data_list, stimuli_list, attended_ear_list = extract_data_with_scipy(file_path)
stimuli_filenames, attended_ear_simplified, eeg_data_list_simplified = process_data(stimuli_list, attended_ear_list, eeg_data_list)

In [8]:
# Show the attended-ear label extracted for every trial
print(attended_ear_simplified)

['L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R' 'L' 'R'
 'L' 'R']


## Read audio

In [9]:
# Size of the first axis of the filename array (one entry per trial)
file_names = np.array(stimuli_filenames)
print(file_names.shape[0])

20


In [10]:
import scipy.io.wavfile as wav

# Filename array built from `stimuli_filenames`: the first two axes are walked
# below and `.item()` unwraps each single-element entry into a plain string
file_names = np.array(stimuli_filenames)

# Directory where the audio files are stored
audio_dir = "/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/stimuli"

# Initialize a list to store the data of each audio file
audio_data_list = []

# Read each audio file.  A dedicated variable holds the path so that the global
# `file_path` (the subject .mat file used by the earlier cells) is not overwritten.
for i in range(file_names.shape[0]):
    for j in range(file_names.shape[1]):
        audio_path = os.path.join(audio_dir, file_names[i, j].item())  # Full path to the audio file
        sample_rate, audio_data = wav.read(audio_path)  # Read the audio file
        audio_data_list.append(audio_data)

# Find the length of the longest audio signal
max_length = max(len(audio) for audio in audio_data_list)

# Zero-initialised array with one row per (first axis, second axis) filename;
# shorter signals stay zero-padded at the end
stacked_audio_data = np.zeros((file_names.shape[0], file_names.shape[1], max_length))

# Fill the stacked array with each audio signal
for idx, audio in enumerate(audio_data_list):
    i, j = divmod(idx, file_names.shape[1])  # Recover the 2-D position from the flat read order
    stacked_audio_data[i, j, :len(audio)] = audio  # Assign the audio data

# Print the sample rate (of the last file read) and the shape of the stacked audio data
print(sample_rate)
print(stacked_audio_data.shape)


44100
(20, 2, 20811935)


In [11]:
import scipy.signal as signal
import librosa

# Gammatone filtering helper
def apply_gammatone(audio_data, sample_rate, center_freq=440):
    """
    Filter ``audio_data`` with an FIR gammatone filter from ``scipy.signal``.

    Parameters:
        audio_data: samples to filter (passed straight to ``signal.lfilter``).
        sample_rate: sampling frequency handed to the filter design as ``fs``.
        center_freq: centre frequency of the gammatone filter (default 440).

    Returns:
        The output of ``signal.lfilter`` for the designed coefficients.

    Note:
        The number of taps is fixed at ``int(16000*0.015)`` and does not
        follow ``sample_rate``.
    """
    # Fixed tap count: 15 ms worth of samples at 16 kHz
    n_taps = int(16000*0.015)

    # Design the FIR gammatone filter at the requested centre frequency
    numerator, denominator = signal.gammatone(
        center_freq, ftype='fir', fs=sample_rate, numtaps=n_taps
    )

    # Run the signal through the filter and hand the result back directly
    return signal.lfilter(numerator, denominator, audio_data)

def audio_stimuli(file_names,
                  audio_dir="/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/stimuli",
                  target_sr=16000):
    """
    Load, resample and gammatone-filter every stimulus named in ``file_names``.

    Each filename has ``"hrtf"`` replaced by ``"dry"``, is loaded from
    ``audio_dir`` at its native rate, resampled to ``target_sr`` and passed
    through ``apply_gammatone`` (centre frequency 440).

    Parameters:
        file_names: array whose first two axes are walked; ``file_names[i, j].item()``
            must yield the filename string.
        audio_dir: directory holding the audio files (defaults to the path
            previously hard-coded in this function).
        target_sr: sampling rate the audio is resampled to (default 16000).

    Returns:
        Array of shape ``(file_names.shape[0], file_names.shape[1], max_length)``
        where ``max_length`` is the longest filtered signal; shorter signals
        are zero-padded at the end.

    Raises:
        ValueError: if ``file_names`` contains no entries.
    """
    # Filtered signal of every file, in row-major (i, j) order
    audio_data_list = []

    for i in range(file_names.shape[0]):
        for j in range(file_names.shape[1]):
            # Get the file name and replace 'hrtf' with 'dry'
            updated_file_name = file_names[i, j].item().replace("hrtf", "dry")
            file_path = os.path.join(audio_dir, updated_file_name)

            # Load at the file's native rate, then bring it to the target rate
            audio_data, native_sr = librosa.load(file_path, sr=None)
            audio_data = librosa.resample(audio_data, orig_sr=native_sr, target_sr=target_sr)

            # The filter must be designed for the rate the samples are at NOW
            # (target_sr); using the pre-resampling rate would shift the
            # effective centre frequency by target_sr / native_sr.
            filtered_audio = apply_gammatone(audio_data, target_sr, center_freq=440)
            audio_data_list.append(filtered_audio)

    if not audio_data_list:
        raise ValueError("file_names contains no audio files to load")

    # Find the length of the longest audio
    max_length = max(len(audio) for audio in audio_data_list)

    # Zero-padded container for all signals
    stacked_audio_data = np.zeros((file_names.shape[0], file_names.shape[1], max_length))

    # Recover the (i, j) position of each signal from its flat read order
    for idx, audio in enumerate(audio_data_list):
        i, j = divmod(idx, file_names.shape[1])
        stacked_audio_data[i, j, :len(audio)] = audio

    return stacked_audio_data


# Slice and stack data

In [12]:
def slice_and_stack_data(eeg_data_list, attended_ear, audio_data, time_range, experiment_index,
                         eeg_sample_rate=128, audio_sample_rate=16000):
    """
    Cut one experiment into aligned, non-overlapping EEG / audio windows.

    Parameters:
        eeg_data_list: sequence of per-experiment EEG arrays, each indexed as
            (samples, channels).
        attended_ear: sequence of 'L' / 'R' labels, one per experiment.
        audio_data: array indexed as (experiment, track, samples).
        time_range: window length in seconds.
        experiment_index: which experiment to slice.
        eeg_sample_rate: EEG sampling rate in Hz (default 128).
        audio_sample_rate: audio sampling rate in Hz (default 16000).

    Returns:
        (eeg, audio, labels) where, with ``n`` complete windows available in
        both signals,
        * eeg    has shape (n, channels, int(eeg_sample_rate * time_range))
        * audio  has shape (n, tracks, int(audio_sample_rate * time_range))
        * labels has shape (n, 1) holding 0 for 'L' and 1 for 'R'
        When ``n`` is 0 the arrays are empty but keep these trailing
        dimensions, so they can still be stacked with other experiments.

    Raises:
        ValueError: if ``time_range`` is too small to cover a single sample.
        KeyError: if the selected label is neither 'L' nor 'R'.
    """
    # Encode left and right ears as 0 and 1; only the selected experiment's
    # label is decoded, so unrelated entries cannot trigger a KeyError
    ear_label_map = {'R': 1, 'L': 0}
    ear_label = ear_label_map[attended_ear[experiment_index]]

    # Select the EEG and audio data for the specified experiment
    eeg_data = np.asarray(eeg_data_list[experiment_index])
    audio = np.asarray(audio_data[experiment_index])

    # Number of samples per window for EEG and audio
    eeg_sample_count = int(eeg_sample_rate * time_range)
    audio_sample_count = int(audio_sample_rate * time_range)
    if eeg_sample_count <= 0 or audio_sample_count <= 0:
        raise ValueError(
            "time_range={!r} is too short: a window must contain at least one "
            "EEG and one audio sample".format(time_range)
        )

    # Only windows that are complete in BOTH signals are kept
    total_slices = min(eeg_data.shape[0] // eeg_sample_count,
                       audio.shape[1] // audio_sample_count)

    # EEG: (samples, channels) -> (n, window, channels) -> (n, channels, window)
    eeg_used = eeg_data[:total_slices * eeg_sample_count]
    stacked_eeg_data = (
        eeg_used
        .reshape(total_slices, eeg_sample_count, eeg_data.shape[1])
        .transpose(0, 2, 1)
        .copy()
    )

    # Audio: (tracks, samples) -> (tracks, n, window) -> (n, tracks, window)
    audio_used = audio[:, :total_slices * audio_sample_count]
    stacked_audio_data = (
        audio_used
        .reshape(audio.shape[0], total_slices, audio_sample_count)
        .transpose(1, 0, 2)
        .copy()
    )

    # Every window of one experiment shares the same attended-ear label
    stacked_ear_labels = np.full((total_slices, 1), ear_label)

    return stacked_eeg_data, stacked_audio_data, stacked_ear_labels


# Integrated functions

In [13]:
def get_sub_experiment_data(root, sub_file, time_range, experiment_index):
    """
    Load one subject file and return the windowed data of a single experiment.

    Parameters:
        root: directory containing the subject files.
        sub_file: file name of the subject inside ``root``.
        time_range: window length in seconds (e.g. 0.5, 1, 1.5, 2).
        experiment_index: index of the experiment to slice.

    Returns:
        The (EEG, audio, ear label) arrays produced by ``slice_and_stack_data``
        for that experiment.  Their shapes are printed before returning.
    """
    mat_path = os.path.join(root, sub_file)
    print(mat_path)
    print('Processing subject {} experiment {}'.format(sub_file, experiment_index))

    # Read the nested per-trial lists, then strip the nesting
    raw_eeg, raw_stimuli, raw_ears = extract_data_with_scipy(mat_path)
    filenames, ears, eeg_trials = process_data(raw_stimuli, raw_ears, raw_eeg)

    # Load the stimuli and cut the chosen experiment into windows
    audio = audio_stimuli(filenames)
    windows = slice_and_stack_data(eeg_trials, ears, audio, time_range, experiment_index)

    # Report the shape of each returned array
    captions = ("EEG_data shape:", "Audio_data shape:", "Ear_labels shape:")
    for caption, array in zip(captions, windows):
        print(caption, array.shape)

    return windows


In [14]:
def get_sub_all_data(root, sub_file, time_range):
    """
    Load one subject file and return the windowed data of all its experiments.

    Parameters:
        root: directory containing the subject files.
        sub_file: file name of the subject inside ``root``.
        time_range: window length in seconds (e.g. 0.5, 1, 1.5, 2).

    Returns:
        (EEG, audio, ear label) arrays obtained by slicing every experiment
        with ``slice_and_stack_data`` and stacking the results along the first
        axis.  Their shapes are printed before returning.
    """
    file_path = os.path.join(root, sub_file)
    print(file_path)
    print('Processing subject {}'.format(sub_file))
    eeg_data_list, stimuli_list, attended_ear_list = extract_data_with_scipy(file_path)  # Read formatted data
    stimuli_filenames, attended_ear, eeg_data_list = process_data(stimuli_list, attended_ear_list, eeg_data_list)  # Remove formatting
    audio_data = audio_stimuli(stimuli_filenames)

    all_stacked_eeg_data = []
    all_stacked_audio_data = []
    all_stacked_ear_labels = []

    # Walk every experiment actually present in the file rather than a fixed
    # count, so files with a different number of trials are handled too
    for experiment_index in range(len(eeg_data_list)):
        stacked_eeg_data, stacked_audio_data, stacked_ear_labels = slice_and_stack_data(eeg_data_list, attended_ear, audio_data, time_range, experiment_index)
        all_stacked_eeg_data.append(stacked_eeg_data)
        all_stacked_audio_data.append(stacked_audio_data)
        all_stacked_ear_labels.append(stacked_ear_labels)

    # Vertically stack all the experiment data (windows of all experiments end
    # up along the first axis)
    all_stacked_eeg_data = np.vstack(all_stacked_eeg_data)
    all_stacked_audio_data = np.vstack(all_stacked_audio_data)
    all_stacked_ear_labels = np.vstack(all_stacked_ear_labels)

    # Print the shapes of the output; the first dimension is the total number
    # of windows and depends on the recording lengths
    print("EEG_data shape:", all_stacked_eeg_data.shape)
    print("Audio_data shape:", all_stacked_audio_data.shape)
    print("Ear_labels shape:", all_stacked_ear_labels.shape)

    return all_stacked_eeg_data, all_stacked_audio_data, all_stacked_ear_labels


In [15]:
# Example: build 1-second windows for every experiment of subject file S3.mat
root = '/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/RawEEGdata'
sub_file = 'S3.mat'
time_range = 1  # window length in seconds
all_stacked_eeg_data, all_stacked_audio_data, all_stacked_ear_labels = get_sub_all_data(root, sub_file, time_range)

/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/RawEEGdata/S3.mat
Processing subject S3.mat
EEG_data shape: (4624, 64, 128)
Audio_data shape: (4624, 2, 16000)
Ear_labels shape: (4624, 1)


## Process the dataset for each subject

In [17]:
def stack_and_save_data(root, sub_files, time_range, save_path):
    """
    Build the windowed data of several subjects and save it as one .npz archive.

    Parameters:
        root: directory containing the subject files.
        sub_files: iterable of subject file names inside ``root``.
        time_range: window length in seconds, forwarded to ``get_sub_all_data``.
        save_path: where to write the archive.  A value ending in ``.npz`` is
            used as the output file; any other non-empty value is treated as a
            directory and the archive is written to ``<save_path>/all_data.npz``.
            An empty / ``None`` value falls back to ``all_data.npz`` in the
            current working directory.

    The archive holds three arrays under the keys ``eeg``, ``audio`` and
    ``ear``, each with one leading entry per subject.
    """
    all_eeg_data = []
    all_audio_data = []
    all_ear_labels = []

    for sub_file in sub_files:
        eeg_data, audio_data, ear_labels = get_sub_all_data(root, sub_file, time_range)
        all_eeg_data.append(eeg_data)
        all_audio_data.append(audio_data)
        all_ear_labels.append(ear_labels)

    # NOTE(review): np.array needs every subject to contribute arrays of the
    # same shape to build a regular (subjects, ...) array - confirm this holds
    # for all subject files.
    all_eeg_data = np.array(all_eeg_data)
    all_audio_data = np.array(all_audio_data)
    all_ear_labels = np.array(all_ear_labels)

    # Resolve the output location from save_path instead of ignoring it
    if not save_path:
        out_dir, out_file = '', 'all_data.npz'
    else:
        save_path = os.fspath(save_path)
        if save_path.endswith('.npz'):
            out_dir, out_file = os.path.dirname(save_path), save_path
        else:
            out_dir, out_file = save_path, os.path.join(save_path, 'all_data.npz')

    # Create the target directory on demand so the save cannot fail on a
    # missing folder after all the processing has been done
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save as a .npz file
    np.savez(out_file, eeg=all_eeg_data, audio=all_audio_data, ear=all_ear_labels)

In [19]:
# Directory containing the per-subject raw EEG .mat files
root = '/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/RawEEGdata'
# Subject files S1.mat through S16.mat
sub_files = [f'S{i}.mat' for i in range(1, 17)]
time_range = 1  # Unit: seconds
save_path = '/home/test/Desktop/python/EEG_data/AAD_dataset/AAD_KUL/Dataset_single_drywav'  # Replace with actual save path

# Process every listed subject and write the combined archive
stack_and_save_data(root, sub_files, time_range, save_path)
