In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import pywt

In [3]:
# Define sliding window function
def apply_sliding_window_to_dataframe(dataframe, window_size, overlap_size):
    data_shape = len(dataframe)
    windows = []

    start = 0
    end = window_size

    while end <= data_shape:
        window = dataframe[start:end]
        windows.append(window)
        start = end - overlap_size
        end = start + window_size

    return windows

In [4]:
# Define the z-score normalization function
def z_score_normalization(data):
    if data.size == 0:
        # Handle the case when 'data' is empty or has size 0
        print("Warning: Input data is empty.")
        return data

    if data.ndim == 1:
        # If 'data' is 1D, convert it to a 2D array for consistency
        data = data.reshape(-1, 1)

    mean = np.mean(data, axis=0)
    std_dev = np.std(data, axis=0)

    if np.count_nonzero(std_dev) > 0:
        # Only perform z-score normalization when standard deviation is non-zero
        z_scores = (data - mean) / std_dev
    else:
        # Handle the case when standard deviation is zero to avoid division by zero
        print("Warning: Standard deviation is zero; skipping z-score normalization.")
        z_scores = data

    return z_scores

In [5]:
# Files paths
data_dir = '/content/drive/MyDrive/Files and Data/Data and Events files CSV used'
sort_order_file = '/content/drive/MyDrive/Files and Data/gender and channels csv/participant_ratings_old.csv'
file_path = '/content/drive/MyDrive/Files and Data/gender and channels csv/Labels_gender_File.csv'
filepath2=r'/content/drive/MyDrive/Files and Data/gender and channels csv/DEAP_EEG_channels.csv'



In [6]:
data_files = [file for file in os.listdir(data_dir) if file.endswith("FinalExportData.csv")]

In [7]:

# Initialize a list to store the resulting dataframes
result_dataframes = []
# channel names you want to use for feature extraction
ch_names=['Fp1','F7','F3','FC1','FC5','T7','P7','P8','T8','FC6','FC2','F4','F8','Fp2']
nsub=32
ntrails=40      #14 channels selected




In [8]:
# Iterate over the data files(iterate over subjects)
for data_file in data_files:
    # Extract the subject ID from the data file name
    subject_id = data_file.split("FinalExportData.csv")[0]
    print(f'subject no {subject_id}')
    event_file = subject_id + "Events.csv"

    data_path = os.path.join(data_dir, data_file)
    event_path = os.path.join(data_dir, event_file)

    # Check if the event file exists
    if os.path.exists(event_path):
        # Read the data from S0XFinalExportData.csv into a dataframe
        data_df = pd.read_csv(data_path)

        # Read the events data from S0XEvents.csv into a dataframe
        events_df = pd.read_csv(event_path)

        # Initialize a list to store extracted data for the current subject
        extracted_data_list = []

        # Iterate through the events data to find the start and end points
        for index, row in events_df.iterrows():
            if row['type'] == 'condition 4' or row['type'] == '65284':
                # Calculate the start and end points based on the latency in the same row
                start_point = row['latency'] - (31* 128)
                end_point = row['latency'] - (1 * 128)

                # Extract the data from start_point to end_point for all 32(channels) columns
                extracted_data = data_df.loc[
                    (data_df.index >= round(start_point)) & (data_df.index <= round(end_point)),
                    data_df.columns[1:33]  # Adjust column range as needed
                ]
                ## Reshaping of trials ##
                row1 = extracted_data.shape[0]
                col1 = extracted_data.shape[1]
                # print(f'shape of trail data:{row1} rows {col1}cols {row1/128} sec')


                extracted_data = extracted_data.iloc[:-1, :]

                row1 = extracted_data.shape[0]
                col1 = extracted_data.shape[1]
                # print(f'shape of trial data :{row1} rows {col1}cols {row1/128} sec')

                # Append the extracted data to the list for the current subject
                extracted_data_list.append(extracted_data)
                # print(f'legnt of datalist{len(extracted_data_list)}')

        # Check if the subject has exactly 40 trials
        if len(extracted_data_list) == 40:
            # Add the extracted data for this subject to the result_dataframes list

            result_dataframes.append(extracted_data_list)
            # print(f'leng of total data {len(result_dataframes)}')
        else:
             print(f"Subject {subject_id} has {len(extracted_data_list)} trials. Expected 40 trials.")
print(f'shape of trail data:{row1} rows {col1}cols {row1/128} sec')

subject no S01
subject no S02
subject no S03
subject no S04
subject no S05
subject no S06
subject no S07
subject no S08
subject no S09
subject no S10
subject no S11
subject no S12
subject no S13
subject no S14
subject no S15
subject no S16
subject no S17
subject no S18
subject no S19
subject no S20
subject no S21
subject no S22
subject no S23
subject no S24
subject no S25
subject no S26
subject no S27
subject no S28
subject no S29
subject no S30
subject no S31
subject no S32
shape of trail data:3840 rows 32cols 30.0 sec


In [9]:
#Load the CSV file containing the sort order (Experiment_id)

sort_order_df = pd.read_csv(sort_order_file)


In [10]:
# Create a list to store the reordered data frames for all subjects
reordered_dataframes = []
# print(len(result_dataframes))
# Iterate through subjects and reorder the trials within extracted_data_list based on Experiment_id
for i, subject_data in enumerate(result_dataframes):
    # Get the experiment_id values for the current participant_id
    experiment_ids = sort_order_df[sort_order_df['Participant_id'] == i + 1]['Experiment_id'].tolist()

    # Reorder the trials/subject data for the current subject based on Experiment_id
    reordered_subject_data = [subject_data[experiment_ids.index(j)] for j in range(1, 41)]
    reordered_dataframes.append(reordered_subject_data)

len(reordered_subject_data)
print(f'length of reordered subject data/trials: {len(reordered_subject_data)}')

len(reordered_dataframes)
print(f'length of reorded dataframe: {len(reordered_dataframes)}')

length of reordered subject data/trials: 40
length of reorded dataframe: 32


In [11]:
df = pd.read_csv(file_path)
# Create the 'labels' DataFrame with 'Valence' and 'Arousal' columns
labels = df[['Valence', 'Arousal','Gender']]

In [12]:
## Reordering subjects' nos. 23 to 32 channels to be same as "Twente" location order
# Load the CSV file into a DataFrame
df_channels = pd.read_csv(filepath2)

channels = df_channels[['Channel_name_Twente']]

# Assuming you have your 'reordered_dataframes' list and the 'channels' DataFrame already loaded
# and that 'reordered_dataframes' is a list of DataFrames

# Create a list of the new channel order based on 'Channel_name_Twente' with leading/trailing spaces removed
new_channel_order = [channel.strip() for channel in channels['Channel_name_Twente']]

# Iterate over subjects 23 to 32
for subject_id in range(23,33):  # to do change back to 23 to 33
    # print(subject_id)
    subject_index = subject_id - 1  # Adjust to the 0-based index
    subject_data = reordered_dataframes[subject_index]  # Get the subject's DataFrame from the list

    # Iterate over each trial (assuming there are 40 trials)
    for trial in range(40):
        # Get the data for a single trial
        trial_data = subject_data[trial]

        # Reorder the columns (channels) based on the new channel order
        reordered_trial_data = trial_data[new_channel_order]

        # Update the trial data in the subject's DataFrame
        subject_data[trial] = reordered_trial_data

    # Update the subject's DataFrame in the 'reordered_dataframes' list
    reordered_dataframes[subject_index] = subject_data

len(reordered_dataframes)

32

In [13]:
# Create a list to store the sliding window data
new_reorderdataframe_slidingwindow = []
# Iterate over subjects
for subject_data in reordered_dataframes:
    subject_windows = []

    # Iterate over each trial in the subject's data
    for trial_data in subject_data:
        # print(len(trial_data))
        trial_windows = apply_sliding_window_to_dataframe(trial_data, window_size=640, overlap_size=512)
        # print(len(trial_windows))
        subject_windows.append(trial_windows)
        # print(f'length of trial window{len(trial_windows)} sub win{len(subject_windows)}')

    new_reorderdataframe_slidingwindow.append(subject_windows)
    tt=np.array(subject_windows)
    print(tt.shape)

    # print(f'length of sliding window {len(new_reorderdataframe_slidingwindow)}')


(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)
(40, 26, 640, 32)


In [14]:
tt=np.array(new_reorderdataframe_slidingwindow)
print(tt.shape)

(32, 40, 26, 640, 32)


In [15]:
# nepoch is based on  window length and  overlap
# for example: for 5 sec 4 sec overlap 30 second trail you will get 26 epochs

nepochs=len(trial_windows)
# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
labels = df[['Valence', 'Arousal','Gender']]
labels = labels.loc[labels.index.repeat(nepochs)].reset_index(drop=True)
print(nepochs);
check=np.array(labels)
print(check.shape)

26
(33280, 3)


In [16]:

class_labels=[]

def calculate_binary_labels_val(row):
# Iterate through the data and classify into four classes
    for value in df:

        #if value[0] > 4.5 and value[1] > 4.5:
        if row['Valence'] > 4.5 :

            class_labels.append(1)
            return '1'
        elif row['Valence'] <= 4.5:
            class_labels.append(0)
            return '0'

def calculate_binary_labels_arousal(row):

# Iterate through the data and classify into four classes
    for value in df:

        #if value[0] > 4.5 and value[1] > 4.5:
        if row['Arousal'] > 4.5 :

            class_labels.append(1)
            return '1'
        elif row['Arousal'] <= 4.5:
            class_labels.append(0)
            return '0'
            print(labels.head)

In [17]:
# Calculate the 'Binary Label' and create a new column in the 'df' DataFrame
labels['Binary Label val'] = labels.apply(calculate_binary_labels_val,axis=1)
labels['Binary Label arl'] = labels.apply(calculate_binary_labels_arousal,axis=1)
# Create a new DataFrame with both 'Binary Label' and 'Gender' columns
labels_gender = labels[['Binary Label val', 'Binary Label arl','Gender']]

# Save the 'labels_gender' DataFrame to a new CSV file if needed
labels_gender.to_csv('/content/drive/MyDrive/Files and Data/gender and channels csv/binary_labels_5-4FeatureNorm.csv', index=False)

In [18]:
from scipy.stats import entropy
## Define the z-score normalization function
def z_score_normalization(data):
    mean = np.mean(data, axis=0)
    std_dev = np.std(data, axis=0)
    z_scores = (data - mean) / std_dev
    return z_scores


# wavelet features calculation

In [19]:
# Initialize a list to store the features for each subject
features_per_subject = []
expanded_label=[]
# Loop through each subject
for subject_data in new_reorderdataframe_slidingwindow:
    subject_features = []  # Initialize a list to store features for this subject
    sublable=[]
    # Iterate through trials for this subject
    for trial_data in subject_data:
        trial_features = []  # Initialize a list to store features for this trial

        # Iterate through the windows
        for window_data in trial_data:
            # Initialize a list to store features for this window
            window_features = []
            #  Select channales  as per ch_names
            window_data=window_data[ch_names]
            # print(window_data.head())
            window_data = np.array(window_data)
            # print(f'the shape of widow{window_data.shape}')
            # Iterate through channels
            for channel_data in window_data.T:  # Transpose to loop over channels
                ## Switch between doing or cancelling data normalization in next two lines ##
                #channel_normalized = z_score_normalization(channel_data)
                channel_normalized = channel_data # No need to do data normalization as this will be done on the features access the ML

                # Apply db4 wavelet transform
                coeffs = pywt.wavedec(channel_normalized, 'db4', level=3)

                # Extract D1, D2, and D3 coefficients
                d_coefficients = [np.array(coeffs[i]) for i in range(1, 4)]

                # Inside the loop where you calculate energy and entropy for each feature
                for feature in [coeffs[i] for i in range(1, 4)]:

                    energy = np.sum(np.square(feature))  # Energy
                    squared_values = np.square(feature)
                    entropy = -np.sum(squared_values * np.log10(squared_values + 1e-10))  # Entropy (with a small epsilon to avoid log(0))

                    median = np.nanpercentile(feature, 50)
                    var = np.nanvar(feature)
                    rms = np.nanmean(np.sqrt(np.square(feature)))
                    n5 = np.nanpercentile(feature, 5)
                    n25 = np.nanpercentile(feature, 25)
                    n75 = np.nanpercentile(feature, 75)
                    n95 = np.nanpercentile(feature, 95)

                    # append energy, entropy, and statistics to the window_features
                    window_features.extend([energy, entropy, median, var, rms, n5, n25, n75, n95])

            # Append the features for this window to the trial_features
            trial_features.append(window_features)

            # print(f'trail feat lenght{len(trial_features)}')

        # Append the features for all windows in this trial to the subject_features
        subject_features.append(trial_features)
        # tt=np.array(subject_features)
        # print(tt.shape)

    # Append the features for all trials for this subject to the features_per_subject
    features_per_subject.append(subject_features)
    print(f'sub feat lenght{len(features_per_subject)}')


sub feat lenght1
sub feat lenght2
sub feat lenght3
sub feat lenght4
sub feat lenght5
sub feat lenght6
sub feat lenght7
sub feat lenght8
sub feat lenght9
sub feat lenght10
sub feat lenght11
sub feat lenght12
sub feat lenght13
sub feat lenght14
sub feat lenght15
sub feat lenght16
sub feat lenght17
sub feat lenght18
sub feat lenght19
sub feat lenght20
sub feat lenght21
sub feat lenght22
sub feat lenght23
sub feat lenght24
sub feat lenght25
sub feat lenght26
sub feat lenght27
sub feat lenght28
sub feat lenght29
sub feat lenght30
sub feat lenght31
sub feat lenght32


In [20]:
# Reshape and save the feature to disk  for ML as numpy file(.npy)
features_1=np.array(features_per_subject)
print(features_1.shape)

## add features for each window as one separate trial/epoch
features=features_1.reshape(features_1.shape[0]*features_1.shape[1]*features_1.shape[2],features_1.shape[3])
print(f' feature  mat:{features.shape}')

# save to disk
np.save('/content/drive/MyDrive/Files and Data/gender and channels csv/wavelet_feature_5-4FeatureNorm.npy',features)


(32, 40, 26, 378)
 feature  mat:(33280, 378)


In [None]:
from scipy.stats import entropy
features_per_subject = []
expanded_label=[]
# Loop through each subject
for subject_data in new_reorderdataframe_slidingwindow:
    subject_features = []  # Initialize a list to store features for this subject
    sublable=[]
    # Iterate through trials for this subject
    for trial_data in subject_data:
        trial_features = []  # Initialize a list to store features for this trial

        # Iterate through the 5 windows
        for window_data in trial_data:
            # Initialize a list to store features for this window
            window_features = []

            window_data=window_data[ch_names]
            # print(window_data.head())
            window_data = np.array(window_data)
            # print(f'the shape of widow{window_data.shape}')
            # Iterate through channels
            for channel_data in window_data.T:  # Transpose to loop over channels
                # Calculate discrete entropy for the EEG data in this channel
                segment_size = 100
                num_segments = len(channel_data) // segment_size
                segment_discrete_entropy = []

                for i in range(num_segments):
                    start_idx = i * segment_size
                    end_idx = (i + 1) * segment_size
                    segment = channel_data[start_idx:end_idx]
                    pdf, bins = np.histogram(segment, bins='auto', density=True)

                    segment_entropy = entropy(pdf, base=2)
                    trial_discrete_entropy.append(segment_entropy)

                # Calculate the mean DE across segments for one trial in one channel
                channel_mean_entropy = np.mean(trial_discrete_entropy)

                # Append the calculated trial DE feature to the window form of data (14 channels)
                window_features.append(channel_mean_entropy)
                print(f'window feat lenght{len(window_features)}')

            # Append the features for this window to the trial_features
            trial_features.append(window_features)

            print(f'trail feat lenght{len(trial_features)}')

        # Append the features for all windows in this trial to the subject_features
        subject_features.append(trial_features)
        print(f'sub feat lenght{len(subject_features)}')

    # Append the features for all trials for this subject to the features_per_subject
    features_per_subject.append(subject_features)
    print(f'sub feat lenght{len(features_per_subject)}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
window feat lenght10
window feat lenght11
window feat lenght12
window feat lenght13
window feat lenght14
trail feat lenght19
window feat lenght1
window feat lenght2
window feat lenght3
window feat lenght4
window feat lenght5
window feat lenght6
window feat lenght7
window feat lenght8
window feat lenght9
window feat lenght10
window feat lenght11
window feat lenght12
window feat lenght13
window feat lenght14
trail feat lenght20
window feat lenght1
window feat lenght2
window feat lenght3
window feat lenght4
window feat lenght5
window feat lenght6
window feat lenght7
window feat lenght8
window feat lenght9
window feat lenght10
window feat lenght11
window feat lenght12
window feat lenght13
window feat lenght14
trail feat lenght21
window feat lenght1
window feat lenght2
window feat lenght3
window feat lenght4
window feat lenght5
window feat lenght6
window feat lenght7
window feat lenght8
window feat lenght9
window feat lenght10

 # **DE Feature extraction**

In [None]:
features_1=np.array(features_per_subject)
print(features_1.shape)
features=features_1.reshape(features_1.shape[0]*features_1.shape[1]*features_1.shape[2],features_1.shape[3])
print(f' feature  mat:{features.shape}')
np.save('/content/drive/MyDrive/Files and Data/gender and channels csv/DE_feature_5-4FeatureNorm.npy',features)


(32, 40, 27, 14)
 feature  mat:(34560, 14)
