In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
from collections import Counter

input_file_segments = '/content/drive/MyDrive/Project/ExtractedSegments_1s_set4.npy'
input_file_labels = '/content/drive/MyDrive/Project/ExtractedLabelas_1s_set4.npy'

loaded_segments = np.load(input_file_segments, allow_pickle=True)
loaded_labels = np.load(input_file_labels, allow_pickle=True)

print(f'Total segments loaded: {len(loaded_segments)}')
print(f'Total labels loaded: {len(loaded_labels)}')
print(f'Sample segment shape: {loaded_segments[0].shape}')
print(f'Sample label: {loaded_labels[0]}')

label_counts = Counter(loaded_labels)
print(f'Label counts: {label_counts}')

max_count = max(label_counts.values())

def add_gaussian_noise(data, mean=0, std_dev=0.05):
    noise = np.random.normal(mean, std_dev, data.shape)
    return data + noise

augmented_segments = []
augmented_labels = []

for segment, label in zip(loaded_segments, loaded_labels):
    if segment.shape == (125, 16):
        augmented_segments.append(segment)
        augmented_labels.append(label)
    else:
        print(f"Skipping segment with invalid shape: {segment.shape}")

for label, count in label_counts.items():
    if count < max_count:
        label_segments = [seg for seg, lbl in zip(loaded_segments, loaded_labels) if lbl == label]

        num_samples_needed = max_count - count

        for _ in range(num_samples_needed):
            segment = label_segments[np.random.randint(len(label_segments))]
            noisy_segment = add_gaussian_noise(segment)

            if noisy_segment.shape == (125, 16):
                augmented_segments.append(noisy_segment)
                augmented_labels.append(label)
            else:
                print(f"Generated segment with invalid shape: {noisy_segment.shape}")

augmented_segments = np.array(augmented_segments)
augmented_labels = np.array(augmented_labels)

print(f'Augmented data shape: {augmented_segments.shape}')
print(f'Augmented labels shape: {augmented_labels.shape}')
print(f'New label counts: {Counter(augmented_labels)}')

Total segments loaded: 937
Total labels loaded: 937
Sample segment shape: (125, 16)
Sample label: Walking
Label counts: Counter({'Walking': 290, 'Aha': 290, 'Doing Other Task': 290, 'Impasse': 42, 'Re-evaluation': 25})
Augmented data shape: (1450, 125, 16)
Augmented labels shape: (1450,)
New label counts: Counter({'Walking': 290, 'Aha': 290, 'Doing Other Task': 290, 'Impasse': 290, 'Re-evaluation': 290})


In [None]:
data = augmented_segments
final_labels = augmented_labels

# Feature Extraction

In [None]:
import numpy as np

def compute_mean(data):
    return np.mean(data, axis=0)

mean_features = np.array([compute_mean(subject_data) for subject_data in data])

def compute_variance(data):
    return np.var(data, axis=0)

variance_features = np.array([compute_variance(subject_data) for subject_data in data])

def compute_first_diff(data):
    return np.mean(np.abs(np.diff(data, axis=0)), axis=0)

first_diff_features = np.array([compute_first_diff(subject_data) for subject_data in data])

def compute_second_diff(data):
    return np.mean(np.abs(np.diff(data, n=2, axis=0)), axis=0)

second_diff_features = np.array([compute_second_diff(subject_data) for subject_data in data])

combined_time_features = np.concatenate(
    [mean_features, variance_features, first_diff_features, second_diff_features], axis=1)

In [None]:
import numpy as np
from scipy.signal import spectrogram


def extract_frequency_features(segments):

    feature_vectors = []

    for segment in segments:

        channel_features = []


        for channel_data in segment.T:

            freqs, times, Sxx = spectrogram(channel_data, fs=125, nperseg=16)

            Sxx_magnitude = np.abs(Sxx)

            mean_power = np.mean(Sxx_magnitude)
            std_power = np.std(Sxx_magnitude)
            peak_freq = freqs[np.argmax(np.mean(Sxx_magnitude, axis=1))]

            channel_features.extend([mean_power, std_power, peak_freq])


        feature_vectors.append(channel_features)

    return np.array(feature_vectors)

frequency_features = extract_frequency_features(data)

print(frequency_features.shape)

(1450, 48)


In [None]:
import numpy as np
from scipy.signal import spectrogram


def extract_frequency_features_full_spectrum(segments):
    feature_vectors = []

    for segment in segments:
        channel_features = []

        for channel_data in segment.T:
            freqs, times, Sxx = spectrogram(channel_data, fs=125, nperseg=16)

            Sxx_magnitude = np.abs(Sxx)

            mean_power = np.mean(Sxx_magnitude)

            std_power = np.std(Sxx_magnitude)

            peak_freq = freqs[np.argmax(np.mean(Sxx_magnitude, axis=1))]

            flat_Sxx = Sxx_magnitude.mean(axis=1)
            lower_freq = freqs[np.searchsorted(np.cumsum(flat_Sxx), 0.25 * np.sum(flat_Sxx))]
            upper_freq = freqs[np.searchsorted(np.cumsum(flat_Sxx), 0.75 * np.sum(flat_Sxx))]
            bandwidth = upper_freq - lower_freq

            channel_features.extend([mean_power, std_power, peak_freq, bandwidth])

        feature_vectors.append(channel_features)

    return np.array(feature_vectors)

frequency_features_full = extract_frequency_features_full_spectrum(data)

print(frequency_features_full.shape)

(1450, 64)


In [None]:
!pip install PyWavelets



In [None]:
import numpy as np
from scipy.stats import entropy
import pywt
from statsmodels.tsa.ar_model import AutoReg

def extract_ar_coefficients(data, order=1):
    ar_coeffs = []
    for segment in data:
        if len(segment.shape) > 1:
            segment = segment.flatten()
        model = AutoReg(segment, lags=order).fit()
        ar_coeffs.append(model.params)

    return np.array(ar_coeffs)




def extract_wavelet_variance(data, wavelet='db2', level=5, fixed_length=5):
    wavelet_variances = []
    for segment in data:
        max_level = pywt.dwt_max_level(len(segment), wavelet)
        coeffs = pywt.wavedec(segment, wavelet, level=min(level, max_level))

        variances = [np.var(c) for c in coeffs]

        if len(variances) < fixed_length:
            variances = np.pad(variances, (0, fixed_length - len(variances)), 'constant')
        else:
            variances = variances[:fixed_length]

        wavelet_variances.append(variances)

    return np.array(wavelet_variances)

def extract_features(data, ar_order=1, wavelet='db2', level=5, fixed_length=5):
    ar_features = extract_ar_coefficients(data, order=ar_order)
    wavelet_var_features = extract_wavelet_variance(data, wavelet=wavelet, level=level, fixed_length=fixed_length)

    combined_features = []
    for ar,  wv in zip(ar_features, wavelet_var_features):
        combined = np.hstack((ar, wv))
        combined_features.append(combined)

    return np.array(combined_features)

wv_features = extract_features(data)

print(f'Features matrix shape: {wv_features.shape}')



Features matrix shape: (1450, 7)


In [None]:
import numpy as np
from scipy.signal import welch

def extract_frequency_features(data, sfreq):
    n_samples, n_times, n_channels = data.shape
    freq_bands = {
        'delta': (0.5, 4),
        'theta': (4, 8),
        'alpha/mu': (8, 12),
        'low_alpha': (8, 10),
        'high_alpha': (10, 12),
        'beta': (12, 30),
        'low_beta': (12, 15),
        'mid_beta': (15, 20),
        'high_beta': (20, 30),
        'gamma': (30, 100),
        'low_gamma': (30, 50),
        'high_gamma': (50, 100),
        'epsilon': (0.1, 0.5),
        'sigma': (12, 16),
        'high_frequency_oscillations': (100, 500),
        'ripples': (80, 200),
        'fast_ripples': (200, 500)
    }

    freq_features = np.zeros((n_samples, n_channels, len(freq_bands)))

    for i in range(n_samples):
        for j in range(n_channels):
            f, psd = welch(data[i, j, :], sfreq, nperseg=min(256, n_times))

            for band, (f_low, f_high) in freq_bands.items():
                idx_band = np.where((f >= f_low) & (f < f_high))[0]
                if len(idx_band) > 0:
                    power_in_band = np.mean(psd[idx_band])
                else:
                    power_in_band = 0.0
                freq_features[i, j, list(freq_bands.keys()).index(band)] = power_in_band

    return freq_features

n_subjects, n_times, n_channels = data.shape
reshaped_data = data.reshape(n_subjects, n_times, n_channels)
X_freq_features = extract_frequency_features(reshaped_data, sfreq=125)

print('Shape of X_freq_features:', X_freq_features.shape)

  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freq_features[i, j, list(freq_bands.keys()).index(band)] = power_in_band


Shape of X_freq_features: (1450, 16, 17)


In [None]:
def standardize_data(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    std[std == 0] = 1

    standardized_data = (data - mean) / std
    return standardized_data


X_standardized_freq_features = standardize_data(X_freq_features)
print("Standardized data shape:", X_standardized_freq_features.shape)

Standardized data shape: (1450, 16, 17)


In [None]:
flattened_data_freq_features = X_standardized_freq_features.reshape(X_standardized_freq_features.shape[0], 16 * 17)

print(flattened_data_freq_features.shape)

(1450, 272)


# Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(final_labels)

concatenated_array = np.concatenate([flattened_data_freq_features,wv_features,frequency_features_full,frequency_features, combined_time_features], axis=1)
concatenated_array_features = np.array(concatenated_array, dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(concatenated_array_features, encoded_labels, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB()
}

results = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("")

results_df = pd.DataFrame(results).T
print("\nSummary of Classifier Performance:")
print(results_df)

Results for SVM:
Accuracy: 0.3310
Precision: 0.2945
Recall: 0.3310
F1 Score: 0.2707

Results for Random Forest:
Accuracy: 0.4862
Precision: 0.4588
Recall: 0.4862
F1 Score: 0.4693



Parameters: { "use_label_encoder" } are not used.



Results for XGBoost:
Accuracy: 0.5069
Precision: 0.4816
Recall: 0.5069
F1 Score: 0.4921

Results for Gradient Boosting:
Accuracy: 0.5310
Precision: 0.5145
Recall: 0.5310
F1 Score: 0.5201





Results for AdaBoost:
Accuracy: 0.2966
Precision: 0.3190
Recall: 0.2966
F1 Score: 0.3033

Results for K-Nearest Neighbors:
Accuracy: 0.4966
Precision: 0.4482
Recall: 0.4966
F1 Score: 0.4488

Results for Logistic Regression:
Accuracy: 0.3828
Precision: 0.3369
Recall: 0.3828
F1 Score: 0.3345

Results for Naive Bayes:
Accuracy: 0.2069
Precision: 0.3508
Recall: 0.2069
F1 Score: 0.1596


Summary of Classifier Performance:
                     Accuracy  Precision    Recall  F1 Score
SVM                  0.331034   0.294520  0.331034  0.270730
Random Forest        0.486207   0.458799  0.486207  0.469257
XGBoost              0.506897   0.481638  0.506897  0.492112
Gradient Boosting    0.531034   0.514507  0.531034  0.520132
AdaBoost             0.296552   0.318973  0.296552  0.303315
K-Nearest Neighbors  0.496552   0.448178  0.496552  0.448762
Logistic Regression  0.382759   0.336937  0.382759  0.334524
Naive Bayes          0.206897   0.350755  0.206897  0.159623


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
