In [1]:
import pandas as pd
import numpy as np
import os
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from scipy import signal
import math
from multiprocessing import Pool

# Path to the directory containing your CSV files
  # Update this path to your specific directory




In [7]:
def datacaller(filename, Comp_Freq, timewindow, stepsize, samplerate, **kwargs):
    
    def component_maker(df, timewindow, stepsize, samplerate, **kwargs):
        bl_length = kwargs.get('bl_length', 0)
        
        baseline = df.iloc[:, 0:int((bl_length/1000)*samplerate)].mean(axis=1)
        
        startp = int((bl_length/1000)*samplerate)+1
        endvar = 0
        ec = 1
        while startp < len(df.iloc[0, :]) and endvar == 0:
            endp = int(startp+(timewindow/1000)*samplerate)
            if endp >= len(df.iloc[0, :]):
                endp = len(df.iloc[0, :])
                endvar = 1
            new_col_name = f'Epoch_{ec}'
            ec += 1
            if startp == int((bl_length/1000)*samplerate)+1 or startp == 0:
                res = pd.DataFrame(df.iloc[:, startp:endp].mean(axis=1)-baseline, columns=[new_col_name])
            else:
                res[new_col_name] = df.iloc[:, startp:endp].mean(axis=1)-baseline
            startp = int(startp+(stepsize/1000)*samplerate)
        j=1
        for i in features:
            res[f"F_{j}"] = i
            j +=1
        return res

    def frequency_maker(df, timewindow, stepsize, samplerate, **kwargs):
        baseline = kwargs.get('baseline', 0)
        freq = kwargs.get('freq', 0)
        
        eeg_data = df.iloc[:, 0:int((baseline/1000)*samplerate)]
        lim = len(df.iloc[0, :])
        frequencies, psd_matrix = signal.welch(eeg_data, fs=samplerate, nperseg=int((baseline/1000)*samplerate))
        basepower = []
        for i in range(len(eeg_data)):
            basepower.append(psd_matrix[i][(np.abs(frequencies - freq)).argmin()])
        basepower = np.array(basepower)

        startp = int((baseline/1000)*samplerate)+1
        endc = 0
        ec = 1
        while startp < lim and endc == 0:
            endp = int(startp+(timewindow/1000)*samplerate)
            if endp >= lim:
                endp = lim
                endc = 1

            eeg_data = df.iloc[:, startp:endp]
            frequencies, psd_matrix = signal.welch(eeg_data, fs=samplerate, nperseg=(endp-startp))
            powers = []
            new_col_name = f'Epoch_{ec}'
            ec += 1
            for i in range(len(eeg_data)):
                powers.append(10*(math.log10(psd_matrix[i][(np.abs(frequencies - freq)).argmin()]/basepower[i])))
            if startp == int((baseline/1000)*samplerate)+1:
                res = pd.DataFrame(powers, columns=[new_col_name])
            else:
                res[new_col_name] = powers
            startp = int(startp+(stepsize/1000)*samplerate)
        j=1
        for i in features:
            res[f"F_{j}"] = i
            j +=1
        return res
    
    df = pd.read_csv(filename).T
    features = filename.replace(directory,'').replace('.csv','').replace('/','').split("_")
    additional_args = {}
    for key, value in kwargs.items():
        additional_args[key] = value
    
    if Comp_Freq == "Comp":
        return component_maker(df, timewindow, stepsize, samplerate, **additional_args)
    elif Comp_Freq == "Freq":
        return frequency_maker(df, timewindow, stepsize, samplerate, **additional_args)
    else:
        print("ISSUE")
        
def parallel_datacaller(args):
    # Unpack the arguments
    filename, Comp_Freq, timewindow, stepsize, samplerate, kwargs = args
    # Call the datacaller function with the specified arguments
    return datacaller(filename, Comp_Freq, timewindow, stepsize, samplerate, **kwargs)

In [8]:
from tqdm import tqdm
if __name__ == '__main__':
    directory = "C:/Users/Pitti/Desktop/Study2_AI/Data/RawEEG_ERP/"

    # Get the list of files in the directory
    files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv') and '_S_' in f]


    # Define the arguments for datacaller function
    args_list = [(f, "Comp", 100, 50, 500, {'bl_length': 500}) for f in files]
    print(len(args_list))
    
    res = []
    for i in args_list:
        res.append(parallel_datacaller(i))

    # Create a Pool of workers
    #with Pool(processes=4) as pool:  # Adjust the number of processes as needed
        # Execute datacaller function in parallel
        #results = list(tqdm(pool.imap(parallel_datacaller, args_list), total=len(args_list)))
        #results = pool.map(parallel_datacaller, args_list)

    # Concatenate the resulting DataFrames into one big DataFrame
    df = pd.concat(res, axis=0)

    X_categorical = pd.get_dummies(df['F_1', 'F_3', 'F_5', 'F_6'])
    X_continuous = [col for col in df.columns if 'Ep' in col]

    # Combine categorical and continuous variables
    X = pd.concat([X_categorical, X_continuous], axis=1)

    # Assuming 'Mindset' is the target variable for classification
    y = df['F_4']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Initialize and train a RandomForestClassifier model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    
    
    feature_selector = SelectFromModel(rf_model, threshold='median')

    # Fit the feature selector to the training data
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    X_test_selected = feature_selector.transform(X_test)
    # Get the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)
    selected_features = X_train.columns[selected_feature_indices]

    # Print the selected features
    print("Selected Features:", selected_features)
    # Define the hyperparameters grid
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_selected, y_train)

    # Predict on the test set
    y_pred = rf_model.predict(X_test_selected)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    # Initialize the RandomForestClassifier model
    rf_model = RandomForestClassifier(random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1)

    # Perform Grid Search to find the best hyperparameters
    grid_search.fit(X_train_selected, y_train)

    # Get the best parameters and the best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Best Hyperparameters:", best_params)
    print("Best Score (Accuracy):", best_score)

    # Train the model with the best hyperparameters
    best_rf_model = RandomForestClassifier(**best_params, random_state=42)
    best_rf_model.fit(X_train_selected, y_train)

    # Predict on the test set
    y_pred = best_rf_model.predict(X_test_selected)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Test Set Accuracy:", accuracy)

12400


KeyError: ('F_1', 'F_3', 'F_5', 'F_6')

In [13]:
X_categorical = pd.get_dummies(df[['F_1', 'F_3', 'F_5', 'F_6']])
X_continuous = df[[col for col in df.columns if 'Ep' in col]]

# Combine categorical and continuous variables
X = pd.concat([X_categorical, X_continuous], axis=1)

# Assuming 'Mindset' is the target variable for classification
y = df['F_4']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize and train a RandomForestClassifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")



feature_selector = SelectFromModel(rf_model, threshold='median')

# Fit the feature selector to the training data
X_train_selected = feature_selector.fit_transform(X_train, y_train)
X_test_selected = feature_selector.transform(X_test)
# Get the selected features
selected_feature_indices = feature_selector.get_support(indices=True)
selected_features = X_train.columns[selected_feature_indices]

# Print the selected features
print("Selected Features:", selected_features)
# Define the hyperparameters grid
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test_selected)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# recode the Subject number and the features colnames
# apply model to O group
# Try it with frequencies
# Develop big model, add it to Cluster

Accuracy: 0.8373118279569892
Selected Features: Index(['F_1_NTh', 'F_1_Th', 'F_3_05', 'F_3_06', 'F_3_07', 'F_3_08', 'F_3_09',
       'F_3_19', 'F_3_20', 'F_3_22', 'F_3_23', 'F_3_24', 'F_3_25', 'F_3_26',
       'F_3_27', 'F_3_32', 'F_3_33', 'F_3_35', 'F_3_38', 'F_5_post', 'F_5_pre',
       'Epoch_1', 'Epoch_2', 'Epoch_3', 'Epoch_4', 'Epoch_5', 'Epoch_6',
       'Epoch_7', 'Epoch_8', 'Epoch_9', 'Epoch_10', 'Epoch_11', 'Epoch_12',
       'Epoch_13', 'Epoch_14', 'Epoch_15', 'Epoch_16', 'Epoch_17', 'Epoch_18',
       'Epoch_19', 'Epoch_20', 'Epoch_21', 'Epoch_22', 'Epoch_23', 'Epoch_24',
       'Epoch_25', 'Epoch_26', 'Epoch_27', 'Epoch_28', 'Epoch_29', 'Epoch_30',
       'Epoch_31', 'Epoch_32', 'Epoch_33', 'Epoch_34', 'Epoch_35', 'Epoch_36',
       'Epoch_37', 'Epoch_38', 'Epoch_39', 'Epoch_40', 'Epoch_41', 'Epoch_42',
       'Epoch_43', 'Epoch_44', 'Epoch_45', 'Epoch_46', 'Epoch_47', 'Epoch_48',
       'Epoch_49'],
      dtype='object')
Accuracy: 0.8306451612903226


NameError: name 'param_grid' is not defined

In [None]:

# Initialize the RandomForestClassifier model
rf_model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Perform Grid Search to find the best hyperparameters
grid_search.fit(X_train_selected, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score (Accuracy):", best_score)

# Train the model with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train_selected, y_train)

# Predict on the test set
y_pred = best_rf_model.predict(X_test_selected)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)