# STEP 1: DATA DOWNLOAD

This step installs the Kaggle API client, prompts you to upload your `kaggle.json` authentication file, and then downloads and unzips the EEG confusion dataset.

In [None]:
!pip install kaggle
from google.colab import files
files.upload()  # Upload your kaggle.json
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d wanghaohan/confused-eeg
!unzip confused-eeg.zip

# STEP 2: LOAD DATA AND BASIC OVERVIEW  

Import Python libraries. Load the EEG data and demographics files. Display shapes, column names, and a quick peek at the EEG data.

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the two CSV files from the working directory:
#   df   - the main EEG table used by every later cell
#   demo - the demographic table (optional; not referenced by the later cells shown here)
df = pd.read_csv('EEG_data.csv')
demo = pd.read_csv('demographic_info.csv')

# Quick overview: shape first, then each heading followed by its payload.
print("EEG Data Shape:", df.shape)
for heading, payload in (
    ("\nFirst 5 rows:", df.head()),
    ("\nColumns:", df.columns.tolist()),
):
    print(heading)
    print(payload)

# STEP 3: VISUALIZE EEG BAND POWERS SEPARATED BY CONFUSION

Automatically determine the label column, then plot EEG band powers by confusion label (boxplots for each band, separated by class).

In [None]:
# Pick the label column: prefer a column whose name contains 'predef',
# otherwise the first one containing 'label'; None when neither exists.
label_col = next(
    (col for col in df.columns if 'predef' in col.lower()),
    next((col for col in df.columns if 'label' in col.lower()), None),
)

# Band-power columns to plot. Only those actually present in `df` are used,
# so a missing or renamed band does not abort the cell with a KeyError.
bands = ['Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
avail_bands = [band for band in bands if band in df.columns]

if label_col is None:
    # Column names are printed by the Step 2 cell.
    print("No label column found! Run Step 2 to check the column names.")
elif not avail_bands:
    print("No EEG band columns found! Run Step 2 to check the column names.")
else:
    print(f"Using label column: {label_col}")
    skipped_bands = [band for band in bands if band not in df.columns]
    if skipped_bands:
        print(f"Skipping bands not present in the data: {skipped_bands}")
    # Reshape wide -> long: one row per (sample, band) so seaborn can group by band and label.
    band_df = df[avail_bands + [label_col]].melt(id_vars=label_col, var_name='Band', value_name='Power')
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=band_df, x='Band', y='Power', hue=label_col)
    plt.title('EEG Band Powers by Confusion State')
    plt.xticks(rotation=45)
    plt.show()


# STEP 4: FEATURE CORRELATION HEATMAP

Computes the pairwise correlations between the EEG features that are present in the dataset and the label column, and displays them as an annotated heatmap.

In [None]:
# Candidate feature columns; only those present in `df` are used below.
features = ['Attention', 'Meditation', 'Raw', 'Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']

# Same lookup rule as the plotting cell: prefer 'predef', then 'label'.
# Fail here with a clear message rather than falling back to a guessed column
# name that only surfaces later as an opaque KeyError.
label_col = next(
    (col for col in df.columns if 'predef' in col.lower()),
    next((col for col in df.columns if 'label' in col.lower()), None),
)
if label_col is None:
    raise KeyError(
        "No label column found: expected a column whose name contains 'predef' or 'label'. "
        f"Available columns: {df.columns.tolist()}"
    )

avail_features = [f for f in features if f in df.columns]
# Report what was dropped: a misspelled or renamed column would otherwise vanish silently.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Features not present in the data (skipped): {missing_features}")

# Pairwise correlation of the available features plus the label column.
corr_data = df[avail_features + [label_col]].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_data, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations (Focus on Label Column)')
plt.show()

# STEP 5: CLASSIFICATION (Random Forest Example)

Splits the data, trains a classifier, prints evaluation metrics and confusion matrix, then displays feature importances for the model.

In [None]:
# Feature matrix and target. `avail_features` and `label_col` are set by the
# correlation-heatmap cell, which must have been run first.
X = df[avail_features]
y = df[label_col]

# A row-level random split puts samples from the same subject in both train and
# test, which can inflate scores. When a subject id column is available, hold out
# whole subjects instead; otherwise keep the stratified random split.
# NOTE(review): assumes the subject id column is named 'SubjectID' - confirm against
# the column list printed in Step 2.
group_col = 'SubjectID' if 'SubjectID' in df.columns else None
if group_col is not None and df[group_col].nunique() >= 2:
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(X, y, groups=df[group_col]))
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    held_out = sorted(df[group_col].iloc[test_idx].unique())
    print(f"Group-aware split on '{group_col}'; held-out groups: {held_out}")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Pin the row/column order to model.classes_ and show the class values on both
# axes so the matrix can be read without guessing which index is which class.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=model.classes_, yticklabels=model.classes_)
plt.title('Confusion Matrix')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

# Feature Importances (impurity-based, as exposed by the fitted forest), largest first.
importances = pd.DataFrame({'Feature': avail_features, 'Importance': model.feature_importances_}).sort_values('Importance', ascending=False)
print(importances)
plt.figure(figsize=(8, 5))
sns.barplot(data=importances, x='Importance', y='Feature')
plt.title('Feature Importance for Confusion Prediction')
plt.show()