# Data loading

In [4]:
import os
import pandas as pd
import numpy as np
import scipy.signal
import kagglehub
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

# --- 1. Get Path ---
print("Locating dataset...")
# If this lines fails or hangs, you can hardcode the path you found earlier:
# path_to_dataset = "/kaggle/input/database-for-emotion-recognition-system-gameemo"
path_to_dataset = kagglehub.dataset_download("sigfest/database-for-emotion-recognition-system-gameemo")
print(f"Dataset Root: {path_to_dataset}")

# --- 2. Configuration ---
# BrainAccess HALO Channels
TARGET_CHANNELS = ['AF3', 'AF4', 'O1', 'O2']
BANDS = {'Delta': (0.5, 4), 'Theta': (4, 8), 'Alpha': (8, 13), 'Beta': (13, 30), 'Gamma': (30, 45)}

def get_band_power(data, fs=128):
    freqs, psd = scipy.signal.welch(data, fs, nperseg=len(data), axis=0)
    total_power = np.sum(psd, axis=0)
    features = []
    for ch_idx in range(data.shape[1]):
        for band, (low, high) in BANDS.items():
            idx = np.logical_and(freqs >= low, freqs <= high)
            val = 0 if total_power[ch_idx] == 0 else np.sum(psd[idx, ch_idx]) / total_power[ch_idx]
            features.append(val)
    return np.array(features)

def load_data_robust(root_path):
    X, y = [], []
    count = 0

    print(f"Starting crawl through: {root_path}")

    # Walk through the directory tree
    for root, dirs, files in os.walk(root_path):
        for filename in files:
            # 1. Filter: specific CSVs only
            if not filename.endswith(".csv"): continue
            if "AllRawChannels" not in filename: continue

            # 2. Determine 4-Class Label
            # G1=LANV(0), G2=LAPV(1), G3=HANV(2), G4=HAPV(3)
            if "G1" in filename:   label = 0
            elif "G2" in filename: label = 1
            elif "G3" in filename: label = 2
            elif "G4" in filename: label = 3
            else: continue # Skip if no G code found

            file_path = os.path.join(root, filename)

            try:
                # 3. Load & Process
                df = pd.read_csv(file_path)

                # Find columns (handle spaces like " AF3")
                cols = [c for c in df.columns if any(t in c for t in TARGET_CHANNELS)]

                if len(cols) < 4:
                    # Optional: Print skipped files to debug
                    # print(f"Skipping {filename}: Found {len(cols)}/4 channels")
                    continue

                raw_data = df[cols].values
                fs = 128

                # Segment (1 sec)
                for i in range(len(raw_data) // fs):
                    window = raw_data[i*fs : (i+1)*fs]
                    X.append(get_band_power(window, fs))
                    y.append(label)

                count += 1
                if count % 10 == 0: print(f"Processed {count} files...")

            except Exception as e:
                print(f"Error on {filename}: {e}")

    return np.array(X), np.array(y)

# --- EXECUTE ---
X, y = load_data_robust(path_to_dataset)

print("\nProcessing Complete.")
print(f"Features: {X.shape}")
print(f"Labels: {y.shape}")
print(f"Classes: {np.unique(y)}")

Locating dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/sigfest/database-for-emotion-recognition-system-gameemo?dataset_version_number=1...


100%|██████████| 1.70G/1.70G [00:13<00:00, 139MB/s]

Extracting files...





Dataset Root: /root/.cache/kagglehub/datasets/sigfest/database-for-emotion-recognition-system-gameemo/versions/1
Starting crawl through: /root/.cache/kagglehub/datasets/sigfest/database-for-emotion-recognition-system-gameemo/versions/1
Processed 10 files...
Processed 20 files...
Processed 30 files...
Processed 40 files...
Processed 50 files...
Processed 60 files...
Processed 70 files...
Processed 80 files...
Processed 90 files...
Processed 100 files...

Processing Complete.
Features: (32184, 20)
Labels: (32184,)
Classes: [0 1 2 3]


In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import copy

In [53]:
if 'X' not in locals() or 'y' not in locals():
    raise ValueError("Please run the data loading cell first.")

print("Step 1: Preparing Hybrid Features...")
# Log transform to normalize EEG power distribution
X_log = np.log1p(X)

# Polynomial Features (Interaction terms)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_log)

# Standard Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

# Split Data
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.15, random_state=42, stratify=y
)

# Convert to Tensors for PyTorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train_torch = torch.tensor(X_train_raw, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
X_test_torch = torch.tensor(X_test_raw, dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_torch, y_train_torch)
test_dataset = TensorDataset(X_test_torch, y_test_torch)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

Step 1: Preparing Hybrid Features...


In [54]:
class ResidualBlock(nn.Module):
    def __init__(self, hidden_dim, dropout_rate):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
        )
        self.activation = nn.GELU()
    def forward(self, x):
        return self.activation(x + self.block(x))

class WideResNet(nn.Module):
    def __init__(self, input_dim, num_classes=4):
        super(WideResNet, self).__init__()
        hidden_dim = 1024
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(0.3)
        )
        self.res_blocks = nn.Sequential(
            ResidualBlock(hidden_dim, 0.4),
            ResidualBlock(hidden_dim, 0.4),
            ResidualBlock(hidden_dim, 0.4)
        )
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.res_blocks(x)
        x = self.classifier(x)
        return x

In [55]:
print("\nStep 2: Training WideResNet...")
resnet_model = WideResNet(input_dim=X_scaled.shape[1]).to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(resnet_model.parameters(), lr=0.001, weight_decay=1e-3)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.002, steps_per_epoch=len(train_loader), epochs=100)

best_acc = 0.0
best_weights = copy.deepcopy(resnet_model.state_dict())

for epoch in range(100):
    resnet_model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = resnet_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Quick Validation Check
    resnet_model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = resnet_model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    acc = 100 * correct / total
    if acc > best_acc:
        best_acc = acc
        best_weights = copy.deepcopy(resnet_model.state_dict())

    if (epoch+1) % 20 == 0:
        print(f"Epoch {epoch+1}/100 - ResNet Val Acc: {acc:.2f}%")

# Load best weights
resnet_model.load_state_dict(best_weights)
print(f"Best ResNet Accuracy: {best_acc:.2f}%")

# --- 3. Train Model B: Gradient Boosting (Tree-based) ---
print("\nStep 3: Training Gradient Boosting Classifier...")
# HistGradientBoosting is typically faster and more accurate for this scale than standard RF
gb_model = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=200,
    max_leaf_nodes=31,
    l2_regularization=0.1,
    random_state=42
)
gb_model.fit(X_train_raw, y_train)
gb_acc = gb_model.score(X_test_raw, y_test) * 100
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}%")

# --- 4. Ensemble Prediction (Voting) ---
print("\nStep 4: Combining Models (Ensemble)...")

# Get Probabilities from ResNet
resnet_model.eval()
all_resnet_probs = []
with torch.no_grad():
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        logits = resnet_model(inputs)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        all_resnet_probs.append(probs)
resnet_probs = np.concatenate(all_resnet_probs)

# Get Probabilities from Gradient Boosting
gb_probs = gb_model.predict_proba(X_test_raw)

# Weighted Average (Give slightly more weight to the better model usually, but 50/50 is robust)
# If ResNet is ~77% and GB is ~75%, a 0.6/0.4 split often works well.
final_probs = (0.6 * resnet_probs) + (0.4 * gb_probs)
final_preds = np.argmax(final_probs, axis=1)

# Final Evaluation
ensemble_acc = accuracy_score(y_test, final_preds) * 100
print(f"\n>>> FINAL ENSEMBLE ACCURACY: {ensemble_acc:.2f}% <<<")

class_names = ['LANV (Boring)', 'LAPV (Calm)', 'HANV (Horror)', 'HAPV (Funny)']
print("\nClassification Report:")
print(classification_report(y_test, final_preds, target_names=class_names))


Step 2: Training WideResNet...
Epoch 20/100 - ResNet Val Acc: 51.14%
Epoch 40/100 - ResNet Val Acc: 66.03%
Epoch 60/100 - ResNet Val Acc: 78.67%
Epoch 80/100 - ResNet Val Acc: 82.66%
Epoch 100/100 - ResNet Val Acc: 83.20%
Best ResNet Accuracy: 83.49%

Step 3: Training Gradient Boosting Classifier...
Gradient Boosting Accuracy: 62.72%

Step 4: Combining Models (Ensemble)...

>>> FINAL ENSEMBLE ACCURACY: 83.53% <<<

Classification Report:
               precision    recall  f1-score   support

LANV (Boring)       0.82      0.87      0.85      1207
  LAPV (Calm)       0.83      0.81      0.82      1207
HANV (Horror)       0.83      0.84      0.84      1207
 HAPV (Funny)       0.85      0.82      0.84      1207

     accuracy                           0.84      4828
    macro avg       0.84      0.84      0.84      4828
 weighted avg       0.84      0.84      0.84      4828

