In [1]:
import numpy as np
import pandas as pd
from scipy.signal import butter, sosfiltfilt, filtfilt # Use filtfilt as in the example, or sosfiltfilt if preferred
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report # Optional: for local validation
from sklearn.model_selection import train_test_split # Optional: for local validation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:

from pathlib import Path
from torch.utils.data import DataLoader
# The EEGDataset class definition is needed. Assuming it's defined elsewhere
# If not, we'll need its definition. For now, assume it exists.
from seiz_eeg.dataset import EEGDataset # Placeholder from user example
from hjorth_features import create_hjorth_transforms

# --- Data Loading and Preparation ---

# Root folder expected to contain the train/ and test/ subfolders.
data_path = "./data"
DATA_ROOT = Path(data_path)

if not DATA_ROOT.exists():
    # Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down (discarding all state), whereas an exception stops execution
    # at this cell and leaves a readable traceback.
    raise FileNotFoundError(
        f"Error: Data root path does not exist: {DATA_ROOT}\n"
        "Please ensure the 'data' directory with train/test subfolders is correctly placed."
    )


print("Loading segment metadata...")
# Read the per-split segment tables; both live at <split>/segments.parquet
# under DATA_ROOT.
segment_tables = {
    split: pd.read_parquet(DATA_ROOT / f"{split}/segments.parquet")
    for split in ("train", "test")
}
clips_tr, clips_te = segment_tables["train"], segment_tables["test"]
n_train_segments, n_test_segments = len(clips_tr), len(clips_te)
print(f"Loaded {n_train_segments} training segments and {n_test_segments} test segments.")


# Build the Hjorth feature transform. The factory returns two values; only the
# first one is used in this notebook.
hjorth_extractor, _ = create_hjorth_transforms()

# Options shared by both splits. Prefetching is kept on, as in the original
# setup (the author's note: use it if memory allows, it may be slow otherwise).
shared_dataset_kwargs = dict(signal_transform=hjorth_extractor, prefetch=True)

print("Creating training dataset with Hjorth features...")
dataset_tr_hjorth = EEGDataset(
    clips_tr,
    signals_root=DATA_ROOT / "train",
    return_id=False,  # training samples are paired with labels
    **shared_dataset_kwargs,
)

print("Creating test dataset with Hjorth features...")
dataset_te_hjorth = EEGDataset(
    clips_te,
    signals_root=DATA_ROOT / "test",
    return_id=True,  # test samples are paired with IDs for the submission file
    **shared_dataset_kwargs,
)

# Load all data into memory for scikit-learn training
# This might take time and memory depending on the dataset size

def load_all_data(dataset, batch_size=64, num_workers=2):
    """Load an entire dataset into memory for scikit-learn style training.

    The dataset is iterated once, in order, through a ``DataLoader`` and every
    batch is expected to be a ``(features, labels_or_ids)`` pair.

    Args:
        dataset: Map-style dataset yielding ``(features, label_or_id)`` items.
        batch_size: Number of samples per loading batch (default 64).
        num_workers: Number of DataLoader worker processes (default 2); pass 0
            to load in the current process.

    Returns:
        tuple: ``(features, labels_or_ids)`` where ``features`` is one NumPy
        array with all samples stacked along axis 0 and ``labels_or_ids`` is a
        plain Python list. Tensor/array batches are unpacked with ``tolist()``
        so the list holds Python scalars rather than 0-d tensors; batches that
        are already sequences (e.g. string IDs) are extended as they are.
    """
    # The progress bar is cosmetic: fall back to plain iteration without tqdm.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    print(f"Loading all data from dataset ({len(dataset)} samples)...")
    # shuffle=False keeps features aligned with the dataset's own ordering.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    feature_batches = []
    labels_or_ids = []
    for batch_data, batch_labels_or_ids in tqdm(loader, desc="Loading data"):
        feature_batches.append(batch_data.numpy())
        if hasattr(batch_labels_or_ids, "tolist"):
            # Tensor (or ndarray) batch: convert to native Python values so
            # downstream pandas/scikit-learn code never sees tensor objects.
            labels_or_ids.extend(batch_labels_or_ids.tolist())
        else:
            labels_or_ids.extend(batch_labels_or_ids)

    if not feature_batches:
        # Empty dataset: same result as np.array([]) on an empty feature list.
        return np.array([]), labels_or_ids

    # One concatenation over whole batches instead of stacking per sample.
    return np.concatenate(feature_batches, axis=0), labels_or_ids


# Pull both splits into memory. The test dataset was built with return_id=True,
# so its second return value holds IDs rather than labels.
X_train, y_train = load_all_data(dataset_tr_hjorth)
X_test, test_ids = load_all_data(dataset_te_hjorth)

# Collapse each sample into a single feature vector: (n_samples, n_features).
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

print(f"Training data shape: X={X_train.shape}, y={len(y_train)}")
print(f"Test data shape: X={X_test.shape}, ids={len(test_ids)}")

# Hand scikit-learn the labels as a NumPy array instead of a Python list.
y_train = np.array(y_train)


Loading segment metadata...
Loaded 12993 training segments and 3614 test segments.
Creating training dataset with Hjorth features...


Creating test dataset with Hjorth features...
Loading all data from dataset (12993 samples)...


Loading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 204/204 [00:00<00:00, 639.30it/s]


Loading all data from dataset (3614 samples)...


Loading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 57/57 [00:00<00:00, 483.01it/s]

Training data shape: X=(12993, 285), y=12993
Test data shape: X=(3614, 285), ids=3614





In [20]:
# --- Model Training ---

print("Setting up and training RandomForestClassifier...")

# Random-forest hyper-parameters, gathered in one place for easy tuning.
rf_params = dict(
    n_estimators=200,           # number of trees
    random_state=42,            # reproducible forests
    n_jobs=-1,                  # use every available CPU core
    max_depth=20,               # cap tree depth to limit overfitting
    min_samples_leaf=5,         # minimum samples required in a leaf
    class_weight={0: 1, 1: 4},  # weight class 1 four times as much as class 0
)

# Scaling matters little for tree ensembles, but keeping it in the pipeline
# makes it easy to swap in scale-sensitive models (SVM, logistic regression).
model_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", RandomForestClassifier(**rf_params)),
    ]
)

# Fit on the complete training set.
model_pipeline.fit(X_train, y_train)

print("Training complete.")



Setting up and training RandomForestClassifier...
Training complete.


In [21]:
# Hold out a stratified 20% of the training data for a local sanity check.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Validate with an unfitted copy of the pipeline. Calling fit() on
# `model_pipeline` itself would replace the model trained on the full training
# set, and the test predictions would then come from a model that only saw 80%
# of the data.
val_pipeline = clone(model_pipeline)
val_pipeline.fit(X_train_split, y_train_split)
y_pred_val = val_pipeline.predict(X_val)
print("\nValidation Set Performance:")
print(classification_report(y_val, y_pred_val))


Validation Set Performance:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      2096
           1       0.91      0.70      0.79       503

    accuracy                           0.93      2599
   macro avg       0.92      0.84      0.87      2599
weighted avg       0.93      0.93      0.92      2599



In [22]:
# --- Prediction and Submission ---
# Predict a class label for every flattened test sample in X_test.
# NOTE: `model_pipeline` is used in whatever state its most recent fit() call
# left it, so the cell performing the intended fit must be the last one that
# fitted this object before this cell runs.

print("Generating predictions on the test set...")
test_predictions = model_pipeline.predict(X_test)

Generating predictions on the test set...


## K-Fold evaluation

In [None]:
from evaluation import evaluate_sklearn

# Cross-validation settings, forwarded to evaluate_sklearn unchanged.
cv_settings = dict(
    threshold=0.5,
    prefetch=True,
    signal_transform=hjorth_extractor,
    model_args=None,
    k_folds=5,
)

# NOTE(review): whether evaluate_sklearn re-fits `model_pipeline` in place or
# works on a copy is not visible from this notebook - confirm before relying
# on the pipeline's fitted state after this cell.
avg_f1_score, std_f1_score = evaluate_sklearn(
    model_pipeline,
    clips_tr,
    DATA_ROOT / "train",
    **cv_settings,
)

Fold 1/5
Fold 1 F1 Score: 0.8029
Fold 2/5
Fold 2 F1 Score: 0.6931
Fold 3/5
Fold 3 F1 Score: 0.7009
Fold 4/5
Fold 4 F1 Score: 0.7097
Fold 5/5
Fold 5 F1 Score: 0.7489
Cross-Validation Average F1 Score: 0.7311
Cross-Validation F1 Score Standard Deviation: 0.0407


## Submission

In [23]:
# Format for submission
# Use the same ID correction function as in the user's example if needed
def remove_underlines(s, enabled=False):
    """Optionally normalise underscores in a sample ID.

    The rewrite used to sit behind an unconditional early ``return`` and could
    never run. It is now reachable through an explicit opt-in flag, while the
    default call still returns the ID untouched.

    Args:
        s: The ID string.
        enabled: When False (default) ``s`` is returned unchanged. When True,
            every triple underscore is kept as a single underscore and every
            other underscore is removed.

    Returns:
        The original or the normalised ID string.
    """
    if not enabled:
        return s
    # Protect "___" with a placeholder, drop the remaining underscores, then
    # turn the placeholder into a single "_".
    return s.replace("___", "###").replace("_", "").replace("###", "_")

# Decide whether the IDs go through remove_underlines by inspecting only the
# first one; any failure while doing so falls back to the raw IDs.
try:
    first_id = test_ids[0]
    if isinstance(first_id, str) and "_" in first_id:
        print("Applying ID correction...")
        corrected_ids = list(map(remove_underlines, test_ids))
    else:
        print("IDs seem okay, not applying correction.")
        corrected_ids = test_ids
except Exception as e:
    print(f"Warning: Could not process IDs for correction - {e}. Using raw IDs.")
    corrected_ids = test_ids


# Assemble the two-column submission table and write it out without the index.
submission_filename = "submission_hjorth_rf.csv"
submission_df = pd.DataFrame({'id': corrected_ids, 'label': test_predictions})
submission_df.to_csv(submission_filename, index=False)

print(f"Kaggle submission file generated: {submission_filename}")

# --- Optional: Try other models ---
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
#
# print("\nTraining Logistic Regression...")
# lr_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced', C=0.1))])
# lr_pipeline.fit(X_train, y_train)
# print("Logistic Regression Training complete.")
# test_predictions_lr = lr_pipeline.predict(X_test)
# submission_df_lr = pd.DataFrame({'id': corrected_ids, 'label': test_predictions_lr})
# submission_df_lr.to_csv("submission_hjorth_lr.csv", index=False)
# print("Saved submission_hjorth_lr.csv")

# print("\nTraining SVM...")
# svm_pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', SVC(random_state=42, class_weight='balanced', C=1.0, gamma='scale'))]) # SVC can be slow
# svm_pipeline.fit(X_train, y_train)
# print("SVM Training complete.")
# test_predictions_svm = svm_pipeline.predict(X_test)
# submission_df_svm = pd.DataFrame({'id': corrected_ids, 'label': test_predictions_svm})
# submission_df_svm.to_csv("submission_hjorth_svm.csv", index=False)
# print("Saved submission_hjorth_svm.csv")

Applying ID correction...
Kaggle submission file generated: submission_hjorth_rf.csv
