# EEG Eye State Classification

## Problem
We want to predict whether a subject's eyes are open or closed from EEG signals.
This is a binary classification problem: a model learns patterns in the EEG channel readings that separate the two eye states, using the `eyeDetection` column as the target.


## Setup

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


## Load data

In [None]:
# Location of the raw CSV; the relative path is resolved against the
# notebook's working directory.
csv_path = '../EEG-Eye-State.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the parse.
df.head()

## Basic EDA

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing
# values or columns that were parsed with an unexpected type.
df.info()

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row.
df.describe().transpose()

In [None]:
# Share of rows belonging to each value of the target column.
eye_state = df['eyeDetection']
eye_state.value_counts(normalize=True)

In [None]:
# Absolute row count per target value, drawn as a bar chart.
class_counts = df['eyeDetection'].value_counts()
_ = class_counts.plot(kind='bar', title='Class balance')
plt.show()

## Train/test split

In [None]:
# Separate the target column from the feature columns.
y = df['eyeDetection']
X = df.drop(columns=['eyeDetection'])

# Hold out 30 % of the rows, then halve that hold-out into validation and
# test sets (15 % of the data each). Both splits are stratified on the target
# and use a fixed seed so the partition is reproducible.
# NOTE(review): train_test_split shuffles rows by default; if the CSV rows
# are consecutive samples of one recording, neighbouring samples end up in
# different splits -- confirm that this is acceptable for the evaluation.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Show the (rows, columns) shape of each partition.
tuple(part.shape for part in (X_train, X_val, X_test))

## Baseline models

In [None]:
# Candidate baselines. Only logistic regression is wrapped in a pipeline with
# a scaler; the two tree ensembles are fitted on the raw feature values.
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
random_forest = RandomForestClassifier(
    n_estimators=200, random_state=42, n_jobs=-1
)
gradient_boost = GradientBoostingClassifier(random_state=42)

models = {
    'log_reg': log_reg,
    'random_forest': random_forest,
    'gradient_boost': gradient_boost,
}


def _score_on_validation(estimator):
    """Return (accuracy, ROC AUC) of a fitted estimator on the validation split.

    ROC AUC is computed from the second column of ``predict_proba`` when the
    estimator provides that method, and from the hard predictions otherwise.
    """
    labels = estimator.predict(X_val)
    if hasattr(estimator, 'predict_proba'):
        scores = estimator.predict_proba(X_val)[:, 1]
    else:
        scores = labels
    return accuracy_score(y_val, labels), roc_auc_score(y_val, scores)


# Fit every candidate on the training split and collect one
# (name, accuracy, roc_auc) tuple per model.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    results.append((name, *_score_on_validation(model)))

results

## Hyperparameter tuning

In [None]:
# Random-forest settings to try: 2 x 3 x 2 = 12 combinations.
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[None, 8, 16],
    min_samples_split=[2, 5],
)

rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Exhaustive 3-fold cross-validated search over the grid, ranked by ROC AUC.
# Only the training split is used, so validation and test rows stay unseen.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score.
(grid.best_params_, grid.best_score_)

## Final evaluation

In [None]:
# Refit the tuned forest on train + validation rows, so the final model sees
# every row that is not part of the test set. Note that this refits the
# estimator object held by `grid` in place.
best_model = grid.best_estimator_
X_fit = pd.concat([X_train, X_val])
y_fit = pd.concat([y_train, y_val])
best_model.fit(X_fit, y_fit)

# Score once on the untouched test split.
pred = best_model.predict(X_test)
proba = best_model.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, pred)
test_auc = roc_auc_score(y_test, proba)

print('Accuracy:', test_accuracy)
print('ROC AUC:', test_auc)
print(classification_report(y_test, pred))

## Save model

In [None]:
from pathlib import Path

import joblib

# Directory that receives the fitted model and its metadata.
model_dir = Path('../models')
# Create the directory up front: writing into a missing directory raises
# FileNotFoundError, which would otherwise surface only after all the
# training above has already run (e.g. on a fresh checkout).
model_dir.mkdir(parents=True, exist_ok=True)

# Persist the refitted estimator.
joblib.dump(best_model, model_dir / 'model.pkl')

# Record what is needed to rebuild the model's input at inference time:
# the feature column order, the target column name and the tuned parameters.
meta = {
    'features': list(X.columns),
    'target': 'eyeDetection',
    'best_params': grid.best_params_,
}
with open(model_dir / 'metadata.json', 'w', encoding='utf-8') as f:
    json.dump(meta, f, indent=2)
