In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from brain_solver import helpers as hp
from brain_solver import trainer as tr
from brain_solver import EEGDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import os
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from sklearn import model_selection
import torchvision.transforms as transforms
import torchvision.io
from PIL import Image
import torch.multiprocessing as mp
import warnings
import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    BackboneFinetuning,
    EarlyStopping,
)

VER = 3

## General Variables for the Project
Some variables need to be set on each machine to make sure that the project will run correctly every time.

In [2]:
class Config:
    data_path = (
        "/mnt/hdd_library/Kaggle/hms-harmful-brain-activity-classification/"
    )
    data_train_csv = data_path + "train.csv"
    data_eeg = data_path + "train_eegs/"
    data_output = './out'


    use_aug = False
    num_classes = 6
    batch_size = 88
    epochs = 20
    PRECISION = 16    
    PATIENCE = 20    
    seed = 2024
    pretrained = False            
    weight_decay = 1e-2
    use_mixup = False
    mixup_alpha = 0.1   
    num_channels = 8
    LR = 8e-3
    processed_train = None
    trn_folds = [0, 1, 2, 3, 4]

In [3]:
# Create out folder
if not os.path.exists(Config.data_output):
    os.makedirs(Config.data_output)

pl.seed_everything(Config.seed, workers=True)

Seed set to 2024


2024

In [4]:
def config_to_dict(cfg):
    return dict(
        (name, getattr(cfg, name)) for name in dir(cfg) if not name.startswith("__")
    )

In [5]:
df :pd.DataFrame = hp.Helpers.load_csv(Config.data_train_csv)

if df is not None:
    print(df.shape)
else:
    print("Failed to load the CSV file.")

(106800, 15)


In [6]:
EEG_IDS = df.eeg_id.unique()

TARGETS = df.columns[-6:]
TARS = {"Seizure": 0, "LPD": 1, "GPD": 2, "LRDA": 3, "GRDA": 4, "Other": 5}
TARS_INV = {x: y for y, x in TARS.items()}

train = df.groupby("eeg_id")[["patient_id"]].agg("first")

tmp = df.groupby("eeg_id")[TARGETS].agg("sum")
for t in TARGETS:
    train[t] = tmp[t].values

y_data = train[TARGETS].values
y_data = y_data / y_data.sum(axis=1, keepdims=True)
train[TARGETS] = y_data

tmp = df.groupby("eeg_id")[["expert_consensus"]].agg("first")
train["target"] = tmp

train = train.reset_index()
train = train.loc[train.eeg_id.isin(EEG_IDS)]
print("Train Data with unique eeg_id shape:", train.shape)

Train Data with unique eeg_id shape: (17089, 9)


In [9]:
train.head()

Unnamed: 0,eeg_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


In [10]:
Config.num_classes = len(TARS.keys())

In [11]:
train.describe()

Unnamed: 0,eeg_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
count,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0
mean,2135226000.0,32839.981977,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
std,1235712000.0,18351.751174,0.331563,0.295541,0.258825,0.187005,0.271425,0.418454
min,568657.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1062096000.0,17408.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2123560000.0,32068.0,0.0,0.0,0.0,0.0,0.0,0.333333
75%,3208261000.0,48272.0,0.0,0.068966,0.0,0.0,0.0,0.941176
max,4294958000.0,65494.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
CREATE_EEGS = True

df = pd.read_parquet(f"{Config.data_path}train_eegs/1000913311.parquet")
FEATS = df.columns
print(f"There are {len(FEATS)} raw eeg features")
print(list(FEATS))

if Config.data_eeg is not None:
    raw_eegs = np.load(Config.data_eeg, allow_pickle=True).item()
else:

    all_eegs = {}
    DISPLAY = 4
    EEG_IDS = train.eeg_id.unique()
    PATH = f"{Config.data_path}train_eegs/"

    for i, eeg_id in enumerate(EEG_IDS):
        if (i % 100 == 0) & (i != 0):
            print(i, ", ", end="")

        # SAVE EEG TO PYTHON DICTIONARY OF NUMPY ARRAYS
        data = hp.Helpers.eeg_from_parquet(
            f"{PATH}{eeg_id}.parquet", FEATS=FEATS, display=i < DISPLAY
        )
        all_eegs[eeg_id] = data

        if i == DISPLAY:
            if CREATE_EEGS:
                print(f"Processing {train.eeg_id.nunique()} eeg parquets... ", end="")
            else:
                print(f"Reading {len(EEG_IDS)} eeg NumPys from disk.")
                break

    if CREATE_EEGS:
        np.save(f"{Config.data_path}eegs_20ch", all_eegs)

There are 20 raw eeg features
['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']


IsADirectoryError: [Errno 21] Is a directory: '/mnt/hdd_library/Kaggle/hms-harmful-brain-activity-classification/train_eegs/'

In [None]:
len(raw_eegs)

In [None]:
raw_eegs[train.loc[0, "eeg_id"]].shape

## Create ResNet18 model
Resnet expects 3 layers

In [None]:
# Model creation
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True) # Can change to 152 or some other version

Using cache found in /home/luppo/.cache/torch/hub/pytorch_vision_v0.10.0


In [None]:
dataset = EEGDataset(
    df=train, data_eeg=data_eeg, mode="train"
)

data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
trainer = tr.Trainer(
    model=model,
    data_loaders={
        "train": data_loader,
        "val": data_loader,
    },  # Consider separating train and val datasets
    lr=0.01,
    num_classes=len(
        np.unique(train["target_encoded"])
    ),  # Ensure this matches the unique labels count
)

trainer.train(epochs=10)

  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).unsqueeze(0)  # Add a channel dimension if necessary
  X = torch.tensor(X).uns

RuntimeError: The size of tensor a (64) must match the size of tensor b (6) at non-singleton dimension 1