In [1]:
import pyarrow.parquet as pq
from torch.utils.data import Dataset
import torch
from sklearn.impute import SimpleImputer
from torch.utils.data import DataLoader
from torchvision import transforms
import pandas as pd
import os
"""
# CAN RUN THIS FROM ANY NOTEBOOK
 
from spectrogram_preprocessor import *
from torch.utils.data import DataLoader
from torchvision import transforms

spectrogram_dataset = SpectrogramDataset("train", transform=transforms.Compose([
    MiddleCrop(), Impute(), LogTransform(), StackFrequencyBands()])
    )

dataloader = DataLoader(spectrogram_dataset, batch_size=32,
                        shuffle=True, num_workers=0)


for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched["values"].shape) #, "labels: ", sample_batched[1].shape)
    print(sample_batched["seizure_vote"].shape)
    print(sample_batched["lpd_vote"].shape)
    print(sample_batched["gpd_vote"].shape)
    print(sample_batched["lrda_vote"].shape)
    print(sample_batched["grda_vote"].shape)
    print(len(sample_batched["target"])) # for some reason target is a list
    # observe 4th batch and stop.
    if i_batch == 3:
        break

"""

class SpectrogramDataset(Dataset):
    """EEG spectrograms dataset."""

    def __init__(self, data_type, csv_file="/kaggle/input/hms-harmful-brain-activity-classification/train.csv", root_dir="/kaggle/input/hms-harmful-brain-activity-classification", transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_type = data_type
        if data_type == "train":
            self.data_path = root_dir + "/train_spectrograms"
            self.df_train = process_training_csv(csv_file)
        elif data_type == "test":
            self.data_path = root_dir + "/test_spectrograms"
            print(f"reading spectrograms from {self.data_path}")
            self.df_train = pd.read_csv(csv_file)
        self.transform = transform

    def reset(self):
        self.df_train = process_training_csv("hms-harmful-brain-activity-classification/train.csv")
        

    def __len__(self):
        return len(self.df_train)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if (self.data_type == "train"):
            parquet_path = os.path.join(self.data_path, str(self.df_train.iloc[idx]['spec_id']) + ".parquet")
            parquet_table = pq.read_table(parquet_path)

            sample = {"values" : parquet_table.to_pandas().values[:, 1:], # drop the time column
                "min" : self.df_train.iloc[idx]['min'],
                "max" : self.df_train.iloc[idx]['max']
                }
            if self.transform:
                sample = self.transform(sample)
                
            if sample["values"].shape[1]==0:
                return self.__getitem__(np.random.choice(len(self.data)))

            seizure_vote = self.df_train.iloc[idx]['seizure_vote']
            lpd_vote = self.df_train.iloc[idx]['lpd_vote']
            gpd_vote = self.df_train.iloc[idx]['gpd_vote']
            lrda_vote = self.df_train.iloc[idx]['lrda_vote']
            grda_vote = self.df_train.iloc[idx]['grda_vote']
            other_vote = self.df_train.iloc[idx]['other_vote']
            target = self.df_train.iloc[idx]['target']

            sample = {
                "values": sample["values"],
                "seizure_vote": seizure_vote,
                "lpd_vote": lpd_vote,
                "gpd_vote": gpd_vote,
                "lrda_vote": lrda_vote,
                "grda_vote": grda_vote,
                "other_vote": other_vote,
                "target": target
            }
        else:
            #spectrogram_id eeg_id patient_id
            parquet_path = os.path.join(self.data_path, str(self.df_train.iloc[idx]['spectrogram_id']) + ".parquet")
            parquet_table = pq.read_table(parquet_path)
            
            sample = {"values" : parquet_table.to_pandas().values[:, 1:], # drop the time column
                "min" : 0,
                "max" : 0
                }
            if self.transform:
                sample = self.transform(sample)
            
            sample = {
                "values": sample["values"],
                "patient_id": self.df_train.iloc[idx]['patient_id']
            }

        return sample


def process_training_csv(csv_file):
    """
    csv preprocessing from example notebook:
    """
    df = pd.read_csv(csv_file)
    TARGETS = df.columns[-6:]
    # Creating a Unique EEG Segment per eeg_id:
    # The code groups (groupby) the EEG data (df) by eeg_id. Each eeg_id represents a different EEG recording.
    # It then picks the first spectrogram_id and the earliest (min) spectrogram_label_offset_seconds for each eeg_id. This helps in identifying the starting point of each EEG segment.
    # The resulting DataFrame train has columns spec_id (first spectrogram_id) and min (earliest spectrogram_label_offset_seconds).
    train = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
        {'spectrogram_id':'first','spectrogram_label_offset_seconds':'min'})
    train.columns = ['spec_id','min']
    # Finding the Latest Point in Each EEG Segment:
    # The code again groups the data by eeg_id and finds the latest (max) spectrogram_label_offset_seconds for each segment.
    # This max value is added to the train DataFrame, representing the end point of each EEG segment.
    tmp = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
        {'spectrogram_label_offset_seconds':'max'})
    train['max'] = tmp
    # The code adds the patient_id for each eeg_id to the train DataFrame. This links each EEG segment to a specific patient.
    tmp = df.groupby('eeg_id')[['patient_id']].agg('first')
    train['patient_id'] = tmp
    # The code sums up the target variable counts (like votes for seizure, LPD, etc.) for each eeg_id.
    tmp = df.groupby('eeg_id')[TARGETS].agg('sum') 
    for t in TARGETS:
        train[t] = tmp[t].values
    # It then normalizes these counts so that they sum up to 1. This step converts the counts into probabilities, which is a common practice in classification tasks.
    y_data = train[TARGETS].values 
    y_data = y_data / y_data.sum(axis=1,keepdims=True)
    train[TARGETS] = y_data
    # For each eeg_id, the code includes the expert_consensus on the EEG segment's classification.
    tmp = df.groupby('eeg_id')[['expert_consensus']].agg('first')
    train['target'] = tmp
    # This makes eeg_id a regular column, making the DataFrame easier to work with.
    train = train.reset_index() 
    print('Train non-overlapp eeg_id shape:', train.shape)
    return train


class MiddleCrop(object):
    """Crop the spectrogram in a sample, centred in the middle.

    Args:
        output_size: Desired output size. If int, square crop
            is made.
    """

    def __init__(self, output_size=300):
        self.output_size = output_size

    def __call__(self, sample):
        # //2 for average, //2 for 2 seconds per bin (min and max are in seconds, spectrogram is 2 seconds per value)
        start_from = int((sample["min"] + sample["max"]) // 4) 
        cropped = sample["values"][start_from:start_from+self.output_size, :]
        return {"values": cropped, "min": 0, "max": self.output_size*2}
    
class Impute(object):
    """
    replace NaNs with mean

    """

    def __init__(self):
        self.nan_imputer = SimpleImputer(strategy='mean')

    def __call__(self, sample):
        imputed = self.nan_imputer.fit_transform(sample["values"])
        return {"values": imputed, "min": sample["min"], "max": sample["max"]}
    
class StackFrequencyBands(object):
    """Stack the 4 frequency bands of the spectrogram in a sample.

    "Args:
        sample: 300x400 spectrogram
        returns: 4x300x100 spectrogram (band/channel, time, frequency)
    """
    def __call__(self, sample):
        values = sample["values"]
        split_arrays = np.array(np.split(values, 4, axis=1))
        return {
            "values": split_arrays,
                "min": sample["min"],
                "max": sample["max"]
        }

class LogTransform(object):
    """Apply log transformation to the spectrogram in a sample.

    Args:
        sample: 4x300x100 spectrogram (band/channel, time, frequency)
        returns: 4x300x100 spectrogram (band/channel, time, frequency)
    """
    def __call__(self, sample):
        values = sample["values"]
        log_transformed = np.log(values + 1)
        return {
            "values": log_transformed,
                "min": sample["min"],
                "max": sample["max"]
        }

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.init as init
import torch.nn.init as init
import gc
gc.collect()
torch.cuda.empty_cache()

"""
Ideas To Prevent Loss Nans
1. Normalize Data Better
2. Less Deep / Wide Architecture
3. CNN instead of FCNN
"""
class AE(torch.nn.Module):
    def __init__(self, numFrequencies, numRows, numFeatures=100):
        super().__init__()

        # Building a linear encoder with Batch Normalization
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(numFrequencies * numRows, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, numFeatures),
            torch.nn.ReLU(),
        )

        # Building a linear decoder with Batch Normalization
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(numFeatures, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, numFrequencies * numRows),
            torch.nn.Sigmoid()
        )

        # Apply Xavier initialization to the weights
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


In [3]:
"""
Ideas To Prevent Loss Nans
1. Normalize Data Better
2. Less Deep / Wide Architecture
3. CNN instead of FCNN
"""
class AEW(torch.nn.Module):
    def __init__(self, numFrequencies, numRows, numFeatures=1000):
        super().__init__()

        # Building a linear encoder with Batch Normalization
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(numFrequencies * numRows, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, numFeatures),
            torch.nn.ReLU(),
        )

        # Building a linear decoder with Batch Normalization
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(numFeatures, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, numFrequencies * numRows),
            torch.nn.Sigmoid()
        )

        # Apply Xavier initialization to the weights
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [4]:
alpha_frequencies = 21 * 4
delta_frequencies = 18 * 4
theta_frequencies = 20 * 4
beta_frequencies = 41 * 4
SPEC_FREQS = 400

numFeatures = 400

ALPHA_PATH = "/kaggle/input/alpha-band-autoencoder-model/model_alpha_latest.pth"
BETA_PATH = "/kaggle/input/beta-band-autoencoder-model/model_beta_latest.pth"
DELTA_PATH = "/kaggle/input/delta-band-autoencoder-model/model_delta_latest.pth"
THETA_PATH = "/kaggle/input/theta-band-autoencoder-model/model_theta_latest.pth"
TEN_MIN_PATH = "/kaggle/input/window10min-autoencoder-version2/model_ten_min_latest.pth"
TWENTY_SEC_PATH = "/kaggle/input/window20sec-autoencoder-version2/model_20sec_latest.pth"


In [5]:
from scipy import signal
PATH = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"
def extract_frequency_band_features(segment):
    
    cols = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:] # like LR_14.32
    channel_groups = ['LL', 'RL', 'LP', 'RP']
    
    eeg_bands = {'Delta': (0.5, 4), 'Theta': (4, 8), 'Alpha': (8, 12), 'Beta': (12, 30)}
    band_datapoints = {
        "Alpha": [],
        "Delta": [],
        "Theta": [],
        "Beta": [],
    }
    
    for channel_group in channel_groups:
        for band in eeg_bands:
            low, high = eeg_bands[band]
            # Filter signal for the specific band
            idxs = []
            for idx, col in enumerate(cols):
                if channel_group in col and float(col.split("_")[1]) <= high and float(col.split("_")[1]) >= low:
                    idxs.append(idx)
                        
            filtered = segment[:, idxs].flatten()
            band_datapoints[band].append(filtered)
    
    for band in band_datapoints:
        band_datapoints[band] = np.array(band_datapoints[band]).flatten() 
        # join all 4 group signals into one to reconstruct in autoencoder
    return band_datapoints

In [6]:
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    print('Use Multi GPU', device)
else:
    device = torch.device('cpu') 
    print("Use CPU")
    

# load model alpha from memory:
model_alpha = AE(alpha_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_alpha.to(device)
    model_alpha = nn.DataParallel(module=model_alpha)
    print('Use Multi GPU', device)
model_alpha.load_state_dict(torch.load(ALPHA_PATH))
model_alpha.eval()

# load model beta from memory:
model_beta = AE(beta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_beta.to(device)
    model_beta = nn.DataParallel(module=model_beta)
    print('Use Multi GPU', device)
model_beta.load_state_dict(torch.load(BETA_PATH))
model_beta.eval()

# load model delta from memory:
model_delta = AE(delta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_delta.to(device)
    model_delta = nn.DataParallel(module=model_delta)
    print('Use Multi GPU', device)
model_delta.load_state_dict(torch.load(DELTA_PATH))
model_delta.eval()

# load model theta from memory:
model_theta = AE(theta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_theta.to(device)
    model_theta = nn.DataParallel(module=model_theta)
    print('Use Multi GPU', device)
model_theta.load_state_dict(torch.load(THETA_PATH))
model_theta.eval()

Use Multi GPU 0
Use Multi GPU 0
Use Multi GPU 0
Use Multi GPU 0
Use Multi GPU 0


DataParallel(
  (module): AE(
    (encoder): Sequential(
      (0): Linear(in_features=24000, out_features=2048, bias=True)
      (1): ReLU()
      (2): Linear(in_features=2048, out_features=2048, bias=True)
      (3): ReLU()
      (4): Linear(in_features=2048, out_features=2048, bias=True)
      (5): ReLU()
      (6): Linear(in_features=2048, out_features=400, bias=True)
      (7): ReLU()
    )
    (decoder): Sequential(
      (0): Linear(in_features=400, out_features=2048, bias=True)
      (1): ReLU()
      (2): Linear(in_features=2048, out_features=2048, bias=True)
      (3): ReLU()
      (4): Linear(in_features=2048, out_features=2048, bias=True)
      (5): ReLU()
      (6): Linear(in_features=2048, out_features=24000, bias=True)
      (7): Sigmoid()
    )
  )
)

In [7]:
device_window = torch.device('cpu')

# load model 10min from memory:
model_10min = AEW(SPEC_FREQS, 300, numFeatures=numFeatures)
model_10min = model_10min.to(device_window)
print('Use device: ', device_window)
model_10min.load_state_dict(torch.load(TEN_MIN_PATH))
model_10min.eval()

# load model 20sec from memory:
model_20sec = AEW(SPEC_FREQS, 10, numFeatures=numFeatures)
model_20sec = model_20sec.to(device_window)
print('Use device: ', device_window)
model_20sec.load_state_dict(torch.load(TWENTY_SEC_PATH))
model_20sec.eval()



Use device:  cpu
Use device:  cpu


AEW(
  (encoder): Sequential(
    (0): Linear(in_features=4000, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2048, out_features=400, bias=True)
    (7): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=400, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2048, out_features=4000, bias=True)
    (7): Sigmoid()
  )
)

In [8]:
"""
Get Feature Data
"""
batch_size = 100

FEATURES = ["feature_{}_alpha".format(i) for i in range(numFeatures)]
FEATURES += ["feature_{}_beta".format(i) for i in range(numFeatures)]
FEATURES += ["feature_{}_delta".format(i) for i in range(numFeatures)]
FEATURES += ["feature_{}_theta".format(i) for i in range(numFeatures)]
FEATURES += ["feature_{}_10min".format(i) for i in range(numFeatures)]
FEATURES += ["feature_{}_20sec".format(i) for i in range(numFeatures)]


spectrogram_dataset = SpectrogramDataset("train", transform=transforms.Compose([
    MiddleCrop(), Impute(), LogTransform()])
    )

dataloader = DataLoader(spectrogram_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=2)

print(f"Generating {len(FEATURES)} features on {len(spectrogram_dataset)} datapoints")
# train data
data = np.zeros((len(spectrogram_dataset), len(FEATURES)))
# train labels
labels = np.zeros((len(spectrogram_dataset), 6))

for k, batch in enumerate(dataloader):
    print("batch ", k, end="... ")
    b_size = batch["values"].shape[0] # this batch's size may be different from batch_size (mainly for last batch)

    labels[k*batch_size:k*batch_size + b_size, 0] = batch["seizure_vote"]
    labels[k*batch_size:k*batch_size + b_size, 1] = batch["lpd_vote"]
    labels[k*batch_size:k*batch_size + b_size, 2] = batch["gpd_vote"]
    labels[k*batch_size:k*batch_size + b_size, 3] = batch["lrda_vote"]
    labels[k*batch_size:k*batch_size + b_size, 4] = batch["grda_vote"]
    labels[k*batch_size:k*batch_size + b_size, 5] = batch["other_vote"]
    
    
    for i, eeg_segment in enumerate(batch["values"]):
        
        signals = {
            "Alpha": [], "Beta": [], "Delta": [], "Theta": []
        }
    
        signals = extract_frequency_band_features(eeg_segment)
            
        vals = signals["Alpha"]
        if len(vals) == alpha_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_alpha.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 0:numFeatures] = x
        
        vals = signals["Beta"]
        if len(vals) == beta_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_beta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, numFeatures:2*numFeatures] = x
        
        vals = signals["Delta"]
        if len(vals) == delta_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_delta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 2*numFeatures:3*numFeatures] = x
        
        vals = signals["Theta"]
        if len(vals) == theta_frequencies * 300 * 4:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_theta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 3*numFeatures:4*numFeatures] = x
        
        # 10 MINUTE WINDOW FEATURES
        raw_values_10min = eeg_segment.flatten()
        normalized_values_10min = (raw_values_10min - raw_values_10min.min()) / (raw_values_10min.max() - raw_values_10min.min())
        x = np.array(model_10min.encoder(torch.tensor(np.array(normalized_values_10min).reshape(1,-1), dtype=torch.float32).to(device_window)).tolist())    
        data[k*batch_size + i,4*numFeatures:5*numFeatures] = x

        # 20 SECOND WINDOW FEATURES
        raw_values_20sec = eeg_segment[145:155].flatten()
        normalized_values_20sec = (raw_values_20sec - raw_values_20sec.min()) / (raw_values_20sec.max() - raw_values_20sec.min())
        x = np.array(model_20sec.encoder(torch.tensor(np.array(normalized_values_20sec).reshape(1,-1), dtype=torch.float32).to(device_window)).tolist())
        data[k*batch_size + i,5*numFeatures:6*numFeatures] = x

#train[FEATURES] = data

print('New train shape: ', data.shape)
print('Labels shape: ', labels.shape)

Train non-overlapp eeg_id shape: (17089, 12)
Generating 2400 features on 17089 datapoints
batch  0... batch  1... batch  2... batch  3... batch  4... batch  5... batch  6... batch  7... batch  8... batch  9... batch  10... batch  11... batch  12... batch  13... batch  14... batch  15... batch  16... batch  17... batch  18... batch  19... batch  20... batch  21... batch  22... batch  23... batch  24... batch  25... batch  26... batch  27... batch  28... batch  29... batch  30... batch  31... batch  32... batch  33... batch  34... batch  35... batch  36... batch  37... batch  38... batch  39... batch  40... batch  41... batch  42... batch  43... batch  44... batch  45... batch  46... batch  47... batch  48... batch  49... batch  50... batch  51... batch  52... batch  53... batch  54... batch  55... batch  56... batch  57... batch  58... batch  59... batch  60... batch  61... batch  62... batch  63... batch  64... batch  65... batch  66... batch  67... batch  68... batch  69... batch  70.

In [9]:
del model_alpha, model_beta, model_delta, model_theta, model_10min, model_20sec
torch.cuda.empty_cache()

In [10]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the features and transform them
features_scaled = scaler.fit_transform(data)

# Create a DataFrame from the scaled features
train_scaled_df = pd.DataFrame(features_scaled)
train_labels_df = pd.DataFrame(labels)

train_scaled_df.to_csv("/kaggle/working/train_scaled.csv")
train_labels_df.to_csv("/kaggle/working/train_labels.csv")

In [11]:
train_scaled_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2390,2391,2392,2393,2394,2395,2396,2397,2398,2399
count,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,...,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0
mean,-4.4281590000000005e-17,0.0,8.315790999999999e-19,-1.496842e-17,8.107896e-18,2.494737e-18,0.0,-1.455263e-18,4.9063170000000005e-17,0.0,...,0.0,0.0,1.6007900000000002e-17,0.0,0.0,0.0,0.0,4.9063170000000005e-17,0.0,0.0
std,1.000029,0.0,1.000029,1.000029,1.000029,1.000029,0.0,1.000029,1.000029,0.0,...,0.0,0.0,1.000029,0.0,0.0,0.0,0.0,1.000029,0.0,0.0
min,-0.1223781,0.0,-0.0118079,-0.4594322,-0.007649876,-0.01883016,0.0,-0.0144537,-0.1732432,0.0,...,0.0,0.0,-1.084701,0.0,0.0,0.0,0.0,-0.9271445,0.0,0.0
25%,-0.1223781,0.0,-0.0118079,-0.4594322,-0.007649876,-0.01883016,0.0,-0.0144537,-0.1732432,0.0,...,0.0,0.0,-0.7159386,0.0,0.0,0.0,0.0,-0.7851902,0.0,0.0
50%,-0.1223781,0.0,-0.0118079,-0.4594322,-0.007649876,-0.01883016,0.0,-0.0144537,-0.1732432,0.0,...,0.0,0.0,-0.1873895,0.0,0.0,0.0,0.0,-0.2279119,0.0,0.0
75%,-0.1223781,0.0,-0.0118079,-0.002599162,-0.007649876,-0.01883016,0.0,-0.0144537,-0.1732432,0.0,...,0.0,0.0,0.4255914,0.0,0.0,0.0,0.0,0.4364051,0.0,0.0
max,29.06922,0.0,114.1936,13.00626,130.7211,90.81096,0.0,86.4597,26.42697,0.0,...,0.0,0.0,10.07905,0.0,0.0,0.0,0.0,10.90353,0.0,0.0


In [12]:
train_labels_df.describe()

Unnamed: 0,0,1,2,3,4,5
count,17089.0,17089.0,17089.0,17089.0,17089.0,17089.0
mean,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
std,0.331563,0.295541,0.258825,0.187005,0.271425,0.418454
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.333333
75%,0.0,0.068966,0.0,0.0,0.0,0.941176
max,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
import xgboost as xgb
from sklearn.svm import SVC
import gc
from sklearn.model_selection import KFold, GroupKFold
import pickle

In [14]:
print("features: ", len(FEATURES))
print(train_scaled_df.shape)
print(train_labels_df.shape)

features:  2400
(17089, 2400)
(17089, 6)


In [15]:
from sklearn.multioutput import MultiOutputRegressor
from scipy.stats import entropy
import joblib

all_oof = []
all_true = []

n_splits = 5
gkf = GroupKFold(n_splits=5)
kf = KFold(n_splits=5)
for i, (train_index, valid_index) in enumerate(kf.split(train_scaled_df)):   
    if i >= n_splits:
        continue
    print('#'*25)
    print(f'### Fold {i+1}')
    print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    print('#'*25)
    
    # Instantiate the XGBRegressor model
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate = 0.1) # uses MSE to predict probabilities

    model = MultiOutputRegressor(xgb_model) # since we have multiple outputs

    
    # Prepare training and validation data
    TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}
    X_train = train_scaled_df.loc[train_index]
    y_train = train_labels_df.loc[train_index]

    X_valid = train_scaled_df.loc[valid_index]
    y_valid = train_labels_df.loc[valid_index]

    # Fit the model
    model.fit(X_train, y_train)
    
    # save model
    joblib.dump(model, f"/kaggle/working/multi_regressor_{i}.pkl") 

    y_pred = model.predict(X_valid)
    y_pred[y_pred < 0] = 0

    oof = y_pred / np.sum(y_pred, axis=1).reshape(-1,1) # ensure they sum to 1
    #adjustment = 1 - np.sum(oof, axis=1)
    #oof[:, 0] += adjustment
    
    true = y_valid.values

    KL_divergence = np.mean(np.sum(true * (np.log(true + 1e-10) - np.log(oof + 1e-10)), axis=1))
    print("KL divergence: ", KL_divergence)

    
    all_oof.append(oof)
    all_true.append(true)
    
    del X_train, y_train, X_valid, y_valid, oof
    gc.collect()
    
all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

#########################
### Fold 1
### train size 13671, valid size 3418
#########################
KL divergence:  0.7560883088372607
#########################
### Fold 2
### train size 13671, valid size 3418
#########################
KL divergence:  0.7723165561193774
#########################
### Fold 3
### train size 13671, valid size 3418
#########################
KL divergence:  0.7509126041568722
#########################
### Fold 4
### train size 13671, valid size 3418
#########################
KL divergence:  0.8173506473712132
#########################
### Fold 5
### train size 13672, valid size 3417
#########################
KL divergence:  0.7707770461161145


In [16]:
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
print('Test shape',test.shape)
test.head()

Test shape (1, 3)


Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


In [17]:
# load model alpha from memory:
model_alpha = AE(alpha_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_alpha.to(device)
    model_alpha = nn.DataParallel(module=model_alpha)
    print('Use Multi GPU', device)
model_alpha.load_state_dict(torch.load(ALPHA_PATH))
model_alpha.eval()

# load model beta from memory:
model_beta = AE(beta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_beta.to(device)
    model_beta = nn.DataParallel(module=model_beta)
    print('Use Multi GPU', device)
model_beta.load_state_dict(torch.load(BETA_PATH))
model_beta.eval()

# load model delta from memory:
model_delta = AE(delta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_delta.to(device)
    model_delta = nn.DataParallel(module=model_delta)
    print('Use Multi GPU', device)
model_delta.load_state_dict(torch.load(DELTA_PATH))
model_delta.eval()

# load model theta from memory:
model_theta = AE(theta_frequencies, 300, numFeatures=numFeatures)
if torch.cuda.device_count() > 1:
    device = torch.cuda.current_device()
    model_theta.to(device)
    model_theta = nn.DataParallel(module=model_theta)
    print('Use Multi GPU', device)
model_theta.load_state_dict(torch.load(THETA_PATH))
model_theta.eval()

Use Multi GPU 0
Use Multi GPU 0
Use Multi GPU 0
Use Multi GPU 0


DataParallel(
  (module): AE(
    (encoder): Sequential(
      (0): Linear(in_features=24000, out_features=2048, bias=True)
      (1): ReLU()
      (2): Linear(in_features=2048, out_features=2048, bias=True)
      (3): ReLU()
      (4): Linear(in_features=2048, out_features=2048, bias=True)
      (5): ReLU()
      (6): Linear(in_features=2048, out_features=400, bias=True)
      (7): ReLU()
    )
    (decoder): Sequential(
      (0): Linear(in_features=400, out_features=2048, bias=True)
      (1): ReLU()
      (2): Linear(in_features=2048, out_features=2048, bias=True)
      (3): ReLU()
      (4): Linear(in_features=2048, out_features=2048, bias=True)
      (5): ReLU()
      (6): Linear(in_features=2048, out_features=24000, bias=True)
      (7): Sigmoid()
    )
  )
)

In [18]:
device_window = torch.device('cpu')

# load model 10min from memory:
model_10min = AEW(SPEC_FREQS, 300, numFeatures=numFeatures)
model_10min = model_10min.to(device_window)
print('Use device: ', device_window)
model_10min.load_state_dict(torch.load(TEN_MIN_PATH))
model_10min.eval()

# load model 20sec from memory:
model_20sec = AEW(SPEC_FREQS, 10, numFeatures=numFeatures)
model_20sec = model_20sec.to(device_window)
print('Use device: ', device_window)
model_20sec.load_state_dict(torch.load(TWENTY_SEC_PATH))
model_20sec.eval()

Use device:  cpu
Use device:  cpu


AEW(
  (encoder): Sequential(
    (0): Linear(in_features=4000, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2048, out_features=400, bias=True)
    (7): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=400, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=2048, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2048, out_features=4000, bias=True)
    (7): Sigmoid()
  )
)

In [19]:
"""
Get Feature Data
"""
%time
# ENGINEER FEATURES
import warnings
warnings.filterwarnings('ignore')

batch_size = 32
TEST_FEATURES = FEATURES

spectrogram_dataset = SpectrogramDataset("test", csv_file="/kaggle/input/hms-harmful-brain-activity-classification/test.csv",
                                         root_dir="/kaggle/input/hms-harmful-brain-activity-classification", 
                                         transform=transforms.Compose([
                                            Impute(), LogTransform()])
                                        )

dataloader = DataLoader(spectrogram_dataset, batch_size=batch_size,
                        shuffle=False, num_workers=1)

    
print(f"Generating {6 * numFeatures} features on {len(spectrogram_dataset)} datapoints")


# test data
data = np.zeros((len(spectrogram_dataset), len(TEST_FEATURES)))

for k, batch in enumerate(dataloader):
    print("batch ", k, end="... ")
    b_size = batch["values"].shape[0] # this batch's size may be different from batch_size (mainly for last batch)
    
    
    for i, eeg_segment in enumerate(batch["values"]):
        
        signals = {
            "Alpha": [], "Beta": [], "Delta": [], "Theta": []
        }
    
        signals = extract_frequency_band_features(eeg_segment)
            
        vals = signals["Alpha"]
        if len(vals) == alpha_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_alpha.module.encoder(torch.tensor([norm_vals], dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 0:numFeatures] = x
        
        vals = signals["Beta"]
        if len(vals) == beta_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_beta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, numFeatures:2*numFeatures] = x
        
        vals = signals["Delta"]
        if len(vals) == delta_frequencies * 300:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_delta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 2*numFeatures:3*numFeatures] = x
        
        vals = signals["Theta"]
        if len(vals) == theta_frequencies * 300 * 4:
            norm_vals = (vals - vals.min()) / (vals.max() - vals.min())
            x = np.array(model_theta.module.encoder(torch.tensor(np.array(norm_vals).reshape(1,-1), dtype=torch.float32).to(device)).tolist())
        else:
            x = np.array([0 for i in range(numFeatures)])
        data[k*batch_size + i, 3*numFeatures:4*numFeatures] = x
        
        # 10 MINUTE WINDOW FEATURES
        raw_values_10min = eeg_segment.flatten()
        normalized_values_10min = (raw_values_10min - raw_values_10min.min()) / (raw_values_10min.max() - raw_values_10min.min())
        x = np.array(model_10min.encoder(torch.tensor(np.array(normalized_values_10min).reshape(1,-1), dtype=torch.float32).to(device_window)).tolist())    
        data[k*batch_size + i,4*numFeatures:5*numFeatures] = x

        # 20 SECOND WINDOW FEATURES
        raw_values_20sec = eeg_segment[145:155].flatten()
        normalized_values_20sec = (raw_values_20sec - raw_values_20sec.min()) / (raw_values_20sec.max() - raw_values_20sec.min())
        x = np.array(model_20sec.encoder(torch.tensor(np.array(normalized_values_20sec).reshape(1,-1), dtype=torch.float32).to(device_window)).tolist())
        data[k*batch_size + i,5*numFeatures:6*numFeatures] = x    


print('New test shape: ', data.shape)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs
reading spectrograms from /kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms
Generating 2400 features on 1 datapoints
batch  0... New test shape:  (1, 2400)


In [20]:
del model_alpha, model_beta, model_delta, model_theta, model_10min, model_20sec
torch.cuda.empty_cache()

In [21]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler to the features and transform them
features_scaled = scaler.transform(data)

# Create a DataFrame from the scaled features
test_scaled_df = pd.DataFrame(features_scaled)

test_scaled_df.to_csv("/kaggle/working/test_scaled.csv")

In [22]:
# INFER XGBOOST ON TEST
preds = []

for i in range(n_splits):
    print(i, ', ', end='')
    
    # Load the model
    model = joblib.load(f"multi_regressor_{i}.pkl")

    # Make predictions
    pred = model.predict(test_scaled_df)
    pred[pred<0] = 0
    pred = pred / np.sum(pred, axis=1).reshape(-1,1) # ensure they sum to 1
#     adjustment = 1 - np.sum(pred, axis=1)
#     pred[:, 0] += adjustment
    
    preds.append(pred)
    

# Average the predictions from each fold
pred = np.mean(preds, axis=0)
print()
print('Test preds shape', pred.shape)

0 , 1 , 2 , 3 , 4 , 
Test preds shape (1, 6)


In [23]:
TARGETS = ['seizure_vote', 'lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[TARGETS] = pred
sub.to_csv('submission.csv',index=False)
print('Submission shape',sub.shape)
sub.head()

Submission shape (1, 7)


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.15119,0.045008,0.002161,0.161519,0.020744,0.619378


In [24]:
# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
sub.iloc[:,-6:].sum(axis=1)

0    1.0
dtype: float32