# 1. Initial Steps and Data Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
import random

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
SEED = 42

# Seed every random number generator this notebook draws from. The stdlib
# `random` module is seeded too because the hyperparameter configurations
# are picked with random.sample; without it the search is not repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Folder prefix that every CSV path in this notebook is built from.
DATA_DIRECTORY = './Data/hyperaktiv_with_controls/hyperaktiv_with_controls/'

# Participant IDs kept for the analysis; rows of patient_info.csv with any
# other ID are filtered out.
VALID_IDs = [
    1, 3, 5, 11, 15, 19, 20, 21, 22, 23,
    24, 32, 33, 34, 35, 36, 37, 39, 41, 42,
    43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
    53, 55, 56, 57, 58, 59, 60, 61, 63, 64,
    68, 71, 73, 75, 77, 78, 79, 81, 82, 83,
    84, 85, 87, 88, 89, 90, 91, 93, 94, 95,
    97, 98, 101, 104, 105,
]

In [None]:
demographic_data = pd.read_csv(f'{DATA_DIRECTORY}patient_info.csv', sep=';')
# Keep only the participants whose ID is listed in VALID_IDs.
demographic_data = demographic_data[demographic_data['ID'].isin(VALID_IDs)]
# Extract labels for these IDs (in the row order of the filtered frame)
labels = demographic_data['ADHD'].values

# Output the labels to verify
print(labels)

# value_counts() orders its result by frequency, not by class value. Pin the
# order to class 0 then class 1 so the hard-coded tick labels below always
# land on the right bars, whichever class happens to be larger.
class_counts = demographic_data['ADHD'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', title='ADHD class balance in the dataset')
plt.xticks([0, 1], ['Control', 'ADHD'], rotation=0)
plt.ylabel('Number of records')
plt.show()

In [None]:
# List the IDs of the control (ADHD == 0) and ADHD (ADHD == 1) participants
is_control = demographic_data['ADHD'] == 0
is_adhd = demographic_data['ADHD'] == 1
control_ids = demographic_data.loc[is_control, 'ID'].values
adhd_ids = demographic_data.loc[is_adhd, 'ID'].values

print(f'Number of control patients: {len(control_ids)}; IDS: {control_ids}')
print(f'Number of ADHD patients: {len(adhd_ids)}; IDS: {adhd_ids}')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

def enhanced_split_dataset(ids, labels, train_ratio=0.80, val_ratio=0.20, random_seed=42):
    """Split IDs into stratified train and validation subsets.

    Args:
        ids: Sequence of identifiers; converted to a NumPy array.
        labels: Class label of each identifier, in the same order as ``ids``.
        train_ratio: Fraction of the data to use for training.
        val_ratio: Fraction of the data to use for validation.
        random_seed: Seed passed to ``StratifiedShuffleSplit``.

    Returns:
        Tuple ``(train_ids, val_ids)`` of NumPy arrays holding the selected IDs.

    Raises:
        ValueError: If ``ids`` and ``labels`` differ in length, or if the
            ratios are not fractions in (0, 1) summing to at most 1.
    """
    ids = np.asarray(ids)
    labels = np.asarray(labels)
    if len(ids) != len(labels):
        raise ValueError(
            f"ids and labels must have the same length, got {len(ids)} and {len(labels)}"
        )

    tolerance = 1e-9
    ratios_are_fractions = 0 < train_ratio < 1 and 0 < val_ratio < 1
    if not ratios_are_fractions or train_ratio + val_ratio > 1 + tolerance:
        raise ValueError(
            "train_ratio and val_ratio must be fractions in (0, 1) that sum to at most 1, "
            f"got {train_ratio} and {val_ratio}"
        )

    # When the two ratios cover the whole dataset, leave train_size unset so
    # the training set is simply the complement of the validation set (the
    # historical behaviour). Otherwise honour train_ratio explicitly instead
    # of silently ignoring it.
    covers_everything = abs(train_ratio + val_ratio - 1.0) <= tolerance
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=val_ratio,
        train_size=None if covers_everything else train_ratio,
        random_state=random_seed,
    )

    # split() yields positional indices; map them back to the actual IDs.
    train_positions, val_positions = next(splitter.split(ids, labels))
    return ids[train_positions], ids[val_positions]

# Split the participants into train and validation IDs.
# IDs and labels are both read from the filtered demographic table, so entry i
# of one always describes the same participant as entry i of the other. Pairing
# VALID_IDs with `labels` only worked if the CSV rows happened to be in the
# same order as the list.
train_ids, val_ids = enhanced_split_dataset(
    demographic_data['ID'].to_numpy(),
    demographic_data['ADHD'].to_numpy(),
)

In [None]:
# Demographic rows belonging to each split.
demographic_data_train = demographic_data[demographic_data['ID'].isin(train_ids)]
demographic_data_val = demographic_data[demographic_data['ID'].isin(val_ids)]

for split_label, split_frame in (('train', demographic_data_train),
                                 ('validation', demographic_data_val)):
    # value_counts() sorts by frequency; force the order class 0, class 1 so
    # the fixed 'Control'/'ADHD' tick labels always match their bars.
    split_counts = split_frame['ADHD'].value_counts().reindex([0, 1], fill_value=0)
    split_counts.plot(kind='bar', title=f'ADHD class balance in the {split_label} dataset')
    plt.xticks([0, 1], ['Control', 'ADHD'], rotation=0)
    plt.ylabel('Number of records')
    plt.show()

In [None]:
def scale_data(data):
    """Scale a 1-D array with a freshly fitted ``RobustScaler``.

    The array is reshaped to a single column, since the scaler works on 2-D
    input, and flattened back to 1-D afterwards. The scaler is fitted on the
    given data only.
    """
    as_column = data.reshape(-1, 1)
    scaled_column = RobustScaler().fit_transform(as_column)
    return scaled_column.flatten()

In [None]:
def _one_second_series(raw, value_column):
    """Return ``raw[value_column]`` averaged on a 1-second grid, gaps forward-filled."""
    timestamps = pd.to_datetime(raw['TIMESTAMP'], errors='coerce')
    series = pd.Series(raw[value_column].to_numpy(), index=pd.DatetimeIndex(timestamps))
    # Timestamps that could not be parsed become NaT and cannot be binned.
    series = series[series.index.notna()]
    # A Timedelta rule avoids the deprecated upper-case 'S' frequency alias.
    resampled = series.resample(pd.Timedelta(seconds=1)).mean()
    # Resampling creates empty (NaN) bins wherever no sample fell into a
    # second; carry the last observed value forward into them.
    return resampled.ffill()


def load_data(sample, demographic_data):
    """Load, resample and scale the HRV and activity recordings of each patient.

    Args:
        sample: Iterable of patient IDs to load.
        demographic_data: DataFrame with at least the columns ``ID`` and
            ``ADHD``; it must contain a row for every ID in ``sample``.

    Returns:
        Dict mapping each patient ID to
        ``{'hrv': ndarray, 'activity': ndarray, 'adhd': label}``, where both
        signals are on a 1-second grid and scaled with ``scale_data``.

    Raises:
        IndexError: If a patient ID has no row in ``demographic_data``.
    """
    patients_data = {}  # Dictionary to store data

    for patient_id in sample:
        hrv_data = pd.read_csv(f'{DATA_DIRECTORY}hrv_data_train/patient_hr_{patient_id}.csv', sep=';')
        activity_data = pd.read_csv(f'{DATA_DIRECTORY}activity_data_train/patient_activity_{patient_id}.csv', sep=';')
        # ADHD label of this patient
        label = demographic_data.loc[demographic_data['ID'] == patient_id, 'ADHD'].values[0]

        # Put both signals on the same 1-second grid before scaling. The
        # activity signal used to be scaled from the raw frame, which threw
        # away its resampled/filled version and let raw NaNs through.
        hrv_series = scale_data(_one_second_series(hrv_data, 'HRV').to_numpy())
        activity_series = scale_data(_one_second_series(activity_data, 'ACTIVITY').to_numpy())

        # Store in dictionary
        patients_data[patient_id] = {
            'hrv': hrv_series,
            'activity': activity_series,
            'adhd': label,
        }

    return patients_data


In [None]:

    

# def truncate_and_pad_sequences(sequence, max_length):
#     # Truncate if necessary and pad sequences that are too short
#     sequence = sequence[:max_length]  # Truncate if longer than max_length
#     if len(sequence) < max_length:
#         sequence = np.pad(sequence, (0, max_length - len(sequence)), 'constant', constant_values=0)  # Pad with zeros
#     return sequence


# def load_data(sample, demographic_data):
#     patients_data = {}

#     for patient_id in sample:
#         hrv_data = pd.read_csv(f'{DATA_DIRECTORY}hrv_data_train/patient_hr_{patient_id}.csv', sep=';')
#         activity_data = pd.read_csv(f'{DATA_DIRECTORY}activity_data_train/patient_activity_{patient_id}.csv', sep=';')
#         label = demographic_data[demographic_data['ID'] == patient_id]['ADHD'].values[0]

#         # Scale data
#         hrv_series = scale_data(hrv_data['HRV'].values)
#         activity_series = scale_data(activity_data['ACTIVITY'].values)

#         # Store in dictionary
#         patients_data[patient_id] = {
#             'hrv': hrv_series,
#             'activity': activity_series,
#             'adhd': label
#         }

#     return patients_data

In [None]:
# Load the recordings of each split, looking up labels in the matching
# demographic subset, and keep both under a single dictionary.
all_data = {
    'train': load_data(train_ids, demographic_data=demographic_data_train),
    'val': load_data(val_ids, demographic_data=demographic_data_val),
}
train_data = all_data['train']
val_data = all_data['val']

In [None]:

# Inspect the first training patient before truncation/padding
train_sample_pre_trunc = all_data['train'][train_ids[0]]
for field in ('hrv', 'activity', 'adhd'):
    print(train_sample_pre_trunc[field])

In [None]:

# Inspect the first validation patient before truncation/padding
val_sample_pre_trunc = all_data['val'][val_ids[0]]
for field in ('hrv', 'activity', 'adhd'):
    print(val_sample_pre_trunc[field])

In [None]:
# Derive the training max length from the 95th percentile of the HRV lengths
lengths = [len(record['hrv']) for record in all_data['train'].values()]
max_length_train = int(np.percentile(lengths, 95))

max_length_train

In [None]:
# Same 95th-percentile length, computed on the validation HRV sequences
val_lengths = [len(record['hrv']) for record in all_data['val'].values()]
max_length_val = int(np.percentile(val_lengths, 95))

max_length_val

In [None]:
def truncate_and_pad_sequences(sequence, max_length):
    """Return ``sequence`` cut or zero-padded to exactly ``max_length`` items.

    Args:
        sequence: 1-D array-like of values.
        max_length: Target length; must be non-negative.

    Returns:
        A NumPy array of length ``max_length``. Longer inputs keep their first
        ``max_length`` items, shorter inputs are padded with zeros at the end.

    Raises:
        ValueError: If ``max_length`` is negative. A negative value would
            otherwise be read as "drop items from the end" by the slice.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    # Convert first so the result is an ndarray whether or not padding is
    # needed; a list input used to come back as a list when only truncated.
    sequence = np.asarray(sequence)[:max_length]
    missing = max_length - len(sequence)
    if missing > 0:
        sequence = np.pad(sequence, (0, missing), 'constant', constant_values=0)
    return sequence


# Use the maximum value from both datasets
GLOBAL_MAX_LENGHT = max(max_length_train, max_length_val)

# Bring every hrv/activity sequence of both splits to exactly
# GLOBAL_MAX_LENGHT samples (truncate the long ones, zero-pad the short ones)
for split_name in ('train', 'val'):
    for record in all_data[split_name].values():
        for signal_name in ('hrv', 'activity'):
            record[signal_name] = truncate_and_pad_sequences(record[signal_name], GLOBAL_MAX_LENGHT)

In [None]:
# Inspect the first training patient after truncation/padding
train_sample_post_trunc = all_data['train'][train_ids[0]]
for field in ('hrv', 'activity', 'adhd'):
    print(train_sample_post_trunc[field])


In [None]:
# Inspect the first validation patient after truncation/padding
val_sample_post_trunc = all_data['val'][val_ids[0]]
for field in ('hrv', 'activity', 'adhd'):
    print(val_sample_post_trunc[field])

In [None]:
class DualBranch1DCNN(nn.Module):
    """Two-branch 1D CNN producing a single logit for binary classification.

    The HRV and activity signals each pass through their own branch
    (two Conv1d + ReLU + MaxPool1d stages, then a 128-unit linear layer).
    The two 128-dimensional feature vectors are concatenated and mapped to
    one output logit by the final layers.

    Args:
        use_dropout: Add a ``Dropout`` layer at the end of each branch and
            before the output layer.
        dropout_rate: Drop probability used by those ``Dropout`` layers.
        use_batch_norm: Add ``BatchNorm1d(128)`` after the 128-unit layers.
        input_length: Number of time steps in each input sequence. Defaults
            to the module-level ``GLOBAL_MAX_LENGHT`` when omitted, so existing
            calls keep working.

    Raises:
        ValueError: If ``input_length`` is too short to survive the two
            conv+pool stages.
    """

    def __init__(self, use_dropout=False, dropout_rate=0.5, use_batch_norm=False, input_length=None):
        super().__init__()
        if input_length is None:
            # Backward-compatible default: the notebook-wide padded length.
            input_length = GLOBAL_MAX_LENGHT
        self.input_length = int(input_length)

        # Parameters for the conv layers (set statically here)
        kernel_size = 5
        stride = 2
        pool_size = 2
        self.hrv_branch = self.build_branch(use_dropout, dropout_rate, use_batch_norm,
                                            kernel_size, stride, pool_size, self.input_length)
        self.activity_branch = self.build_branch(use_dropout, dropout_rate, use_batch_norm,
                                                 kernel_size, stride, pool_size, self.input_length)

        # Final layers: 256 inputs = 128 features from each of the two branches.
        # Module names are kept stable so saved state dicts remain loadable.
        self.final_layers = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU()
        )
        if use_batch_norm:
            self.final_layers.add_module('final_batch_norm', nn.BatchNorm1d(128))
        if use_dropout:
            self.final_layers.add_module('final_dropout', nn.Dropout(dropout_rate))
        self.final_layers.add_module('final_output', nn.Linear(128, 1))  # Binary classification

    def build_branch(self, use_dropout, dropout_rate, use_batch_norm, kernel_size, stride, pool_size,
                     input_length=None):
        """Build one signal branch as an ``nn.Sequential``.

        ``input_length`` is the number of time steps fed to the branch; it
        falls back to ``GLOBAL_MAX_LENGHT`` when omitted.
        """
        if input_length is None:
            input_length = GLOBAL_MAX_LENGHT

        flattened_length = self.calculate_output_length(input_length, kernel_size, stride, pool_size)
        if flattened_length <= 0:
            raise ValueError(
                f"input_length={input_length} is too short for two conv+pool stages "
                f"(kernel_size={kernel_size}, stride={stride}, pool_size={pool_size})"
            )

        layers = [
            nn.Conv1d(1, 16, kernel_size, stride=stride),
            nn.ReLU(),
            nn.MaxPool1d(pool_size),
            nn.Conv1d(16, 32, kernel_size, stride=stride),
            nn.ReLU(),
            nn.MaxPool1d(pool_size),
            nn.Flatten(),
            # 32 channels times the time steps left after both stages.
            nn.Linear(32 * flattened_length, 128),
            nn.ReLU()
        ]
        if use_batch_norm:
            layers.append(nn.BatchNorm1d(128))
        if use_dropout:
            layers.append(nn.Dropout(dropout_rate))
        return nn.Sequential(*layers)

    def calculate_output_length(self, input_length, kernel_size, stride, pool_size):
        """Return the sequence length left after two conv+pool stages.

        Uses the unpadded, undilated length formulas: the convolution moves
        with ``stride``, the pooling with a step equal to ``pool_size``.
        """
        output_length = input_length
        for _ in range(2):  # Two layers of conv+pool
            output_length = (output_length - (kernel_size - 1) - 1) // stride + 1
            output_length = (output_length - (pool_size - 1) - 1) // pool_size + 1
        return output_length

    def forward(self, hrv_data, activity_data):
        """Return one logit per sample from the two input signals.

        Each input is fed to an ``nn.Conv1d`` with one input channel, i.e. it
        is expected as ``(batch, 1, input_length)``.
        """
        hrv_features = self.hrv_branch(hrv_data)
        activity_features = self.activity_branch(activity_data)
        combined_features = torch.cat((hrv_features, activity_features), dim=1)
        output = self.final_layers(combined_features)
        return output

In [None]:
class ADHDData(Dataset):
    """Dataset over a ``{patient_id: {'hrv', 'activity', 'adhd'}}`` dictionary.

    Each item is the triple ``(hrv, activity, label)``. Both signals are
    float32 tensors with a leading channel dimension of size 1; the label is
    stored exactly as found in the dictionary.
    """

    def __init__(self, data):
        self.hrv_data = []
        self.activity_data = []
        self.labels = []
        for record in data.values():
            self.hrv_data.append(self._with_channel_dim(record['hrv']))
            self.activity_data.append(self._with_channel_dim(record['activity']))
            self.labels.append(record['adhd'])

    @staticmethod
    def _with_channel_dim(signal):
        """Convert a 1-D signal to a float32 tensor of shape (1, length)."""
        return torch.tensor(signal, dtype=torch.float32).unsqueeze(0)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        sample = (self.hrv_data[index], self.activity_data[index], self.labels[index])
        return sample

In [None]:
# Wrap the per-split patient dictionaries in Dataset objects
train_dataset, val_dataset = (ADHDData(all_data[split_name]) for split_name in ('train', 'val'))

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')

In [None]:
def train_model(model, train_loader, val_loader, device, config):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Args:
        model: Module called as ``model(hrv_batch, activity_batch)`` and
            returning one logit per sample.
        train_loader: Sized iterable of ``(hrv, activity, labels)`` batches.
        val_loader: Validation batches in the same format.
        device: Device the batches are moved to.
        config: Dict with ``'learning_rate'``, ``'optimizer_name'`` and
            ``'num_epochs'``; ``'momentum'`` is optional and used only for
            SGD. Any ``optimizer_name`` other than ``'adam'`` selects SGD.

    Returns:
        Dict with the per-epoch histories ``'train_loss'``, ``'val_loss'``,
        ``'train_accuracy'`` and ``'val_accuracy'`` (accuracies in percent),
        plus ``'best_val_accuracy'`` and the 1-based ``'best_epoch'`` at which
        it was reached. The dict is always populated, even when validation
        accuracy never rises above zero.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    criterion = torch.nn.BCEWithLogitsLoss()
    if config['optimizer_name'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    else:
        # A configuration may carry momentum=None (e.g. one shared with Adam);
        # fall back to the same default as when the key is absent.
        momentum = config.get('momentum')
        if momentum is None:
            momentum = 0.9
        optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=momentum)

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    best_val_accuracy = None
    best_epoch = None
    num_epochs = config['num_epochs']

    for epoch in range(num_epochs):
        model.train()
        total = 0
        correct = 0
        train_loss = 0.0

        for hrv_data, activity_data, labels in train_loader:
            hrv_data = hrv_data.to(device)
            activity_data = activity_data.to(device)
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            targets = labels.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(hrv_data, activity_data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predicted = torch.sigmoid(outputs).round()
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

        epoch_train_loss = train_loss / len(train_loader)
        train_accuracy = 100 * correct / total
        train_losses.append(epoch_train_loss)
        train_accuracies.append(train_accuracy)

        # Validation
        val_loss, val_accuracy = validate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f'Epoch {epoch+1}/{config["num_epochs"]}:')
        print(f'  Training Loss: {epoch_train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')
        print(f'  Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

        # Track the best validation accuracy seen so far
        if best_val_accuracy is None or val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_epoch = epoch + 1
            print(" Best model updated.")

    return {
        'train_loss': train_losses,
        'val_loss': val_losses,
        'train_accuracy': train_accuracies,
        'val_accuracy': val_accuracies,
        'best_val_accuracy': 0.0 if best_val_accuracy is None else best_val_accuracy,
        'best_epoch': best_epoch,
    }


def validate_model(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Returns a tuple ``(mean_loss, accuracy)``: the loss averaged over the
    batches, and the percentage of samples whose rounded sigmoid output
    equals the label.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for hrv_batch, activity_batch, batch_labels in val_loader:
            hrv_batch = hrv_batch.to(device)
            activity_batch = activity_batch.to(device)
            # Float targets with a trailing dimension, to match the logits.
            targets = batch_labels.to(device).float().unsqueeze(1)

            logits = model(hrv_batch, activity_batch)
            running_loss += criterion(logits, targets).item()

            hard_predictions = torch.sigmoid(logits).round()
            n_correct += hard_predictions.eq(targets).sum().item()
            n_seen += targets.size(0)

    accuracy = 100 * n_correct / n_seen
    mean_loss = running_loss / len(val_loader)
    return mean_loss, accuracy


In [None]:
# configurations = [
#     {'config_id': 1, 'optimizer_name': 'adam', 'batch_size': 64, 'learning_rate': 0.001, 'num_epochs': 100, 'use_dropout': True, 'dropout_rate': 0.5, 'use_batch_norm': True},
#     {'config_id': 2, 'optimizer_name': 'adam', 'batch_size': 32, 'learning_rate': 0.01, 'num_epochs': 80, 'use_dropout': True, 'dropout_rate': 0.3, 'use_batch_norm': True},
#     {'config_id': 3, 'optimizer_name': 'adam', 'batch_size': 32, 'learning_rate': 0.01, 'num_epochs': 50, 'use_dropout': True, 'dropout_rate': 0.1, 'use_batch_norm': True},
#     {'config_id': 4, 'optimizer_name': 'sgd', 'batch_size': 32, 'learning_rate': 0.01, 'num_epochs': 30, 'use_dropout': True, 'dropout_rate': 0.1, 'use_batch_norm': True, 'momentum': 0.9},
#     {'config_id': 5, 'optimizer_name': 'adam', 'batch_size': 128, 'learning_rate': 0.05, 'num_epochs': 20, 'use_dropout': True, 'dropout_rate': 0.2, 'use_batch_norm': True},
#     {'config_id': 6, 'optimizer_name': 'sgd', 'batch_size': 16, 'learning_rate': 0.001, 'num_epochs': 40, 'use_dropout': True, 'dropout_rate': 0.5, 'use_batch_norm': True, 'momentum': 0.8}
# ]

In [None]:
import random
import itertools

def generate_configurations():
    """Enumerate every hyperparameter configuration of the search space.

    Returns:
        List of dicts with the keys ``'config_id'``, ``'optimizer_name'``,
        ``'batch_size'``, ``'learning_rate'``, ``'num_epochs'``,
        ``'use_dropout'``, ``'dropout_rate'``, ``'use_batch_norm'`` and
        ``'momentum'``. ``'momentum'`` is ``None`` for Adam, which does not
        use it, and one of the SGD momenta otherwise. ``'config_id'`` numbers
        the configurations from 1.
    """
    # Define your hyperparameter space
    batch_sizes = [16, 32, 64, 128]
    learning_rates = [0.001, 0.01, 0.05]
    num_epochs = [20, 30, 40, 50, 60, 70]
    use_dropout = [True, False]
    dropout_rates = [0.1, 0.2, 0.3, 0.5]
    use_batch_norm = [True, False]
    # Momentum is searched only for SGD. Pairing Adam with the SGD momenta and
    # then keeping only "sgd or momentum is None" discarded every Adam
    # configuration, because no momentum value was ever None.
    momenta_by_optimizer = {
        'adam': [None],
        'sgd': [0.8, 0.9],
    }

    configurations = []
    for optimizer_name, momenta in momenta_by_optimizer.items():
        # Cartesian product to generate all combinations for this optimizer
        for batch_size, learning_rate, epochs, dropout, dropout_rate, batch_norm, momentum in itertools.product(
                batch_sizes, learning_rates, num_epochs, use_dropout, dropout_rates, use_batch_norm, momenta):
            configurations.append({
                'config_id': len(configurations) + 1,
                'optimizer_name': optimizer_name,
                'batch_size': batch_size,
                'learning_rate': learning_rate,
                'num_epochs': epochs,
                'use_dropout': dropout,
                'dropout_rate': dropout_rate,
                'use_batch_norm': batch_norm,
                'momentum': momentum,
            })

    return configurations

def sample_configurations(num_samples):
    """Draw up to ``num_samples`` distinct configurations at random.

    When fewer configurations exist than requested, all of them are returned
    (in random order).
    """
    candidate_pool = generate_configurations()
    sample_size = min(num_samples, len(candidate_pool))
    # random.sample draws without replacement
    return random.sample(candidate_pool, sample_size)

# Example usage: draw the configurations to try. sample_configurations caps
# the request at the number of configurations that exist.
N = 20  # Number of configurations you want
sampled_configurations = sample_configurations(N)

In [None]:
best_model = None
best_accuracy = 0
best_config = None

for config_index, config in enumerate(sampled_configurations, start=1):
    print(f"Training with configuration: {config}")
    # Adjust data loader batch size based on configuration
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

    # Model initialization
    model = DualBranch1DCNN(use_dropout=config['use_dropout'],
                            dropout_rate=config['dropout_rate'],
                            use_batch_norm=config['use_batch_norm']).to(device)

    # train_model builds its own optimizer from the configuration and returns
    # a metrics dict, not a single number. Score the run by the best
    # validation accuracy of any epoch; an empty history counts as 0.
    metrics = train_model(model, train_loader, val_loader, device, config)
    val_accuracy = max(metrics.get('val_accuracy') or [0.0])

    # Save the best model
    # NOTE: the weights saved are those after the last epoch of the run, which
    # is not necessarily the epoch that reached val_accuracy.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = model
        best_config = config
        # Configurations may lack an explicit id; fall back to the position
        # of the configuration in this search.
        config_id = config.get('config_id', config_index)
        torch.save(model.state_dict(), f'best_model_config_{config_id}.pth')

print(f"Best Model Config: {best_config}, with Validation Accuracy: {best_accuracy}")