In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import RobustScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:

# Single seed shared by every random number generator the notebook relies on.
SEED = 42

# The hyperparameter search draws its configurations with `random.sample`,
# so Python's own generator has to be seeded as well; otherwise a different
# set of configurations is evaluated on every run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Folder that holds patient_info.csv and the per-participant signal files.
DATA_DIRECTORY = './Data/hyperaktiv_with_controls/hyperaktiv_with_controls/'

# Participant IDs used throughout the notebook: the demographic table is
# filtered to these IDs and their recordings are the ones that get loaded.
VALID_IDs = [
    1, 3, 5, 11, 15, 19, 20, 21, 22, 23,
    24, 32, 33, 34, 35, 36, 37, 39, 41, 42,
    43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
    53, 55, 56, 57, 58, 59, 60, 61, 63, 64,
    68, 71, 73, 75, 77, 78, 79, 81, 82, 83,
    84, 85, 87, 88, 89, 90, 91, 93, 94, 95,
    97, 98, 101, 104, 105,
]

In [None]:
# Load the demographic table and keep only the participants listed in VALID_IDs.
demographic_data = pd.read_csv(f'{DATA_DIRECTORY}patient_info.csv', sep=';')
demographic_data = demographic_data[demographic_data['ID'].isin(VALID_IDs)]

# Diagnosis label of every retained participant (0 = control, 1 = ADHD).
labels = demographic_data['ADHD'].values

# Output the labels to verify
print(labels)

# value_counts() orders its result by frequency, so the first bar is whichever
# class happens to be larger. Pin the order to [0, 1] so the hard-coded tick
# labels below always name the right bar.
class_counts = demographic_data['ADHD'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', title='ADHD class balance in the dataset')
plt.xticks([0, 1], ['Control', 'ADHD'], rotation=0)
plt.ylabel('Number of records')
plt.show()

In [None]:
# List the participant IDs that belong to each group
# (ADHD == 0 -> control, ADHD == 1 -> ADHD).
is_control = demographic_data['ADHD'] == 0
is_adhd = demographic_data['ADHD'] == 1

control_ids = demographic_data.loc[is_control, 'ID'].values
adhd_ids = demographic_data.loc[is_adhd, 'ID'].values

print(f'Number of control patients: {len(control_ids)}; IDS: {control_ids}')
print(f'Number of ADHD patients: {len(adhd_ids)}; IDS: {adhd_ids}')

In [None]:

def scale_data(data):
    """Scale one signal with a freshly fitted scikit-learn ``RobustScaler``.

    Args:
        data: NumPy array holding the samples of a single signal.

    Returns:
        1-D NumPy array with the scaled samples, in the original order.
    """
    # The scaler expects a 2-D (n_samples, n_features) input, so the signal
    # is presented as a single feature column and flattened again afterwards.
    as_column = data.reshape(-1, 1)
    scaled_column = RobustScaler().fit_transform(as_column)
    return scaled_column.flatten()

In [None]:
def collate_fn(batch):
    """Collate variable-length samples into right-padded batch tensors.

    Args:
        batch: list of ``(hrv, activity, label)`` items as produced by
            ``VariableLengthDataset``; ``hrv`` and ``activity`` are tensors
            whose dimension 1 is the sequence length (dimension 0 is the
            channel axis added by the dataset).

    Returns:
        Tuple ``(hrv_batch, activity_batch, labels)``. Each signal batch is
        the stack of its sequences, zero-padded on the right to the longest
        sequence of that signal in the batch; ``labels`` is a float32 tensor
        with one entry per item.
    """

    def _pad_and_stack(sequences):
        # Pad only the last (time) dimension, and only on the right.
        longest = max(seq.size(1) for seq in sequences)
        padded = [F.pad(seq, (0, longest - seq.size(1))) for seq in sequences]
        return torch.stack(padded)

    # The two signals are padded independently: each gets its own maximum length.
    hrv_batch = _pad_and_stack([sample[0] for sample in batch])
    activity_batch = _pad_and_stack([sample[1] for sample in batch])
    label_batch = torch.tensor([sample[2] for sample in batch], dtype=torch.float32)

    return hrv_batch, activity_batch, label_batch

In [None]:
class VariableLengthDataset(Dataset):
    """Dataset over a ``{key: {'hrv': ..., 'activity': ..., 'adhd': ...}}`` mapping.

    Integer positions are translated to mapping keys through ``self.keys``,
    which is captured once at construction time.
    """

    def __init__(self, data):
        self.data = data
        # Freeze the key order so integer indices map to stable entries.
        self.keys = [*data.keys()]

    def __len__(self):
        return len(self.keys)

    @staticmethod
    def _as_channel_tensor(values):
        # float32 tensor with a leading channel dimension: (1, sequence_length).
        return torch.tensor(values, dtype=torch.float32).unsqueeze(0)

    def __getitem__(self, idx):
        record = self.data[self.keys[idx]]
        hrv_tensor = self._as_channel_tensor(record['hrv'])
        activity_tensor = self._as_channel_tensor(record['activity'])
        target = torch.tensor(record['adhd'], dtype=torch.float32)
        return hrv_tensor, activity_tensor, target


In [None]:
def load_data(sample, demographic_data, is_train=True):
    """Read, scale and label the recordings of the requested participants.

    Args:
        sample: iterable of participant IDs to load.
        demographic_data: DataFrame with at least the ``ID`` and ``ADHD``
            columns; the label of each participant is looked up here.
        is_train: selects the ``*_train`` folders when True, the ``*_test``
            folders otherwise.

    Returns:
        dict mapping each participant ID to
        ``{'hrv': scaled array, 'activity': scaled array, 'adhd': label}``.
    """
    suffix = 'train' if is_train else 'test'

    def _read_column(csv_path, column):
        # All signal files are ';'-separated CSVs; only one column is needed.
        return pd.read_csv(csv_path, sep=';')[column].values

    data = {}
    for patient_id in sample:
        hrv_raw = _read_column(f'{DATA_DIRECTORY}/hrv_data_{suffix}/patient_hr_{patient_id}.csv', 'HRV')
        activity_raw = _read_column(f'{DATA_DIRECTORY}/activity_data_{suffix}/patient_activity_{patient_id}.csv', 'ACTIVITY')
        # First (expected: only) demographic row matching this participant.
        label = demographic_data.loc[demographic_data['ID'] == patient_id, 'ADHD'].values[0]

        # Each signal is scaled on its own, one participant at a time.
        data[patient_id] = {
            'hrv': scale_data(hrv_raw),
            'activity': scale_data(activity_raw),
            'adhd': label,
        }
    return data


# Load the training-split recordings of every retained participant into memory.
all_data = load_data(sample=VALID_IDs, demographic_data=demographic_data, is_train=True)

In [None]:
# Show a compact per-participant summary instead of echoing every scaled
# sample of every recording into the cell output.
pd.DataFrame(
    [
        {
            'patient_id': patient_id,
            'hrv_samples': len(record['hrv']),
            'activity_samples': len(record['activity']),
            'adhd': record['adhd'],
        }
        for patient_id, record in all_data.items()
    ]
).head(10)

In [None]:
class DualBranch1DCNN(nn.Module):
    """Two parallel 1-D CNN branches (HRV and activity) joined by a linear head.

    Each branch maps a ``(batch, 1, length)`` signal to a fixed-size feature
    vector of ``BRANCH_OUTPUT_FEATURES`` values; the two vectors are
    concatenated and reduced to a single logit per sample.

    Args:
        num_channels: number of feature maps in the convolutional layers.
        use_dropout: append a dropout layer at the end of each branch.
        dropout_rate: dropout probability (only used when ``use_dropout``).
        use_batch_norm: insert batch normalisation after the convolutions
            and linear layers.
    """

    # Width of the feature vector produced by every branch.
    BRANCH_OUTPUT_FEATURES = 128

    def __init__(self, num_channels, use_dropout=False, dropout_rate=0.5, use_batch_norm=False):
        super().__init__()
        self.hrv_branch = self.create_branch(num_channels, use_dropout, dropout_rate, use_batch_norm)
        self.activity_branch = self.create_branch(num_channels, use_dropout, dropout_rate, use_batch_norm)
        # Each branch ends in Linear(num_channels, BRANCH_OUTPUT_FEATURES), so
        # the concatenation is 2 * BRANCH_OUTPUT_FEATURES wide regardless of
        # num_channels. Sizing the head as 2 * num_channels only matched when
        # num_channels was 128 and raised a shape error otherwise.
        final_input_features = 2 * self.BRANCH_OUTPUT_FEATURES
        self.final_layers = nn.Sequential(
            nn.Linear(final_input_features, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128) if use_batch_norm else nn.Identity(),
            nn.Linear(128, 1)
        )

    def create_branch(self, num_channels, use_dropout, dropout_rate, use_batch_norm):
        """Build one signal branch: two conv blocks, global pooling, one linear block."""
        layers = [
            nn.Conv1d(1, num_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_channels) if use_batch_norm else nn.Identity(),
            nn.ReLU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_channels) if use_batch_norm else nn.Identity(),
            nn.ReLU(),
            # Collapse the (variable) time axis to a single value per channel.
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(num_channels, self.BRANCH_OUTPUT_FEATURES),
            nn.ReLU(),
            nn.BatchNorm1d(self.BRANCH_OUTPUT_FEATURES) if use_batch_norm else nn.Identity(),
            nn.Dropout(dropout_rate) if use_dropout else nn.Identity()
        ]
        return nn.Sequential(*layers)

    def forward(self, hrv_data, activity_data):
        """Return one logit per sample, shape ``(batch, 1)``."""
        hrv_features = self.hrv_branch(hrv_data)
        activity_features = self.activity_branch(activity_data)
        combined_features = torch.cat((hrv_features, activity_features), dim=1)
        output = self.final_layers(combined_features)
        return output

def train_model(model, train_loader, val_loader, device, config):
    """Train ``model`` for ``config['num_epochs']`` epochs, validating after each.

    Args:
        model: network called as ``model(hrv_batch, activity_batch)`` and
            returning one logit per sample with shape ``(batch, 1)``.
        train_loader: iterable of ``(hrv, activity, labels)`` training batches.
        val_loader: iterable of ``(hrv, activity, labels)`` validation batches.
        device: device the batches are moved to before the forward pass.
        config: dict with ``'learning_rate'``, ``'optimizer_name'`` and
            ``'num_epochs'``; ``'momentum'`` is read (default 0.9) only when
            SGD is used.

    Returns:
        dict with the per-epoch histories ``'train_loss'``, ``'val_loss'``,
        ``'train_accuracy'`` and ``'val_accuracy'`` (one entry per epoch,
        accuracies in percent), plus ``'best_val_accuracy'`` and the 1-based
        ``'best_epoch'`` at which it was reached (``None`` if validation
        accuracy never rose above 0). The histories are always returned, so
        callers can index them even when no epoch improved on 0 % accuracy.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    if config['optimizer_name'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    else:
        # Any optimizer name other than 'adam' falls back to SGD.
        optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'],
                                    momentum=config.get('momentum', 0.9))

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    best_val_accuracy = 0
    best_epoch = None

    for epoch in range(config['num_epochs']):
        model.train()
        total = 0
        correct = 0
        train_loss = 0

        for hrv_data, activity_data, labels in train_loader:
            hrv_data, activity_data, labels = hrv_data.to(device), activity_data.to(device), labels.to(device).float()
            # Labels arrive as (batch,); the model emits (batch, 1).
            targets = labels.unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(hrv_data, activity_data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predicted = torch.sigmoid(outputs).round()
            total += labels.size(0)
            correct += (predicted == targets).sum().item()

        train_accuracy = 100 * correct / total
        train_losses.append(train_loss / len(train_loader))
        train_accuracies.append(train_accuracy)

        # Validation
        val_loss, val_accuracy = validate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f'Epoch {epoch+1}/{config["num_epochs"]}:')
        print(f'  Training Loss: {train_loss / len(train_loader):.4f}, Training Accuracy: {train_accuracy:.2f}%')
        print(f'  Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

        # Track the best validation accuracy seen so far. No weights are
        # checkpointed: the model keeps the parameters of the last epoch.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_epoch = epoch + 1
            print(" Best model updated.")

    return {
        'train_loss': train_losses,
        'val_loss': val_losses,
        'train_accuracy': train_accuracies,
        'val_accuracy': val_accuracies,
        'best_val_accuracy': best_val_accuracy,
        'best_epoch': best_epoch,
    }


def validate_model(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without updating any weights.

    Args:
        model: network called as ``model(hrv_batch, activity_batch)`` and
            returning logits of shape ``(batch, 1)``.
        val_loader: sized iterable of ``(hrv, activity, labels)`` batches.
        criterion: loss applied to ``(logits, targets)``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the loss averaged over batches and
        the accuracy in percent over all samples.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for hrv_batch, activity_batch, batch_labels in val_loader:
            hrv_batch = hrv_batch.to(device)
            activity_batch = activity_batch.to(device)
            # Give the labels the same (batch, 1) shape as the logits.
            targets = batch_labels.to(device).float().unsqueeze(1)

            logits = model(hrv_batch, activity_batch)
            loss_sum += criterion(logits, targets).item()

            # Probability rounded to the nearest class, compared element-wise.
            hits = torch.sigmoid(logits).round() == targets
            n_correct += hits.sum().item()
            n_seen += targets.size(0)

    mean_loss = loss_sum / len(val_loader)
    return mean_loss, 100 * n_correct / n_seen

from sklearn.model_selection import KFold

def cross_validate_model(model_class, configurations, all_data, k=5, device='cpu', random_state=42):
    """Select the best configuration by k-fold cross-validation.

    Every configuration is trained once per fold on the same ``k`` splits and
    is scored by the mean of its final-epoch validation accuracies. Ranking
    configurations by that mean, rather than by the single best
    (fold, configuration) run, keeps one lucky validation fold from deciding
    the winner.

    Args:
        model_class: callable building a model as
            ``model_class(num_channels, use_dropout=..., dropout_rate=...,
            use_batch_norm=...)``.
        configurations: iterable of config dicts providing ``'num_channels'``,
            ``'use_dropout'``, ``'dropout_rate'``, ``'use_batch_norm'``,
            ``'batch_size'`` and whatever ``train_model`` reads.
        all_data: mapping of participant ID to record, as accepted by
            ``VariableLengthDataset``.
        k: number of folds.
        device: device the models and batches are placed on.
        random_state: seed of the fold shuffling.

    Returns:
        Tuple ``(best_model, best_config, best_accuracy)``: ``best_accuracy``
        is the winning configuration's mean validation accuracy (percent)
        across folds and ``best_model`` is that configuration's model from
        its best-scoring fold. ``(None, None, 0)`` if no configuration scored
        above 0.
    """
    patient_ids = list(all_data.keys())
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    # Materialise the splits once so every configuration sees identical folds.
    folds = [
        ([patient_ids[i] for i in train_idx], [patient_ids[i] for i in val_idx])
        for train_idx, val_idx in kf.split(patient_ids)
    ]

    best_model = None
    best_accuracy = 0
    best_config = None

    for config in configurations:
        print(f"Training with configuration: {config}")
        fold_accuracies = []
        config_model = None
        config_top_fold_accuracy = None

        for train_ids, val_ids in folds:
            train_data = {pid: all_data[pid] for pid in train_ids}
            val_data = {pid: all_data[pid] for pid in val_ids}

            model = model_class(config['num_channels'], use_dropout=config['use_dropout'],
                                dropout_rate=config['dropout_rate'],
                                use_batch_norm=config['use_batch_norm']).to(device)

            # Loaders
            train_loader = DataLoader(VariableLengthDataset(train_data), batch_size=config['batch_size'],
                                      shuffle=True, collate_fn=collate_fn)
            val_loader = DataLoader(VariableLengthDataset(val_data), batch_size=config['batch_size'],
                                    shuffle=False, collate_fn=collate_fn)

            metrics = train_model(model, train_loader, val_loader, device, config)
            # The metrics dict can lack a validation history (e.g. when no
            # epoch scored above 0 %); count such a run as 0 % instead of
            # failing with a KeyError.
            val_history = metrics.get('val_accuracy') or [0.0]
            # Final-epoch accuracy matches the weights the model ends up with.
            fold_accuracy = val_history[-1]
            fold_accuracies.append(fold_accuracy)

            if config_top_fold_accuracy is None or fold_accuracy > config_top_fold_accuracy:
                config_top_fold_accuracy = fold_accuracy
                config_model = model

        mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
        print(f"Mean validation accuracy over {len(fold_accuracies)} folds: {mean_accuracy:.2f}")

        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_model = config_model
            best_config = config
            print(f"New best model found with accuracy: {best_accuracy:.2f}")

    return best_model, best_config, best_accuracy

import random
import itertools

def generate_configurations():
    """Enumerate every hyperparameter configuration of the search space.

    Momentum only applies to SGD, so SGD configurations are generated for
    every momentum value while Adam configurations are generated once with
    ``'momentum': None``. The previous filter kept a combination only if the
    optimizer was SGD or the momentum was ``None``; momentum was never
    ``None``, so no Adam configuration was ever produced.

    Returns:
        list of dicts with the keys ``'optimizer_name'``, ``'batch_size'``,
        ``'learning_rate'``, ``'num_epochs'``, ``'use_dropout'``,
        ``'dropout_rate'``, ``'use_batch_norm'``, ``'momentum'`` and
        ``'num_channels'``.
    """
    # Define your hyperparameter space
    optimizer_names = ['adam', 'sgd']
    batch_sizes = [16, 32, 64, 128]
    learning_rates = [0.001, 0.01, 0.05]
    num_epochs = [20, 30, 40, 50, 60, 70]
    use_dropout = [True, False]
    dropout_rates = [0.1, 0.2, 0.3, 0.5]
    use_batch_norm = [True, False]
    sgd_momenta = [0.8, 0.9]  # Only for 'sgd'
    num_channels = [16, 32, 64, 128]

    configurations = []
    for optimizer_name in optimizer_names:
        # A single placeholder keeps Adam from being duplicated per momentum.
        momenta = sgd_momenta if optimizer_name == 'sgd' else [None]
        for batch_size, learning_rate, epochs, dropout, rate, batch_norm, momentum, channels in itertools.product(
                batch_sizes, learning_rates, num_epochs, use_dropout, dropout_rates,
                use_batch_norm, momenta, num_channels):
            configurations.append({
                'optimizer_name': optimizer_name, 'batch_size': batch_size, 'learning_rate': learning_rate,
                'num_epochs': epochs, 'use_dropout': dropout, 'dropout_rate': rate,
                'use_batch_norm': batch_norm, 'momentum': momentum, 'num_channels': channels,
            })

    return configurations

def sample_configurations(num_samples):
    """Draw up to ``num_samples`` distinct configurations at random.

    Args:
        num_samples: requested number of configurations; capped at the size
            of the full configuration list.

    Returns:
        list of configuration dicts sampled without replacement.
    """
    candidates = generate_configurations()
    # Never ask for more items than exist, which random.sample would reject.
    sample_size = num_samples if num_samples < len(candidates) else len(candidates)
    return random.sample(candidates, sample_size)

# Run the random hyperparameter search: draw N configurations and
# cross-validate each of them, on the GPU when one is available.
N = 20  # Number of configurations you want
sampled_configurations = sample_configurations(N)
best_model, best_config, best_accuracy = cross_validate_model(
    DualBranch1DCNN,
    sampled_configurations,
    all_data,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

In [None]:
# Report the outcome of the hyperparameter search.
print(
    f"Best accuracy: {best_accuracy:.2f}",
    f"Best configuration: {best_config}",
    sep="\n",
)
