In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split
from torchviz import make_dot

# Setup device: use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Seed and data paths
# One fixed seed is applied to both NumPy's and PyTorch's global generators.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
# Dataset root. The trailing slash matters: file names are appended to it directly.
DATA_DIRECTORY = 'Data/hyperaktiv_with_controls/hyperaktiv_with_controls/'
# Patient IDs used in this notebook; demographic rows with any other ID are filtered out.
VALID_IDs = [1, 3, 5, 7, 9, 11, 15, 19, 20, 21, 22, 23, 24, 27, 31, 32, 33, 34, 35, 36, 37, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 63, 64, 68, 71, 73, 75, 77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 93, 94, 95, 97, 98, 101, 104, 105]

# Load demographic data
demographic_data = pd.read_csv(f'{DATA_DIRECTORY}patient_info.csv', sep=';')
demographic_data = demographic_data[demographic_data['ID'].isin(VALID_IDs)]

# Fail early and clearly if the CSV lacks any requested patient, instead of
# silently producing fewer labels than IDs.
missing_ids = sorted(set(VALID_IDs) - set(demographic_data['ID']))
if missing_ids:
    raise KeyError(f'patient_info.csv has no row for patient IDs: {missing_ids}')

# `labels` is paired positionally with VALID_IDs when the dataset is split, so
# look each label up by ID in VALID_IDs order rather than relying on the CSV
# rows happening to be stored in that same order.
labels = demographic_data.set_index('ID').loc[VALID_IDs, 'ADHD'].values

In [None]:
# Plot ADHD class balance
# value_counts() orders its result by frequency, not by class value. Pin the
# order to [0, 1] so that bar 0 is always Control and bar 1 is always ADHD,
# matching the tick labels set below (a class with no rows gets a 0-height bar).
(
    demographic_data['ADHD']
    .value_counts()
    .reindex([0, 1], fill_value=0)
    .plot(kind='bar', title='ADHD class balance in the dataset')
)
plt.xticks([0, 1], ['Control', 'ADHD'], rotation=0)
plt.ylabel('Number of records')
plt.show()

# Extract control and ADHD IDs (ADHD == 0 -> control, ADHD == 1 -> ADHD)
control_ids = demographic_data.loc[demographic_data['ADHD'] == 0, 'ID'].values
adhd_ids = demographic_data.loc[demographic_data['ADHD'] == 1, 'ID'].values

print(f'Number of control patients: {len(control_ids)}; IDS: {control_ids}')
print(f'Number of ADHD patients: {len(adhd_ids)}; IDS: {adhd_ids}')

In [None]:
# Function for splitting dataset
def split_dataset(ids, labels, train_ratio=0.80, val_ratio=0.10, test_ratio=0.10, random_seed=42):
    """Split patient IDs into stratified train / validation / test groups.

    The test group is carved out first; the validation group is then carved
    out of what remains, and everything left over is the training group.

    Args:
        ids: Sequence of patient IDs.
        labels: Class labels, positionally aligned with `ids`.
        train_ratio: Accepted for interface compatibility only; it is not used
            in the computation. The training share is whatever remains after
            the test and validation shares are removed.
        val_ratio: Share of the whole dataset to use for validation.
        test_ratio: Share of the whole dataset to use for testing.
        random_seed: Seed passed to both stratified splitters.

    Returns:
        Tuple `(train_ids, val_ids, test_ids)` of NumPy arrays of patient IDs.
        The three groups are disjoint and together cover every input ID.

    Raises:
        ValueError: If `ids` and `labels` differ in length.
    """
    ids = np.asarray(ids)
    labels = np.asarray(labels)
    if len(ids) != len(labels):
        raise ValueError(f'ids and labels must have the same length, got {len(ids)} and {len(labels)}')

    # Stage 1: hold out the test patients.
    holdout_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_seed)
    train_val_idx, test_idx = next(holdout_splitter.split(ids, labels))

    # Stage 2: split the remainder. val_ratio is relative to the whole dataset,
    # so rescale it to the size of the remainder.
    val_relative_ratio = val_ratio / (1.0 - test_ratio)
    train_val_ids = ids[train_val_idx]
    train_val_labels = labels[train_val_idx]
    inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_relative_ratio, random_state=random_seed)
    train_rel_idx, val_rel_idx = next(inner_splitter.split(train_val_ids, train_val_labels))

    # The stage-2 indices are positions within `train_val_ids`, NOT within the
    # full `ids` array. Indexing `ids` with them would pull test patients into
    # the train / validation groups.
    train_ids = train_val_ids[train_rel_idx]
    val_ids = train_val_ids[val_rel_idx]
    test_ids = ids[test_idx]

    return train_ids, val_ids, test_ids

# Split dataset: stratified train / validation / test patient IDs (default ratios and seed)
train_ids, val_ids, test_ids = split_dataset(
    ids=np.array(VALID_IDs),
    labels=np.array(labels),
)

In [None]:
# Plot class balance in split datasets
def plot_class_balance(data, title):
    """Draw a bar chart of Control vs ADHD record counts for one subset.

    Args:
        data: DataFrame with an 'ADHD' column holding 0 (control) / 1 (ADHD).
        title: Title of the chart.
    """
    # value_counts() orders by frequency, and omits classes that are absent.
    # Pin the order to [0, 1] so that bar 0 is always Control and bar 1 is
    # always ADHD, matching the tick labels set below.
    class_counts = data['ADHD'].value_counts().reindex([0, 1], fill_value=0)
    class_counts.plot(kind='bar', title=title)
    plt.xticks([0, 1], ['Control', 'ADHD'], rotation=0)
    plt.ylabel('Number of records')
    plt.show()

# Demographic rows belonging to each split
demographic_data_train = demographic_data.loc[demographic_data['ID'].isin(train_ids)]
demographic_data_test = demographic_data.loc[demographic_data['ID'].isin(test_ids)]
demographic_data_val = demographic_data.loc[demographic_data['ID'].isin(val_ids)]

# One chart per split, drawn in the order train, test, validation
plot_class_balance(demographic_data_train, 'ADHD class balance in the train dataset')
plot_class_balance(demographic_data_test, 'ADHD class balance in the test dataset')
plot_class_balance(demographic_data_val, 'ADHD class balance in the validation dataset')

In [None]:
# Load data
def load_data(sample, demographic_data):
    """Load the HRV and activity recordings and the ADHD label of each patient.

    Args:
        sample: Iterable of patient IDs to load.
        demographic_data: DataFrame with 'ID' and 'ADHD' columns; must contain
            a row for every ID in `sample`.

    Returns:
        Dict mapping patient ID to
        `{'hrv': DataFrame, 'activity': DataFrame, 'adhd': label}`. Both frames
        are indexed by 'TIMESTAMP' and cut to the same number of rows.

    Raises:
        IndexError: If a patient ID has no row in `demographic_data`.
    """
    # Tolerate DATA_DIRECTORY with or without a trailing slash and avoid
    # building paths that contain a double slash.
    data_root = DATA_DIRECTORY.rstrip('/')

    patients_data = {}
    for patient_id in sample:
        label_rows = demographic_data.loc[demographic_data['ID'] == patient_id, 'ADHD'].values
        if len(label_rows) == 0:
            raise IndexError(f'No demographic record found for patient ID {patient_id}')

        df_hrv = pd.read_csv(f'{data_root}/hrv_data/patient_hr_{patient_id}.csv', sep=';').set_index('TIMESTAMP')
        df_activity = pd.read_csv(f'{data_root}/activity_data/patient_activity_{patient_id}.csv', sep=';').set_index('TIMESTAMP')

        # Keep only as many rows as the shorter recording has.
        # NOTE(review): rows are paired by position, not by TIMESTAMP value —
        # confirm both files share the same sampling grid.
        min_length = min(len(df_hrv), len(df_activity))

        patients_data[patient_id] = {
            'hrv': df_hrv.iloc[:min_length],
            'activity': df_activity.iloc[:min_length],
            'adhd': label_rows[0]
        }
    return patients_data

# Raw recordings per split, each loaded with that split's demographic rows
train_data = load_data(sample=train_ids, demographic_data=demographic_data_train)
test_data = load_data(sample=test_ids, demographic_data=demographic_data_test)
val_data = load_data(sample=val_ids, demographic_data=demographic_data_val)

# Single lookup table used by the experiment runner
all_data = dict(train=train_data, val=val_data, test=test_data)

In [None]:
# Print shapes
def print_shapes(data, label):
    """Print the HRV and activity shapes of every patient in one split.

    Args:
        data: Dict mapping patient ID to a dict with 'hrv' and 'activity'
            entries that expose `.shape`.
        label: Name of the split, used in the header line.
    """
    print(f"{label} Patients: ")
    for patient_id in data:
        hrv_shape = data[patient_id]["hrv"].shape
        activity_shape = data[patient_id]["activity"].shape
        print(f'Patient ID: {patient_id}; HRV shape: {hrv_shape}; Activity shape: {activity_shape}')
    print("---------------")

# Report shapes for each split in the order train, validation, test
for split_key, split_label in (('train', "Train"), ('val', "Validation"), ('test', "Test")):
    print_shapes(all_data[split_key], split_label)

In [None]:
# Data segmentation and normalization
def segment_data(data, window_size, step_size):
    """Cut `data` into fixed-length windows taken every `step_size` positions.

    Only full windows are produced; a trailing remainder shorter than
    `window_size` is dropped. Returns a list of slices of `data`.
    """
    last_start = len(data) - window_size
    return [data[begin:begin + window_size] for begin in range(0, last_start + 1, step_size)]

def normalize_data(data):
    """Return `data` scaled by a RobustScaler that is fitted on `data` itself."""
    return RobustScaler().fit_transform(data)

def _windowed_and_scaled(series, window_size, step_size):
    """Window one signal and scale the windows; returns shape (n_windows, window_size)."""
    windows = np.array(segment_data(series, window_size, step_size)).reshape(-1, window_size)
    if len(windows) == 0:
        # Recording shorter than one window: there is nothing to fit the scaler
        # on, so return the empty (0, window_size) array as-is.
        return windows.astype(float)
    return normalize_data(windows).reshape(-1, window_size)


def preprocessData_create_windows(config, data):
    """Turn each patient's recordings into scaled, half-overlapping windows.

    Args:
        config: Dict providing 'window_size' (samples per window).
        data: Dict mapping patient ID to
            `{'hrv': frame with an 'HRV' column,
              'activity': frame with an 'ACTIVITY' column,
              'adhd': label}`.

    Returns:
        Dict mapping patient ID to `{'hrv', 'activity', 'labels'}`. 'hrv' and
        'activity' are arrays of shape (n_windows, window_size); 'labels'
        repeats the patient's label once per HRV window. A patient whose
        recording is shorter than one window is kept with zero windows.
    """
    window_size = config['window_size']
    # 50% overlap between consecutive windows. Clamp to 1 so window_size == 1
    # does not yield a zero stride, which range() rejects.
    step_size = max(1, window_size // 2)
    processed_data = {}

    for patient_id, patient_data in data.items():
        # Scaling is fitted separately for each patient and each signal.
        hrv_normalized = _windowed_and_scaled(patient_data['hrv']['HRV'], window_size, step_size)
        activity_normalized = _windowed_and_scaled(patient_data['activity']['ACTIVITY'], window_size, step_size)
        labels_repeated = np.repeat(patient_data['adhd'], len(hrv_normalized))

        processed_data[patient_id] = {'hrv': hrv_normalized, 'activity': activity_normalized, 'labels': labels_repeated}

    return processed_data

In [None]:
# Dataset and model classes
class PatientDataset(Dataset):
    """Flat, index-addressable view over every window of every patient.

    `patients_data` maps patient ID to a dict with 'hrv', 'activity' and
    'labels' entries that can each be indexed by window number.
    """

    def __init__(self, patients_data):
        self.patients_data = patients_data
        self.patient_ids = list(patients_data.keys())
        # One (patient_id, window_index) entry per window, grouped by patient
        # in dict order. The window count is taken from the 'hrv' entry.
        self.data = []
        for patient_id in self.patient_ids:
            n_windows = len(patients_data[patient_id]['hrv'])
            self.data.extend((patient_id, window_idx) for window_idx in range(n_windows))

    def __len__(self):
        """Total number of windows across all patients."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(hrv_window, activity_window, label)` as float32 tensors."""
        patient_id, data_idx = self.data[idx]
        record = self.patients_data[patient_id]
        return tuple(
            torch.tensor(record[field][data_idx], dtype=torch.float32)
            for field in ('hrv', 'activity', 'labels')
        )

class MultimodalADHDNet(nn.Module):
    """Two-branch network over an HRV window and an activity window.

    Each signal goes through its own Conv1d -> (BatchNorm1d) -> ReLU -> LSTM
    stack. The last LSTM time step of both branches is concatenated, optionally
    passed through dropout, and mapped by a linear layer followed by a sigmoid.

    Args:
        output_channels: Number of Conv1d output channels in each branch.
        hidden_size: LSTM hidden size in each branch.
        num_classes: Width of the final linear layer.
        use_batch_norm: Insert BatchNorm1d after each convolution.
        use_dropout: Apply dropout to the fused features before the classifier.
        dropout_rate: Dropout probability (only used when `use_dropout`).
    """

    def __init__(self, output_channels, hidden_size, num_classes, use_batch_norm=True, use_dropout=True, dropout_rate=0.5):
        super().__init__()
        self.use_batch_norm = use_batch_norm
        self.use_dropout = use_dropout

        # HRV branch
        self.hrv_conv1 = nn.Conv1d(1, output_channels, kernel_size=3, padding=1)
        if use_batch_norm:
            self.hrv_bn1 = nn.BatchNorm1d(output_channels)
        self.hrv_lstm = nn.LSTM(output_channels, hidden_size, batch_first=True)

        # Activity branch (same layout as the HRV branch, separate weights)
        self.act_conv1 = nn.Conv1d(1, output_channels, kernel_size=3, padding=1)
        if use_batch_norm:
            self.act_bn1 = nn.BatchNorm1d(output_channels)
        self.act_lstm = nn.LSTM(output_channels, hidden_size, batch_first=True)

        # Classifier over the concatenated branch outputs
        self.fc = nn.Linear(2 * hidden_size, num_classes)
        if use_dropout:
            self.dropout = nn.Dropout(dropout_rate)

    def _encode(self, signal, conv, batch_norm, lstm):
        """Run one branch and return the LSTM output at the last time step."""
        features = conv(signal)
        if batch_norm is not None:
            features = batch_norm(features)
        features = F.relu(features)
        # Conv1d emits (batch, channels, time); the batch_first LSTM wants (batch, time, channels).
        sequence_out, _ = lstm(features.transpose(1, 2))
        return sequence_out[:, -1, :]

    def forward(self, hrv_data, act_data):
        """Return sigmoid outputs of shape (batch, num_classes).

        Both inputs get a channel axis inserted at dim 1 before the
        single-input-channel convolution.
        """
        hrv_in = hrv_data.unsqueeze(1)
        act_in = act_data.unsqueeze(1)

        hrv_bn = self.hrv_bn1 if self.use_batch_norm else None
        act_bn = self.act_bn1 if self.use_batch_norm else None

        hrv_features = self._encode(hrv_in, self.hrv_conv1, hrv_bn, self.hrv_lstm)
        act_features = self._encode(act_in, self.act_conv1, act_bn, self.act_lstm)

        fused = torch.cat((hrv_features, act_features), dim=1)
        if self.use_dropout:
            fused = self.dropout(fused)
        return torch.sigmoid(self.fc(fused))

In [None]:
# Configurations
# Each dict is one hyper-parameter set consumed by run_experiment.
# 'momentum' is only present on the SGD entries.
configurations = [
    {
        'config_id': 1, 'optimizer_name': 'adam',
        'output_channels': 10, 'window_size': 50, 'batch_size': 64, 'hidden_size': 64, 'num_classes': 1,
        'learning_rate': 0.001, 'num_epochs': 10,
        'use_dropout': True, 'dropout_rate': 0.5, 'use_batch_norm': True,
    },
    {
        'config_id': 3, 'optimizer_name': 'adam',
        'output_channels': 32, 'window_size': 70, 'batch_size': 32, 'hidden_size': 32, 'num_classes': 1,
        'learning_rate': 0.01, 'num_epochs': 10,
        'use_dropout': True, 'dropout_rate': 0.1, 'use_batch_norm': True,
    },
    {
        'config_id': 4, 'optimizer_name': 'adam',
        'output_channels': 64, 'window_size': 80, 'batch_size': 32, 'hidden_size': 32, 'num_classes': 1,
        'learning_rate': 0.01, 'num_epochs': 10,
        'use_dropout': True, 'dropout_rate': 0.1, 'use_batch_norm': True,
    },
    {
        'config_id': 6, 'optimizer_name': 'sgd',
        'output_channels': 3, 'window_size': 10, 'batch_size': 16, 'hidden_size': 32, 'num_classes': 1,
        'learning_rate': 0.001, 'num_epochs': 10,
        'use_dropout': True, 'dropout_rate': 0.5, 'use_batch_norm': True,
        'momentum': 0.9,
    },
    {
        'config_id': 13, 'optimizer_name': 'sgd',
        'output_channels': 5, 'window_size': 20, 'batch_size': 32, 'hidden_size': 64, 'num_classes': 1,
        'learning_rate': 0.01, 'num_epochs': 10,
        'use_dropout': True, 'dropout_rate': 0.3, 'use_batch_norm': True,
        'momentum': 0.8,
    },
]

In [None]:
# Metrics calculation
def calculate_metrics(outputs, labels):
    """Score rounded model outputs against the true labels.

    Args:
        outputs: Tensor of model outputs; rounded to the nearest integer to
            obtain the predicted class.
        labels: Tensor of true labels.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. Precision, recall and F1
        are computed with `zero_division=0`.
    """
    y_true = labels.cpu().numpy()
    y_pred = outputs.detach().round().cpu().numpy()
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )

In [None]:
# Training and validation
def _run_epoch(model, data_loader, criterion, device, optimizer=None):
    """Run one pass over `data_loader`; train if `optimizer` is given, else only evaluate.

    Metrics are computed once over all samples of the epoch. Averaging
    per-batch precision / recall / F1 does not give the epoch value: batches
    that contain a single class score 0 under `zero_division=0`, and a short
    last batch is weighted as much as a full one.

    Returns:
        Tuple `(average_loss, accuracy, precision, recall, f1)`.

    Raises:
        ValueError: If the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same switch as eval()

    running_loss = 0.0
    samples_seen = 0
    epoch_outputs, epoch_labels = [], []

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(is_training):
        for hrv_data, activity_data, labels in data_loader:
            hrv_data, activity_data, labels = hrv_data.to(device), activity_data.to(device), labels.to(device)
            if is_training:
                optimizer.zero_grad()
            outputs = model(hrv_data, activity_data).squeeze(1)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; weight it by batch size so the
            # epoch average is per sample.
            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
            epoch_outputs.append(outputs.detach().cpu())
            epoch_labels.append(labels.detach().cpu())

    if samples_seen == 0:
        raise ValueError('data_loader yielded no samples; cannot compute epoch metrics')

    average_loss = running_loss / samples_seen
    accuracy, precision, recall, f1 = calculate_metrics(torch.cat(epoch_outputs), torch.cat(epoch_labels))
    return average_loss, accuracy, precision, recall, f1


def train_epoch(model, data_loader, criterion, optimizer, device):
    """Train `model` for one epoch.

    Args:
        model: Network called as `model(hrv_batch, activity_batch)`.
        data_loader: Iterable of `(hrv, activity, labels)` batches.
        criterion: Loss applied to `(outputs, labels)`.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        Tuple `(average_loss, accuracy, precision, recall, f1)` for the epoch.
        Outputs are collected in training mode, before each weight update.
    """
    return _run_epoch(model, data_loader, criterion, device, optimizer=optimizer)


def validate_epoch(model, data_loader, criterion, device):
    """Evaluate `model` on one pass over `data_loader` without updating weights.

    Args:
        model: Network called as `model(hrv_batch, activity_batch)`.
        data_loader: Iterable of `(hrv, activity, labels)` batches.
        criterion: Loss applied to `(outputs, labels)`.
        device: Device the batches are moved to.

    Returns:
        Tuple `(average_loss, accuracy, precision, recall, f1)` for the epoch.
    """
    return _run_epoch(model, data_loader, criterion, device, optimizer=None)

In [None]:
# Testing
def test_model(model, data_loader, device):
    model.eval()
    all_outputs = []
    all_labels = []

    with torch.no_grad():
        for hrv_data, activity_data, labels in data_loader:
            hrv_data, activity_data, labels = hrv_data.to(device), activity_data.to(device), labels.to(device)
            outputs = model(hrv_data, activity_data)
            all_outputs.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    accuracy, precision, recall, f1 = calculate_metrics(torch.tensor(all_outputs), torch.tensor(all_labels))

    try:
        roc_auc = roc_auc_score(all_labels, all_outputs)
    except ValueError:
        roc_auc = float('nan')

    return accuracy, precision, recall, f1, roc_auc, all_outputs, all_labels

In [None]:
# Experiment runner
_METRIC_NAMES = ('Loss', 'Accuracy', 'Precision', 'Recall', 'F1')


def _named_metrics(prefix, values):
    """Map `(loss, acc, prec, rec, f1)` to `{'<prefix> Loss': ..., ...}`, rounded to 2 decimals."""
    return {f'{prefix} {name}': round(value, 2) for name, value in zip(_METRIC_NAMES, values)}


def run_experiment(config, all_data):
    """Train, validate and test one model for a single configuration.

    Args:
        config: Hyper-parameter dict. Reads 'optimizer_name' ('adam' or 'sgd'),
            'output_channels', 'hidden_size', 'num_classes', 'use_batch_norm',
            'use_dropout', 'dropout_rate' (default 0.5), 'learning_rate',
            'momentum' (SGD only, default 0.0), 'window_size', 'batch_size'
            and 'num_epochs'.
        all_data: Dict with 'train', 'val' and 'test' entries holding the raw
            per-patient recordings.

    Returns:
        Tuple `(history, metrics, model, test_outputs, test_labels)`.
        `history` holds per-epoch train/val loss and accuracy. `metrics`
        merges the final-epoch train and validation metrics with the test
        metrics, all rounded to 2 decimals. `model` is the network as it
        stands after the last epoch.

    Raises:
        ValueError: If 'optimizer_name' is neither 'adam' nor 'sgd'.
    """
    # Reject an unknown optimizer before any work is done; otherwise
    # `optimizer` would simply be left unbound further down.
    optimizer_name = config['optimizer_name']
    if optimizer_name not in ('adam', 'sgd'):
        raise ValueError(f"Unsupported optimizer_name: {optimizer_name!r} (expected 'adam' or 'sgd')")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalADHDNet(
        output_channels=config['output_channels'],
        hidden_size=config['hidden_size'],
        num_classes=config['num_classes'],
        use_batch_norm=config['use_batch_norm'],
        use_dropout=config['use_dropout'],
        # Take the rate from the configuration so the per-config values are actually used.
        dropout_rate=config.get('dropout_rate', 0.5)
    ).to(device)

    if optimizer_name == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=config.get('momentum', 0.0))

    criterion = nn.BCELoss()

    # Windowing depends on config['window_size'], so it is redone for every configuration.
    train_data = preprocessData_create_windows(config, all_data['train'])
    val_data = preprocessData_create_windows(config, all_data['val'])
    test_data = preprocessData_create_windows(config, all_data['test'])

    train_loader = DataLoader(PatientDataset(train_data), batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(PatientDataset(val_data), batch_size=config['batch_size'], shuffle=False)
    test_loader = DataLoader(PatientDataset(test_data), batch_size=config['batch_size'], shuffle=False)

    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    last_train_metrics = None
    last_val_metrics = None

    for epoch in range(config['num_epochs']):
        last_train_metrics = train_epoch(model, train_loader, criterion, optimizer, device)
        last_val_metrics = validate_epoch(model, val_loader, criterion, device)

        history['train_loss'].append(last_train_metrics[0])
        history['train_acc'].append(last_train_metrics[1])
        history['val_loss'].append(last_val_metrics[0])
        history['val_acc'].append(last_val_metrics[1])

    # Only the final epoch is reported; with zero epochs both dicts stay empty.
    final_train_metrics = _named_metrics('Train', last_train_metrics) if last_train_metrics is not None else {}
    final_val_metrics = _named_metrics('Val', last_val_metrics) if last_val_metrics is not None else {}

    test_accuracy, test_precision, test_recall, test_f1, test_roc_auc, test_outputs, test_labels = test_model(model, test_loader, device)
    test_metrics = {
        'Test Accuracy': round(test_accuracy, 2),
        'Test Precision': round(test_precision, 2),
        'Test Recall': round(test_recall, 2),
        'Test F1': round(test_f1, 2),
        'Test ROC-AUC': round(test_roc_auc, 2)
    }

    return history, {**final_train_metrics, **final_val_metrics, **test_metrics}, model, test_outputs, test_labels

In [None]:
# Experiment execution and results
results = []            # one dict per configuration: {'config': ..., <train/val/test metrics>}
config_histories = {}   # config_id -> per-epoch loss/accuracy history
best_val_acc = 0.0
best_model = None
best_config = None
best_history = None
best_test_outputs = None
best_test_labels = None

for config in configurations:
    print(f"Configuration: {config}")
    history, test_results, model, test_outputs, test_labels = run_experiment(config, all_data)
    config_histories[config['config_id']] = history
    results.append({'config': config, **test_results})

    # The model and test outputs returned by run_experiment are those of the
    # final epoch, so rank configurations by final-epoch validation accuracy.
    # Ranking by the best epoch would credit a state that was never kept.
    val_acc = history['val_acc'][-1] if history['val_acc'] else 0.0

    # `best_model is None` guarantees a winner even if every accuracy is 0.0.
    if best_model is None or val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model
        best_config = config
        best_history = history
        best_test_outputs = test_outputs
        best_test_labels = test_labels

In [None]:
# Print and plot best results
# `test_results` still holds the metrics of the LAST configuration run, which
# is not necessarily the best one. Look up the entry recorded for best_config.
best_results = next(entry for entry in results if entry['config'] == best_config)

print(f"Best Configuration: {best_config}")
print(f"Test Metrics:\n")
for key, value in best_results.items():
    if key != 'config':
        print(f"{key}: {value}")

# Left panel: loss per epoch; right panel: accuracy per epoch.
plt.figure(figsize=(12, 5))
for position, (history_key, metric_name) in enumerate((('loss', 'Loss'), ('acc', 'Accuracy')), start=1):
    plt.subplot(1, 2, position)
    plt.plot(best_history[f'train_{history_key}'], label=f'Train {metric_name}')
    plt.plot(best_history[f'val_{history_key}'], label=f'Validation {metric_name}')
    plt.title(f'Best Config {best_config["config_id"]} - {metric_name} Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()

plt.show()

In [None]:
# ROC-AUC curve for the best configuration's test-set outputs
fpr, tpr, thresholds = roc_curve(best_test_labels, best_test_outputs)
roc_auc = roc_auc_score(best_test_labels, best_test_outputs)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Print model summary
# best_model is the network kept for the best-ranked configuration; printing it
# shows the module's text representation (its registered layers).
print(best_model)

In [None]:
# Save results
# One row per configuration: the 'config' column holds the full configuration
# dict, the remaining columns are the rounded train / validation / test metrics.
results_df = pd.DataFrame(results)
# Written relative to the current working directory, without the index column.
results_df.to_csv('results_batch05_best_tunned_windows.csv', index=False)

In [None]:
# Best model summary
# `test_results` holds the metrics of the LAST configuration run, which is not
# necessarily the best one. Use the entry recorded for best_config instead.
best_results = next(entry for entry in results if entry['config'] == best_config)

best_model_summary = {
    "Model Configuration": best_config,
    "Test Metrics": {
        metric_name: best_results[metric_name]
        for metric_name in ('Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test ROC-AUC')
    }
}

# Show every column when the frame is printed (this changes a global pandas option).
pd.set_option('display.max_columns', None)
print(pd.DataFrame(best_model_summary))

In [None]:
# Configuration scores: the metrics table without the nested 'config' column
print("\nAll Configurations' Scores:\n")
results_summary_df = results_df.drop('config', axis=1)
print(results_summary_df)

In [None]:
# Confusion matrix for the best configuration, thresholding its outputs at 0.5
predicted_classes = (best_test_outputs > 0.5).astype(int)
conf_matrix = confusion_matrix(best_test_labels, predicted_classes)

matrix_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
matrix_display.plot()
plt.title('Confusion Matrix for Best Model')
plt.show()