In [28]:
import pandas as pd
import numpy as np

import wandb
wandb.init(project='gene')

df = pd.read_csv('data/Oligo_NN.RNA_DEG.csv')
df.set_index('gene', inplace=True)
df.head()

# non_zero_genes = df[df['DEG'] != 0].index

# df = df[df.index.isin(non_zero_genes)]
gene2value = df[['DEG']]

MCG_FEATURE_NAMES = ['2mo', '9mo', '18mo', '9mo-2mo', '18mo-9mo', '9mo/2mo', '18mo/9mo', 'old-young', 'old/young', 'distance']
ATAC_FEATURE_NAMES = ['log2(old/young)', 'distance']


In [29]:
# Process mcg data
mcg = pd.read_csv('data/Oligo_NN.aDMR_gene.csv')
mcg_feat = mcg
mcg_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
mcg_feat['distance'] = (mcg_feat['gene_start'] - mcg_feat['start']).abs().astype(np.float64)
mcg_feat['old/young'] = mcg_feat['18mo'] / mcg_feat['2mo']
mcg_feat['9mo-2mo'] = mcg_feat['9mo'] - mcg_feat['2mo']
mcg_feat['18mo-9mo'] = mcg_feat['18mo'] - mcg_feat['9mo']
mcg_feat['9mo/2mo'] = mcg_feat['9mo'] / mcg_feat['2mo']
mcg_feat['18mo/9mo'] = mcg_feat['18mo'] / mcg_feat['9mo']
mcg_feat.fillna(0, inplace=True)
mcg_feat = mcg_feat[['gene', *MCG_FEATURE_NAMES]]
print(mcg_feat.head())


    gene   2mo   9mo  18mo  9mo-2mo  18mo-9mo   9mo/2mo  18mo/9mo  old-young  \
0  Rgs20  0.65  0.60  0.90    -0.05      0.30  0.923077  1.500000       0.25   
1  Sulf1  0.36  0.52  0.56     0.16      0.04  1.444444  1.076923       0.20   
2  Sulf1  0.43  0.59  0.64     0.16      0.05  1.372093  1.084746       0.21   
3   Eya1  0.68  0.62  0.47    -0.06     -0.15  0.911765  0.758065      -0.21   
4   Eya1  0.61  0.37  0.45    -0.24      0.08  0.606557  1.216216      -0.16   

   old/young  distance  
0   1.384615  151241.0  
1   1.555556  121205.0  
2   1.488372  170142.0  
3   0.691176  137980.0  
4   0.737705  138254.0  


In [30]:
atac = pd.read_csv('data/Oligo_NN.ATAC_gene.csv')
atac_feat = atac
atac_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
atac_feat['distance'] = (atac_feat['gene_start'] - atac_feat['start']).abs().astype(np.float64)
atac_feat = atac_feat[['gene', *ATAC_FEATURE_NAMES]]
print(atac_feat.head())

      gene  log2(old/young)  distance
0    Itgb5         1.521456   32221.0
1   Begain         1.320315   37592.0
2   BEGAIN         1.320315   36795.0
3     Eya4         1.588159  247781.0
4  Gm27247         1.018367   97190.0


In [31]:
print(f'Total genes: {len(gene2value)}', f'unique genes: {len(set(gene2value.index))}')
print(f'genes: {gene2value.index}')

Total genes: 17783 unique genes: 17783
genes: Index(['Xkr4', 'Gm1992', 'Gm19938', 'Mrpl15', 'Lypla1', 'Tcea1', 'Rgs20',
       'Atp6v1h', 'Oprk1', 'Rb1cc1',
       ...
       'Smim23', 'Gm31126', 'Gm13546', 'Gm16350', 'Gm26784', 'Gm34654',
       'Gm29825', 'AI463170', 'E330034G19Rik', 'B130011K05Rik'],
      dtype='object', name='gene', length=17783)


In [32]:
# mcg_mean = mcg_feat.groupby('gene').mean()
# # Sort mcg_mean by gene name
# mcg_mean = mcg_mean.loc[gene2value.index]
# atac_mean = atac_feat.groupby('gene').mean()
# atac_mean = atac_mean.loc[gene2value.index]

In [33]:
# print('mcg corr:', mcg_mean.corrwith(gene2value['DEG']))
# print('atac corr:', atac_mean.corrwith(gene2value['DEG']))
index_order = gene2value.index.tolist()

In [34]:
# Train a sequence model on mcg_feat to predict gene2value['log2(old/young)']
# Each gene has a sequence of 4 features, 2mo, 9mo, 18mo, old-young
# The sequence length is not fixed, so we need to use a dynamic model
# Let's use a commonly used sequence prediction model for sentence classification
# like LSTM or Transformer

# Step 1: Prepare the data
list_mcg_feat = mcg_feat.groupby('gene').apply(lambda x: x[MCG_FEATURE_NAMES].values.tolist())
list_mcg_feat = list_mcg_feat.reindex(index_order, fill_value=[[0] * len(MCG_FEATURE_NAMES)])
x_mcg = list_mcg_feat.values.tolist()

list_atac_feat = atac_feat.groupby('gene').apply(lambda x: x[ATAC_FEATURE_NAMES].values.tolist())
list_atac_feat = list_atac_feat.reindex(index_order, fill_value=[[0] * len(ATAC_FEATURE_NAMES)])
x_atac = list_atac_feat.values.tolist()
print(x_mcg[:10])


  list_mcg_feat = mcg_feat.groupby('gene').apply(lambda x: x[MCG_FEATURE_NAMES].values.tolist())


[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0.65, 0.6, 0.9, -0.050000000000000044, 0.30000000000000004, 0.923076923076923, 1.5, 0.25, 1.3846153846153846, 151241.0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]


  list_atac_feat = atac_feat.groupby('gene').apply(lambda x: x[ATAC_FEATURE_NAMES].values.tolist())


In [35]:
y = gene2value.loc[list_mcg_feat.index]['DEG'].values.tolist()
y = np.array([int(i) for i in y])
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
# Step 2: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
import numpy as np

# set random seed
np.random.seed(25)

# Separate the data into zero and non-zero y values
zero_indices = np.where(y == 0)[0]
non_zero_indices = np.where(y != 0)[0]
print(f'zero: {len(zero_indices)}, non-zero: {len(non_zero_indices)}')

# Sample len(non_zero_indices) indices from each group
n_samples = len(non_zero_indices)
sampled_zero_indices = np.random.choice(zero_indices, n_samples // 2, replace=False)
sampled_non_zero_indices = np.random.choice(non_zero_indices, n_samples, replace=False)

# Combine the sampled indices
sampled_indices = np.concatenate([sampled_zero_indices, sampled_non_zero_indices])

# Create balanced dataset
X_balanced = {}
X_balanced['mcg'] = [x_mcg[i] for i in sampled_indices]
X_balanced['atac'] = [x_atac[i] for i in sampled_indices]
y_balanced = y[sampled_indices]

zero: 16703, non-zero: 1080


In [49]:
# Normalization function
def normalize_features(train_data, test_data):
    # Flatten the lists for easier processing
    train_flat = [item for sublist in train_data for item in sublist]
    test_flat = [item for sublist in test_data for item in sublist]
    feature_dim = len(train_flat[0])
    # Separate features
    train_other_features = np.array([item[:feature_dim-1] for item in train_flat])
    train_distances = np.array([item[feature_dim-1] for item in train_flat])
    test_other_features = np.array([item[:feature_dim-1] for item in test_flat])
    test_distances = np.array([item[feature_dim-1] for item in test_flat])
    
    # Normalize other features using min-max scaling based on train data
    min_vals = np.min(train_other_features, axis=0)
    max_vals = np.max(train_other_features, axis=0)
    train_normalized_features = (train_other_features - min_vals) / (max_vals - min_vals)
    test_normalized_features = (test_other_features - min_vals) / (max_vals - min_vals)
    
    # Normalize distances using log transformation and then min-max scaling based on train data
    train_log_distances = np.log1p(train_distances)
    test_log_distances = np.log1p(test_distances)
    min_dist = np.min(train_log_distances)
    max_dist = np.max(train_log_distances)
    train_normalized_distances = (train_log_distances - min_dist) / (max_dist - min_dist)
    test_normalized_distances = (test_log_distances - min_dist) / (max_dist - min_dist)
    
    # Combine normalized features and distances
    def reconstruct_data(features, distances, original_data):
        normalized_data = []
        idx = 0
        for sublist in original_data:
            normalized_sublist = []
            for _ in sublist:
                normalized_sublist.append(list(features[idx][:feature_dim-1]) + [distances[idx]])
                idx += 1
            normalized_data.append(normalized_sublist)
        return normalized_data
    
    train_normalized = reconstruct_data(train_normalized_features, train_normalized_distances, train_data)
    test_normalized = reconstruct_data(test_normalized_features, test_normalized_distances, test_data)
    
    return train_normalized, test_normalized




In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import KFold
# Set random seed
torch.manual_seed(25)

HIDDEN_DIM = 16
NUM_LAYERS = 2
NUM_HEADS = 1
DROPOUT = 0.2
LR = 0.001
OUTPUT_DIM = 3  # number of classes (-1, 0, 1)


# Define the attention-based model
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, num_heads=1, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layers = nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim*2, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.classifier = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.transformer_encoder(x, src_key_padding_mask=~mask.bool())
        
        # Global average pooling
        x = x.mean(dim=1)
        
        output = self.classifier(x)
        return output, None  # Return None for attention weights as they're not directly accessible

# Custom dataset
class GeneDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        gene_data = torch.FloatTensor(self.data[idx])
        label = torch.LongTensor([self.labels[idx] + 1])  # Add 1 to shift labels to 0, 1, 2
        mask = torch.ones(len(gene_data))
        return gene_data, label, mask

# Collate function for DataLoader
def collate_fn(batch):
    # Sort the batch by sequence length (descending)
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, labels, masks = zip(*batch)
    
    # Get lengths of each sequence
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    
    # Pad sequences
    padded_seqs = torch.zeros(len(sequences), max_len, sequences[0].size(1))
    padded_masks = torch.zeros(len(sequences), max_len)
    
    for i, (seq, length) in enumerate(zip(sequences, lengths)):
        padded_seqs[i, :length] = seq
        padded_masks[i, :length] = 1
    
    return padded_seqs, torch.cat(labels), padded_masks

def train_model(X_train_normalized, y_train_raw, X_test_normalized, y_test_raw):
    input_dim = len(X_train_normalized[0][0])
    # Create datasets and dataloaders
    train_dataset = GeneDataset(X_train_normalized, y_train_raw)
    test_dataset = GeneDataset(X_test_normalized, y_test_raw)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Initialize the model
    model = TransformerModel(input_dim, HIDDEN_DIM, OUTPUT_DIM, num_layers=NUM_LAYERS, num_heads=NUM_HEADS, dropout=DROPOUT)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)


    # Training loop
    num_epochs = 50
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.0001)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_correct = 0
        train_total = 0
        for batch_x, batch_y, batch_mask in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(batch_x, batch_mask)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == batch_y.squeeze()).sum().item()
            train_total += batch_y.size(0)
        # scheduler.step()

        # scheduler.step()
        # Evaluation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_x, batch_y, batch_mask in test_loader:
                outputs, _ = model(batch_x, batch_mask)
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y.squeeze()).sum().item()
        
        accuracy = correct / total
        # print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_correct/train_total:.4f}, Test Accuracy: {accuracy:.4f}')

    # Final evaluation
    model.eval()
    all_attention_weights = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y, batch_mask in test_loader:
            outputs, _ = model(batch_x, batch_mask)
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    # Print final accuracy
    final_accuracy = sum(np.array(all_predictions) == np.array(all_labels).squeeze()) / len(all_labels)
    print(f'Final Test Accuracy: {final_accuracy:.4f}')
    return final_accuracy


In [51]:
from tqdm import tqdm

kf = KFold(n_splits=5, shuffle=True, random_state=25)
accuracies = []
for train_index, test_index in tqdm(kf.split(X_balanced['mcg']), total=5):
    X_train, X_test = [X_balanced['mcg'][i] for i in train_index], [X_balanced['mcg'][i] for i in test_index]
    y_train, y_test = [y_balanced[i] for i in train_index], [y_balanced[i] for i in test_index]
    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
    accuracies.append(train_model(X_train_normalized, y_train, X_test_normalized, y_test))
print(f'Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')


 20%|██        | 1/5 [00:20<01:23, 20.91s/it]

Final Test Accuracy: 0.5247


 40%|████      | 2/5 [00:39<00:59, 19.78s/it]

Final Test Accuracy: 0.5062


 60%|██████    | 3/5 [00:59<00:39, 19.58s/it]

Final Test Accuracy: 0.5062


 80%|████████  | 4/5 [01:19<00:19, 19.71s/it]

Final Test Accuracy: 0.5309


100%|██████████| 5/5 [01:38<00:00, 19.73s/it]

Final Test Accuracy: 0.4630
Mean Accuracy: 0.5062 ± 0.0237





In [52]:
from tqdm import tqdm

kf = KFold(n_splits=5, shuffle=True, random_state=25)
accuracies = []
for train_index, test_index in tqdm(kf.split(X_balanced['atac']), total=5):
    X_train, X_test = [X_balanced['atac'][i] for i in train_index], [X_balanced['atac'][i] for i in test_index]
    y_train, y_test = [y_balanced[i] for i in train_index], [y_balanced[i] for i in test_index]
    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
    accuracies.append(train_model(X_train_normalized, y_train, X_test_normalized, y_test))
print(f'Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')


 20%|██        | 1/5 [00:19<01:16, 19.07s/it]

Final Test Accuracy: 0.5617


 40%|████      | 2/5 [00:38<00:57, 19.24s/it]

Final Test Accuracy: 0.5340


 60%|██████    | 3/5 [00:57<00:38, 19.31s/it]

Final Test Accuracy: 0.5432


 80%|████████  | 4/5 [01:17<00:19, 19.47s/it]

Final Test Accuracy: 0.5617


100%|██████████| 5/5 [01:36<00:00, 19.24s/it]

Final Test Accuracy: 0.4907
Mean Accuracy: 0.5383 ± 0.0261





In [55]:


class TwoHeadTransformerModel(nn.Module):
    def __init__(self, mcg_input_dim, atac_input_dim, hidden_dim, output_dim, num_layers=2, num_heads=1, dropout=0.1):
        super(TwoHeadTransformerModel, self).__init__()
        self.mcg_embedding = nn.Linear(mcg_input_dim, hidden_dim)
        self.atac_embedding = nn.Linear(atac_input_dim, hidden_dim)
        
        encoder_layers = nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim*2, dropout=dropout, batch_first=True)
        self.mcg_transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.atac_transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        
        self.classifier = nn.Linear(hidden_dim * 2, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, mcg_x, mcg_mask, atac_x, atac_mask):
        mcg_x = self.mcg_embedding(mcg_x)
        atac_x = self.atac_embedding(atac_x)
        
        mcg_x = self.mcg_transformer(mcg_x, src_key_padding_mask=~mcg_mask.bool())
        atac_x = self.atac_transformer(atac_x, src_key_padding_mask=~atac_mask.bool())
        
        # Global average pooling
        mcg_x = mcg_x.mean(dim=1)
        atac_x = atac_x.mean(dim=1)
        
        # Concatenate MCG and ATAC embeddings
        combined_x = torch.cat((mcg_x, atac_x), dim=1)
        
        output = self.classifier(combined_x)
        return output

class CombinedGeneDataset(Dataset):
    def __init__(self, mcg_data, atac_data, labels):
        self.mcg_data = mcg_data
        self.atac_data = atac_data
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        mcg_gene_data = torch.FloatTensor(self.mcg_data[idx])
        atac_gene_data = torch.FloatTensor(self.atac_data[idx])
        label = torch.LongTensor([self.labels[idx] + 1])  # Add 1 to shift labels to 0, 1, 2
        mcg_mask = torch.ones(len(mcg_gene_data))
        atac_mask = torch.ones(len(atac_gene_data))
        return mcg_gene_data, atac_gene_data, label, mcg_mask, atac_mask

def combined_collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    mcg_sequences, atac_sequences, labels, mcg_masks, atac_masks = zip(*batch)
    
    mcg_lengths = [len(seq) for seq in mcg_sequences]
    atac_lengths = [len(seq) for seq in atac_sequences]
    mcg_max_len = max(mcg_lengths)
    atac_max_len = max(atac_lengths)
    
    padded_mcg_seqs = torch.zeros(len(mcg_sequences), mcg_max_len, mcg_sequences[0].size(1))
    padded_atac_seqs = torch.zeros(len(atac_sequences), atac_max_len, atac_sequences[0].size(1))
    padded_mcg_masks = torch.zeros(len(mcg_sequences), mcg_max_len)
    padded_atac_masks = torch.zeros(len(atac_sequences), atac_max_len)
    
    for i, (mcg_seq, atac_seq, mcg_length, atac_length) in enumerate(zip(mcg_sequences, atac_sequences, mcg_lengths, atac_lengths)):
        padded_mcg_seqs[i, :mcg_length] = mcg_seq
        padded_atac_seqs[i, :atac_length] = atac_seq
        padded_mcg_masks[i, :mcg_length] = 1
        padded_atac_masks[i, :atac_length] = 1
    
    return padded_mcg_seqs, padded_atac_seqs, torch.cat(labels), padded_mcg_masks, padded_atac_masks

def train_combined_model(X_train_mcg, X_train_atac, y_train, X_test_mcg, X_test_atac, y_test):
    wandb.init(project='gene')
    mcg_input_dim = len(X_train_mcg[0][0])
    atac_input_dim = len(X_train_atac[0][0])
    
    train_dataset = CombinedGeneDataset(X_train_mcg, X_train_atac, y_train)
    test_dataset = CombinedGeneDataset(X_test_mcg, X_test_atac, y_test)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=combined_collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=combined_collate_fn)

    model = TwoHeadTransformerModel(mcg_input_dim, atac_input_dim, HIDDEN_DIM, OUTPUT_DIM, num_layers=NUM_LAYERS, num_heads=NUM_HEADS, dropout=DROPOUT)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)

    num_epochs = 50
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.0001)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_correct = 0
        train_total = 0
        for mcg_x, atac_x, batch_y, mcg_mask, atac_mask in train_loader:
            optimizer.zero_grad()
            outputs = model(mcg_x, mcg_mask, atac_x, atac_mask)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            total_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == batch_y.squeeze()).sum().item()
            train_total += batch_y.size(0)

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for mcg_x, atac_x, batch_y, mcg_mask, atac_mask in test_loader:
                outputs = model(mcg_x, mcg_mask, atac_x, atac_mask)
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y.squeeze()).sum().item()
        
        accuracy = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_correct/train_total:.4f}, Test Accuracy: {accuracy:.4f}')
        wandb.log({'train_loss': total_loss/len(train_loader), 'train_accuracy': train_correct/train_total, 'test_accuracy': accuracy})

    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for mcg_x, atac_x, batch_y, mcg_mask, atac_mask in test_loader:
            outputs = model(mcg_x, mcg_mask, atac_x, atac_mask)
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    final_accuracy = sum(np.array(all_predictions) == np.array(all_labels).squeeze()) / len(all_labels)
    print(f'Final Test Accuracy: {final_accuracy:.4f}')
    return final_accuracy

In [56]:

kf = KFold(n_splits=5, shuffle=True, random_state=25)
accuracies = []
for train_index, test_index in tqdm(kf.split(X_balanced['mcg']), total=5):
    X_train_mcg, X_test_mcg = [X_balanced['mcg'][i] for i in train_index], [X_balanced['mcg'][i] for i in test_index]
    X_train_atac, X_test_atac = [X_balanced['atac'][i] for i in train_index], [X_balanced['atac'][i] for i in test_index]
    y_train, y_test = [y_balanced[i] for i in train_index], [y_balanced[i] for i in test_index]
    
    X_train_mcg_normalized, X_test_mcg_normalized = normalize_features(X_train_mcg, X_test_mcg)
    X_train_atac_normalized, X_test_atac_normalized = normalize_features(X_train_atac, X_test_atac)
    
    accuracies.append(train_combined_model(X_train_mcg_normalized, X_train_atac_normalized, y_train, 
                                           X_test_mcg_normalized, X_test_atac_normalized, y_test))

print(f'Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch [1/50], Train Loss: 1.0521, Train Accuracy: 0.4421, Test Accuracy: 0.5216
Epoch [2/50], Train Loss: 1.0077, Train Accuracy: 0.4861, Test Accuracy: 0.5278
Epoch [3/50], Train Loss: 1.0000, Train Accuracy: 0.4900, Test Accuracy: 0.5309
Epoch [4/50], Train Loss: 0.9922, Train Accuracy: 0.4946, Test Accuracy: 0.5370
Epoch [5/50], Train Loss: 0.9863, Train Accuracy: 0.5023, Test Accuracy: 0.5000
Epoch [6/50], Train Loss: 0.9902, Train Accuracy: 0.4969, Test Accuracy: 0.5463
Epoch [7/50], Train Loss: 0.9779, Train Accuracy: 0.5147, Test Accuracy: 0.5401
Epoch [8/50], Train Loss: 0.9692, Train Accuracy: 0.5224, Test Accuracy: 0.5525
Epoch [9/50], Train Loss: 0.9632, Train Accuracy: 0.5162, Test Accuracy: 0.5525
Epoch [10/50], Train Loss: 0.9645, Train Accuracy: 0.5131, Test Accuracy: 0.5525
Epoch [11/50], Train Loss: 0.9620, Train Accuracy: 0.5170, Test Accuracy: 0.5463
Epoch [12/50], Train Loss: 0.9691, Train Accuracy: 0.5208, Test Accuracy: 0.5586
Epoch [13/50], Train Loss: 0.9579, Tr

 20%|██        | 1/5 [00:36<02:27, 36.87s/it]

Epoch [50/50], Train Loss: 0.9318, Train Accuracy: 0.5463, Test Accuracy: 0.5772
Final Test Accuracy: 0.5772
Epoch [1/50], Train Loss: 1.0685, Train Accuracy: 0.4313, Test Accuracy: 0.4846
Epoch [2/50], Train Loss: 1.0088, Train Accuracy: 0.4815, Test Accuracy: 0.4784
Epoch [3/50], Train Loss: 0.9925, Train Accuracy: 0.4869, Test Accuracy: 0.5123
Epoch [4/50], Train Loss: 0.9881, Train Accuracy: 0.4923, Test Accuracy: 0.5123
Epoch [5/50], Train Loss: 0.9924, Train Accuracy: 0.4923, Test Accuracy: 0.5154
Epoch [6/50], Train Loss: 0.9875, Train Accuracy: 0.4846, Test Accuracy: 0.5093
Epoch [7/50], Train Loss: 0.9823, Train Accuracy: 0.5023, Test Accuracy: 0.5185
Epoch [8/50], Train Loss: 0.9835, Train Accuracy: 0.5015, Test Accuracy: 0.5216
Epoch [9/50], Train Loss: 0.9690, Train Accuracy: 0.5162, Test Accuracy: 0.5154
Epoch [10/50], Train Loss: 0.9667, Train Accuracy: 0.5069, Test Accuracy: 0.5370
Epoch [11/50], Train Loss: 0.9704, Train Accuracy: 0.5100, Test Accuracy: 0.5062
Epoch [12

 20%|██        | 1/5 [00:54<03:36, 54.14s/it]


KeyboardInterrupt: 