In [1]:
import pandas as pd
import numpy as np
from data_loader import load_data, get_balanced_data, normalize_features

import wandb
wandb.init(project='gene')

data = load_data()
X_balanced, y_balanced = get_balanced_data(data)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmichaelvll[0m. Use [1m`wandb login --relogin`[0m to force relogin


    gene   2mo   9mo  18mo  9mo-2mo  18mo-9mo   9mo/2mo  18mo/9mo  old-young  \
0  Rgs20  0.65  0.60  0.90    -0.05      0.30  0.923077  1.500000       0.25   
1  Sulf1  0.36  0.52  0.56     0.16      0.04  1.444444  1.076923       0.20   
2  Sulf1  0.43  0.59  0.64     0.16      0.05  1.372093  1.084746       0.21   
3   Eya1  0.68  0.62  0.47    -0.06     -0.15  0.911765  0.758065      -0.21   
4   Eya1  0.61  0.37  0.45    -0.24      0.08  0.606557  1.216216      -0.16   

   old/young  distance  
0   1.384615  151241.0  
1   1.555556  121205.0  
2   1.488372  170142.0  
3   0.691176  137980.0  
4   0.737705  138254.0  
      gene  log2(old/young)  distance
0    Itgb5         1.521456   32221.0
1   Begain         1.320315   37592.0
2   BEGAIN         1.320315   36795.0
3     Eya4         1.588159  247781.0
4  Gm27247         1.018367   97190.0
[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0,

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import KFold
# Set random seed
torch.manual_seed(25)

HIDDEN_DIM = 16
NUM_LAYERS = 2
NUM_HEADS = 1
DROPOUT = 0.2
LR = 0.001
OUTPUT_DIM = 3  # number of classes (-1, 0, 1)


# Define the attention-based model
class TransformerModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, num_heads=1, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layers = nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim*2, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.classifier = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.transformer_encoder(x, src_key_padding_mask=~mask.bool())
        
        # Global average pooling
        x = x.mean(dim=1)
        
        output = self.classifier(x)
        return output, None  # Return None for attention weights as they're not directly accessible

# Custom dataset
class GeneDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        gene_data = torch.FloatTensor(self.data[idx])
        label = torch.LongTensor([self.labels[idx] + 1])  # Add 1 to shift labels to 0, 1, 2
        mask = torch.ones(len(gene_data))
        return gene_data, label, mask

# Collate function for DataLoader
def collate_fn(batch):
    # Sort the batch by sequence length (descending)
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, labels, masks = zip(*batch)
    
    # Get lengths of each sequence
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    
    # Pad sequences
    padded_seqs = torch.zeros(len(sequences), max_len, sequences[0].size(1))
    padded_masks = torch.zeros(len(sequences), max_len)
    
    for i, (seq, length) in enumerate(zip(sequences, lengths)):
        padded_seqs[i, :length] = seq
        padded_masks[i, :length] = 1
    
    return padded_seqs, torch.cat(labels), padded_masks

def train_model(X_train_normalized, y_train_raw, X_test_normalized, y_test_raw):
    input_dim = len(X_train_normalized[0][0])
    # Create datasets and dataloaders
    train_dataset = GeneDataset(X_train_normalized, y_train_raw)
    test_dataset = GeneDataset(X_test_normalized, y_test_raw)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Initialize the model
    model = TransformerModel(input_dim, HIDDEN_DIM, OUTPUT_DIM, num_layers=NUM_LAYERS, num_heads=NUM_HEADS, dropout=DROPOUT)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)


    # Training loop
    num_epochs = 50
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.0001)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_correct = 0
        train_total = 0
        for batch_x, batch_y, batch_mask in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(batch_x, batch_mask)
            loss = criterion(outputs, batch_y.squeeze())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == batch_y.squeeze()).sum().item()
            train_total += batch_y.size(0)
        # scheduler.step()

        # scheduler.step()
        # Evaluation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_x, batch_y, batch_mask in test_loader:
                outputs, _ = model(batch_x, batch_mask)
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y.squeeze()).sum().item()
        
        accuracy = correct / total
        # print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_correct/train_total:.4f}, Test Accuracy: {accuracy:.4f}')
        wandb.log({'train_loss': total_loss/len(train_loader), 'train_accuracy': train_correct/train_total, 'test_accuracy': accuracy})

    # Final evaluation
    model.eval()
    all_attention_weights = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y, batch_mask in test_loader:
            outputs, _ = model(batch_x, batch_mask)
            _, predicted = torch.max(outputs.data, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    # Print final accuracy
    final_accuracy = sum(np.array(all_predictions) == np.array(all_labels).squeeze()) / len(all_labels)
    print(f'Final Test Accuracy: {final_accuracy:.4f}')
    return final_accuracy


In [3]:
from tqdm import tqdm

kf = KFold(n_splits=5, shuffle=True, random_state=25)
accuracies = []
for train_index, test_index in tqdm(kf.split(X_balanced['mcg']), total=5):
    X_train, X_test = [X_balanced['mcg'][i] for i in train_index], [X_balanced['mcg'][i] for i in test_index]
    y_train, y_test = [y_balanced[i] for i in train_index], [y_balanced[i] for i in test_index]
    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
    accuracies.append(train_model(X_train_normalized, y_train, X_test_normalized, y_test))
print(f'Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')


 20%|██        | 1/5 [00:21<01:24, 21.04s/it]

Final Test Accuracy: 0.5525


 40%|████      | 2/5 [00:42<01:03, 21.04s/it]

Final Test Accuracy: 0.5000


 60%|██████    | 3/5 [01:04<00:43, 21.70s/it]

Final Test Accuracy: 0.5031


 80%|████████  | 4/5 [01:25<00:21, 21.24s/it]

Final Test Accuracy: 0.5031


100%|██████████| 5/5 [01:48<00:00, 21.68s/it]

Final Test Accuracy: 0.4784
Mean Accuracy: 0.5074 ± 0.0243





In [4]:
from tqdm import tqdm

kf = KFold(n_splits=5, shuffle=True, random_state=25)
accuracies = []
for train_index, test_index in tqdm(kf.split(X_balanced['atac']), total=5):
    X_train, X_test = [X_balanced['atac'][i] for i in train_index], [X_balanced['atac'][i] for i in test_index]
    y_train, y_test = [y_balanced[i] for i in train_index], [y_balanced[i] for i in test_index]
    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
    accuracies.append(train_model(X_train_normalized, y_train, X_test_normalized, y_test))
print(f'Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}')


 20%|██        | 1/5 [00:26<01:46, 26.65s/it]

Final Test Accuracy: 0.5617


 40%|████      | 2/5 [00:52<01:18, 26.08s/it]

Final Test Accuracy: 0.5432


 60%|██████    | 3/5 [01:16<00:50, 25.34s/it]

Final Test Accuracy: 0.5247


 80%|████████  | 4/5 [01:44<00:26, 26.32s/it]

Final Test Accuracy: 0.5463


100%|██████████| 5/5 [02:08<00:00, 25.64s/it]

Final Test Accuracy: 0.5185
Mean Accuracy: 0.5389 ± 0.0156



