In [1]:
# To install the needed packages, uncomment the following two lines:
print("START")
!python3 -m pip install -r requirements.txt
!python3 -m pip install lightning
# NOTE: Be sure to also install git lfs! Installation instructions can be found here:
# https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=mac

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the '/opt/apps/intel19/python3/3.9.7/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable




You should consider upgrading via the '/opt/apps/intel19/python3/3.9.7/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# the name of the pretrained model (see possible_pretrained_names below for options), use None if 
# training from scratch
pretrained_model_name = "hyenadna-tiny-1k-seqlen" 
# path to your training data
train_data_path = '/work/09855/hurleyqi/ls6/temp/enhancer_prediction_model_yilab/tewhey_data_parser/tewhey_data_training_set'
# path to your test data
test_data_path = '/work/09855/hurleyqi/ls6/temp/enhancer_prediction_model_yilab/tewhey_data_parser/tewhey_data_testing_set'  
# number of labels (regression heads) you want to use
num_labels = 3

# the ratio of train data you want to include, where 0.0 is nothing and 1.0 is everything 
# can be made smaller to make testing faster
train_data_subset_ratio = 1.0
# the ratio of test data you want to include, where 0.0 is nothing and 1.0 is everything
# can be made smaller to make testing faster
test_data_subset_ratio = 1.0

In [3]:
print("checks")
# checks
import os
def is_valid_path(path):
    return os.path.exists(path)
possible_pretrained_names = ['hyenadna-tiny-1k-seqlen', 
                             'hyenadna-small-32k-seqlen', 
                             'hyenadna-medium-160k-seqlen', 
                             'hyenadna-medium-450k-seqlen', 
                             'hyenadna-large-1m-seqlen']

assert pretrained_model_name == None or \
       pretrained_model_name in possible_pretrained_names
assert is_valid_path(train_data_path)
assert is_valid_path(test_data_path)
for ratio in [train_data_subset_ratio, test_data_subset_ratio]:
    assert ratio >= 0.0 and ratio <= 1.0
assert num_labels > 0

In [4]:
from lightning import LightningModule
import torch
from transformers import DataCollatorWithPadding
import wandb
import evaluate
import datasets
from datasets import load_dataset
from scipy.stats import pearsonr
from torch.utils.data import DataLoader, Dataset, Subset
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np

In [5]:
# these take a little longer so they are in a separate cell
from huggingface import HyenaDNAPreTrainedModel
from standalone_hyenadna import CharacterTokenizer

In [6]:
class CustomDNADataset(Dataset):
    def __init__(self, csv_file, tokenizer, num_labels, max_length=1000, use_padding=True):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.use_padding = use_padding

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sequence = self.data.iloc[idx, 0]
        label_data = []
        for i in range(num_labels):
            label_data.append(self.data.iloc[idx, i+1])

        # tokenize sequence
        tokenized_sequence = self.tokenizer(sequence, padding=self.use_padding, truncation=True, max_length=self.max_length, return_tensors="pt")
        
        sample = {'input_ids': tokenized_sequence['input_ids'][0]}
        for i, label in enumerate(label_data):
            sample[f'label{i+1}'] = torch.tensor(label)

        return sample


In [7]:
# Create a dataloader that only has 1/10th of the training data set
def create_subset_loader(ds, batch_size, shuffle=False, subset_fraction=1.0):
    torch.manual_seed(0)
    subset_size = int(len(ds) * subset_fraction)
    indices = list(range(len(ds)))
    np.random.shuffle(indices)
    subset_indices = indices[:subset_size]
    ds_subset = Subset(ds, subset_indices)
    train_loader = DataLoader(ds_subset, batch_size=batch_size, shuffle=shuffle)
    return train_loader


In [8]:
def train(model, device, train_loader, optimizer, epoch, loss_fn, num_labels, log_interval=100):
    """Training loop."""
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        data = batch['input_ids'].to(device).long()
        targets_arr = []
        for i in range(num_labels):
            targets_arr.append(batch[f'label{i+1}'].to(device).float())
        optimizer.zero_grad()
        output = model(data)
        targets = torch.stack([target for target in targets_arr], dim=1)
        loss = loss_fn(output, targets)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, loss_fn, num_labels):
    """Test loop."""
    model.eval()
    test_loss = 0
    correct = 0
    all_preds = [[] for _ in range(num_labels)]
    all_labels = [[] for _ in range(num_labels)]
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            data = batch['input_ids'].to(device).long()
            targets_arr = []
            for i in range(num_labels):
                targets_arr.append(batch[f'label{i+1}'].to(device).float())
            targets = torch.stack([target for target in targets_arr], dim=1)
            output = model(data)
            test_loss += loss_fn(output, targets).item()  # sum up batch loss
            # collect all predictions and actual labels
            preds = output.detach().cpu().numpy()
            labels = targets.detach().cpu().numpy()
            for i in range(num_labels):
                all_preds[i].extend(preds[:, i])
                all_labels[i].extend(labels[:, i])

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}'.format(test_loss))
    
    # calculate and print Pearson correlation coefficient for each target
    for i in range(num_labels):
        r_value, p_value = pearsonr(np.array(all_preds[i]), np.array(all_labels[i]))
        print(f'Label {i+1} PCC: {r_value:.4f}')

In [9]:
import json
import os
import subprocess
import transformers
from transformers import PreTrainedModel, AutoModelForCausalLM, PretrainedConfig
import torch.nn as nn

def run_train(num_labels):
    print("start train")
    # experiment settings:
    num_epochs = 100 # ~100 seems fine
    batch_size = 256
    learning_rate = 6e-4  # good default for Hyena
    weight_decay = 0.1

    # these are used for the regression head
    use_head = True
    n_classes = num_labels

    # you can override with your own backbone config here if you want,
    # otherwise we'll load the HF one by default
    backbone_cfg = None

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("Using device:", device)

    # instantiate the model (pretrained here)
    if pretrained_model_name in ['hyenadna-tiny-1k-seqlen', 
                                 'hyenadna-small-32k-seqlen', 
                                 'hyenadna-medium-160k-seqlen', 
                                 'hyenadna-medium-450k-seqlen', 
                                 'hyenadna-large-1m-seqlen']:
        # use the pretrained Huggingface wrapper instead
        model = HyenaDNAPreTrainedModel.from_pretrained(
            './checkpoints',
            pretrained_model_name,
            download=False,
            config=backbone_cfg,
            device=device,
            use_head=use_head,
            n_classes=n_classes,
        )

    # from scratch
    else:
        model = HyenaDNAPreTrainedModel(**backbone_cfg, use_head=use_head, n_classes=n_classes)
        
    max_lengths = {
        'hyenadna-tiny-1k-seqlen': 1024,
        'hyenadna-small-32k-seqlen': 32768,
        'hyenadna-medium-160k-seqlen': 160000,
        'hyenadna-medium-450k-seqlen': 450000,  
        'hyenadna-large-1m-seqlen': 1_000_000,
    }

    # create tokenizer
    tokenizer = CharacterTokenizer(
        characters=['A', 'C', 'G', 'T', 'N'],  # add DNA characters, N is uncertain
        model_max_length=max_lengths[pretrained_model_name], 
        add_special_tokens=False,  # we handle special tokens elsewhere
        padding_side='left', # since HyenaDNA is causal, we pad on the left
    )

    ds_train = CustomDNADataset(train_data_path, tokenizer, max_lengths[pretrained_model_name])
    ds_test = CustomDNADataset(test_data_path, tokenizer, max_lengths[pretrained_model_name])
    
    train_loader = create_subset_loader(ds_train, batch_size, shuffle=True, subset_fraction=train_data_subset_ratio)
    test_loader = create_subset_loader(ds_test, batch_size=batch_size, shuffle=False, subset_fraction=test_data_subset_ratio)

    # loss function
    loss_fn = nn.MSELoss()

    # create optimizer
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    model.to(device)
    for epoch in range(num_epochs):
        train(model, device, train_loader, optimizer, epoch, loss_fn, num_labels)
        test(model, device, test_loader, loss_fn, num_labels)
        optimizer.step()


In [None]:
run_train(num_labels)

Using device: cuda
pretrained model name:  ./checkpoints/hyenadna-tiny-1k-seqlen
download val  False
Loaded pretrained weights ok!

Test set: Average loss: 0.0057
Label 1 PCC: 0.6026
Label 2 PCC: 0.7131

Test set: Average loss: 0.0054
Label 1 PCC: 0.6327
Label 2 PCC: 0.7323

Test set: Average loss: 0.0052
Label 1 PCC: 0.6509
Label 2 PCC: 0.7444

Test set: Average loss: 0.0053
Label 1 PCC: 0.6460
Label 2 PCC: 0.7378

Test set: Average loss: 0.0054
Label 1 PCC: 0.6402
Label 2 PCC: 0.7326

Test set: Average loss: 0.0054
Label 1 PCC: 0.6315
Label 2 PCC: 0.7285


In [None]:
# after getting results try with the average embedding strategy