In [1]:
import os
import torch
import numpy as np
import sklearn.metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
from torch.utils.data import Dataset, DataLoader
from genomic_benchmarks.data_check import list_datasets
from genomic_benchmarks.loc2seq import download_dataset
from pathlib import Path
from tqdm import tqdm

class GenomicDataset(Dataset):
    """Dataset that tokenizes one sequence per lookup.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. Tokenization happens lazily in ``__getitem__``, so nothing is
    encoded until an index is requested.

    Args:
        sequences: Indexable collection of raw sequences passed to the tokenizer.
        labels: Indexable collection of integer class labels, aligned with
            ``sequences`` by position.
        tokenizer: Callable invoked as ``tokenizer(sequence, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` whose
            result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, sequences, labels, tokenizer, max_length=512):
        self.sequences, self.labels = sequences, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of sequences held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Tokenize the sequence at ``idx`` and bundle it with its label."""
        raw_sequence, target = self.sequences[idx], self.labels[idx]
        tokenized = self.tokenizer(
            raw_sequence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) removes the leading size-1 dimension of each tensor
        # (presumably the batch axis added by return_tensors='pt').
        item = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

def load_genomic_benchmark_data(dataset_name, split='test'):
    """Download a dataset and read every sequence of one split into memory.

    The split directory is expected to contain one sub-directory per class,
    each holding ``*.txt`` files with a single sequence per file. Class
    indices are assigned by the alphabetical order of the sub-directory names.

    Args:
        dataset_name: Name forwarded to ``download_dataset``.
        split: Name of the sub-directory of the dataset to read
            (default ``'test'``).

    Returns:
        A ``(sequences, labels, class_names)`` tuple: the stripped file
        contents, the integer class index of each sequence, and the class
        directory names ordered by class index.

    Raises:
        OSError: If the split directory cannot be listed or a file cannot
            be read.
    """
    dataset_path = download_dataset(dataset_name)
    split_path = Path(dataset_path) / split

    # Sorted so the class-name -> index mapping is stable across runs.
    class_dirs = sorted(d for d in split_path.iterdir() if d.is_dir())
    class_names = [d.name for d in class_dirs]

    sequences = []
    labels = []
    for label_idx, class_dir in enumerate(class_dirs):
        # glob() yields entries in whatever order the filesystem returns them;
        # sorting makes the sample order reproducible between machines.
        for seq_file in sorted(class_dir.glob('*.txt')):
            # Explicit encoding avoids depending on the locale default.
            sequences.append(seq_file.read_text(encoding='utf-8').strip())
            labels.append(label_idx)
    return sequences, labels, class_names

def calculate_metrics(predictions, labels):
    """Compute classification metrics for predicted versus true labels.

    Args:
        predictions: Predicted class labels.
        labels: Ground-truth class labels, aligned with ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``f1_macro``, ``matthews_correlation``,
        ``precision`` and ``recall``. F1, precision and recall are
        macro-averaged and computed with ``zero_division=0``.
    """
    # scikit-learn expects the ground truth first, then the predictions.
    y_true, y_pred = labels, predictions
    macro_kwargs = {"average": "macro", "zero_division": 0}

    scores = {}
    scores["accuracy"] = sklearn.metrics.accuracy_score(y_true, y_pred)
    scores["f1_macro"] = sklearn.metrics.f1_score(y_true, y_pred, **macro_kwargs)
    scores["matthews_correlation"] = sklearn.metrics.matthews_corrcoef(y_true, y_pred)
    scores["precision"] = sklearn.metrics.precision_score(y_true, y_pred, **macro_kwargs)
    scores["recall"] = sklearn.metrics.recall_score(y_true, y_pred, **macro_kwargs)
    return scores

def benchmark_on_cohn_dataset(
    model_path="./omni_dna_promoter_classifier",
    dataset_name="human_enhancers_cohn",
    seed=42,
    batch_size=16,
    max_length=512
):
    """Evaluate a saved sequence classifier on a dataset's test split.

    Loads the tokenizer and model from ``model_path``, runs inference over
    the ``'test'`` split of ``dataset_name`` without gradient tracking, and
    prints and returns the metrics produced by ``calculate_metrics``.

    Args:
        model_path: Location passed to the ``from_pretrained`` loaders.
        dataset_name: Dataset name passed to ``load_genomic_benchmark_data``.
        seed: Value passed to ``set_seed`` before loading anything.
        batch_size: Number of samples per inference batch.
        max_length: Tokenizer padding/truncation length.

    Returns:
        The metrics dict returned by ``calculate_metrics``.
    """
    print(f"Benchmarking: {dataset_name}")
    has_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if has_cuda else "cpu")
    set_seed(seed)

    # The class names are returned by the loader but not needed here.
    eval_sequences, eval_labels, _class_names = load_genomic_benchmark_data(
        dataset_name, split='test'
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, trust_remote_code=True
    )
    model.to(device)
    model.eval()

    eval_loader = DataLoader(
        GenomicDataset(eval_sequences, eval_labels, tokenizer, max_length),
        batch_size=batch_size,
        shuffle=False,
    )

    predicted = []
    expected = []
    batch_fields = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Testing"):
            on_device = {name: batch[name].to(device) for name in batch_fields}

            logits = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
            ).logits
            # The predicted class is the index of the largest logit.
            batch_predictions = torch.argmax(logits, dim=-1)

            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(on_device['labels'].cpu().numpy())

    metrics = calculate_metrics(predicted, expected)

    rule = "=" * 50
    print("\n" + rule)
    for metric_name, score in metrics.items():
        print(f"{metric_name.upper():<25}: {score:.4f}")
    print(rule)

    return metrics

if __name__ == "__main__":
    # Run the benchmark only when the saved model directory exists;
    # otherwise report that it is missing.
    model_dir_missing = not os.path.exists("./omni_dna_promoter_classifier")
    if model_dir_missing:
        print("Model directory not found.")
    else:
        benchmark_on_cohn_dataset()

  from .autonotebook import tqdm as notebook_tqdm


Benchmarking: human_enhancers_cohn




num_labels: 2


Testing: 100%|██████████| 435/435 [00:42<00:00, 10.20it/s]


ACCURACY                 : 0.6228
F1_MACRO                 : 0.6099
MATTHEWS_CORRELATION     : 0.2636
PRECISION                : 0.6415
RECALL                   : 0.6228



