In [1]:
import os
import torch
import numpy as np
import sklearn.metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed, AutoConfig
from torch.utils.data import Dataset, DataLoader
from genomic_benchmarks.data_check import list_datasets
from genomic_benchmarks.loc2seq import download_dataset
from pathlib import Path
from tqdm import tqdm

class GenomicDataset(Dataset):
    """Map-style dataset that tokenizes one sequence per access.

    Every item is a dict with the keys ``input_ids``, ``attention_mask``
    and ``labels``.
    """

    def __init__(self, sequences, labels, tokenizer, max_length=512):
        """Keep references to the raw data and the tokenization settings.

        Args:
            sequences: Indexable collection of sequences to tokenize.
            labels: Indexable collection of labels, parallel to ``sequences``.
            tokenizer: Callable invoked as ``tokenizer(sequence, max_length=...,
                padding='max_length', truncation=True, return_tensors='pt')``.
            max_length: Length forwarded to the tokenizer for padding and
                truncation.
        """
        self.sequences, self.labels = sequences, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Tokenize the sequence at ``idx`` and pair it with its label."""
        raw_sequence, raw_label = self.sequences[idx], self.labels[idx]

        tokenized = self.tokenizer(
            raw_sequence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) drops the leading dimension of each tokenizer output
        # (assumed to be a batch dimension of size 1 -- confirm for the
        # tokenizer in use) so the DataLoader can stack items itself.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(raw_label, dtype=torch.long)
        return item

def load_genomic_benchmark_data(dataset_name, split='train'):
    """Load the sequences and integer labels of one dataset split.

    The dataset location is obtained from ``download_dataset(dataset_name)``.
    Inside ``<dataset_path>/<split>/`` every sub-directory is treated as one
    class; the directories are sorted and numbered from 0, and every ``*.txt``
    file inside a class directory contributes one sequence (its content with
    surrounding whitespace stripped).

    Args:
        dataset_name: Name passed to ``download_dataset``.
        split: Name of the split sub-directory to read (default ``'train'``).

    Returns:
        A ``(sequences, labels)`` tuple of parallel lists: the sequence
        strings and the index of the class directory each one came from.

    Raises:
        FileNotFoundError: If the split directory does not exist.
    """
    dataset_path = download_dataset(dataset_name)
    split_path = Path(dataset_path) / split
    class_dirs = sorted(d for d in split_path.iterdir() if d.is_dir())

    sequences = []
    labels = []
    for label_idx, class_dir in enumerate(class_dirs):
        # glob() yields entries in arbitrary filesystem order; sorting makes
        # the sample order identical across runs and machines.
        for seq_file in sorted(class_dir.glob('*.txt')):
            # Explicit encoding so reading does not depend on the locale.
            sequences.append(seq_file.read_text(encoding='utf-8').strip())
            labels.append(label_idx)

    return sequences, labels

def calculate_metrics(predictions, labels):
    """Score predictions against reference labels with scikit-learn.

    Args:
        predictions: Predicted class labels.
        labels: Reference class labels, parallel to ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``matthews_correlation``,
        ``precision`` and ``recall``. F1, precision and recall are
        macro-averaged and computed with ``zero_division=0``.
    """
    # Options shared by the three per-class metrics.
    macro_opts = {"average": "macro", "zero_division": 0}

    scores = {}
    scores["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions)
    scores["f1"] = sklearn.metrics.f1_score(labels, predictions, **macro_opts)
    scores["matthews_correlation"] = sklearn.metrics.matthews_corrcoef(labels, predictions)
    scores["precision"] = sklearn.metrics.precision_score(labels, predictions, **macro_opts)
    scores["recall"] = sklearn.metrics.recall_score(labels, predictions, **macro_opts)
    return scores

def evaluate_base_model(
    dataset_name="human_nontata_promoters",
    model_name="zehui127/Omni-DNA-116M",
    seed=42,
    batch_size=32,
    max_length=128
):
    """Evaluate a pretrained checkpoint on a dataset's test split without training.

    Loads the ``test`` split, builds a sequence-classification model from
    ``model_name`` with one output per class found in the test labels, runs
    inference over the whole split and prints and returns the metrics.

    NOTE: the model is loaded with ``ignore_mismatched_sizes=True``. When the
    checkpoint carries no matching classification head, transformers
    initialises one from scratch (it warns about this at load time), so the
    reported numbers are then a baseline for an untrained classifier rather
    than a measure of what the backbone has learned.

    Args:
        dataset_name: Genomic Benchmarks dataset to evaluate on.
        model_name: Model identifier passed to the ``from_pretrained`` calls.
        seed: Seed passed to ``set_seed``.
        batch_size: Batch size of the evaluation ``DataLoader``.
        max_length: Tokenizer padding/truncation length.

    Returns:
        The metrics dict produced by ``calculate_metrics``.

    Raises:
        ValueError: If the test split contains no sequences.
    """
    print(f"Evaluating Base Omni-DNA Model on {dataset_name}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    set_seed(seed)

    print("Loading dataset...")
    test_sequences, test_labels = load_genomic_benchmark_data(dataset_name, split='test')
    if not test_sequences:
        # Fail early: with no samples the model would be built with zero
        # labels and the metrics would be computed on empty inputs.
        raise ValueError(f"No test sequences found for dataset '{dataset_name}'")

    num_classes = len(set(test_labels))
    print(f"Number of classes: {num_classes}")
    print(f"Test samples: {len(test_sequences)}")

    print("Loading tokenizer and base model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.model_max_length = max_length

    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    config.num_labels = num_classes

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        trust_remote_code=True,
        ignore_mismatched_sizes=True
    )
    model.to(device)
    model.eval()

    print("Preparing test dataset...")
    test_dataset = GenomicDataset(test_sequences, test_labels, tokenizer, max_length)
    # shuffle=False keeps predictions aligned with the loaded sample order.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print("Evaluating base model (no training)...")
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.cpu().tolist())
            # Labels are only needed for scoring on the host, so they are
            # never copied to the device.
            all_labels.extend(batch['labels'].tolist())

    metrics = calculate_metrics(all_predictions, all_labels)

    print("\n" + "="*50)
    print("BASE MODEL RESULTS (NO TRAINING)")
    print("="*50)
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print("="*50)

    return metrics

if __name__ == "__main__":
    # List every dataset the genomic_benchmarks package reports, numbered from 1.
    print("Available Genomic Benchmark datasets:")
    datasets = list_datasets()
    for position, dataset in enumerate(datasets, start=1):
        print(f"{position}. {dataset}")

    # Banner: blank line, rule, title, rule, blank line.
    print("\n".join(["", "=" * 50, "Base Model Evaluation (Zero-Shot)", "=" * 50, ""]))

    # Run the evaluation; every argument is spelled out explicitly.
    metrics = evaluate_base_model(
        dataset_name="human_nontata_promoters",
        model_name="zehui127/Omni-DNA-116M",
        seed=42,
        batch_size=32,
        max_length=128,
    )

    print("\nEvaluation complete!")

  from .autonotebook import tqdm as notebook_tqdm


Available Genomic Benchmark datasets:
1. human_ensembl_regulatory
2. human_enhancers_cohn
3. demo_coding_vs_intergenomic_seqs
4. demo_human_or_worm
5. human_enhancers_ensembl
6. drosophila_enhancers_stark
7. human_ocr_ensembl
8. human_nontata_promoters
9. dummy_mouse_enhancers_ensembl

Base Model Evaluation (Zero-Shot)

Evaluating Base Omni-DNA Model on human_nontata_promoters
Using device: cuda
GPU: NVIDIA GeForce RTX 4090
Loading dataset...




Number of classes: 2
Test samples: 9034
Loading tokenizer and base model...


Some weights of OLMoForSequenceCLS were not initialized from the model checkpoint at zehui127/Omni-DNA-116M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


num_labels: 2
Preparing test dataset...
Evaluating base model (no training)...


Evaluating: 100%|██████████| 283/283 [00:11<00:00, 24.19it/s]


BASE MODEL RESULTS (NO TRAINING)
accuracy: 0.4567
f1: 0.3153
matthews_correlation: 0.0172
precision: 0.6127
recall: 0.5007

Evaluation complete!



