<a href="https://colab.research.google.com/github/Navya003/genomicBERT/blob/main/model_species.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive: later cells read the dataset/model archives and the
# tokenizer vocabulary from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets optuna

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/

In [None]:
import zipfile

# Archives stored on the mounted Drive and the directory they are unpacked into.
dataset_zip = '/content/drive/MyDrive/genomic_bert/final_ds_species.zip'
model_zip = '/content/drive/MyDrive/genomic_bert/genome_bert_base.zip'
output_dir = '/content/'

# Unpack the dataset first, then the pretrained model, both into output_dir.
for archive_path in (dataset_zip, model_zip):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(output_dir)

# Tokenizer vocabulary file; it is read straight from Drive, not unpacked.
tokenizer_path = '/content/drive/MyDrive/genomic_bert/4k_vocab_dna.json'

In [None]:
# === OPTUNA HYPERPARAMETER TUNING (Optimized) ===
import optuna
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerFast
)
from datasets import load_from_disk
from sklearn.metrics import f1_score
import numpy as np
import os

# Disable WandB through its environment variable before any Trainer is created
# (the TrainingArguments in objective() additionally pass report_to=[]).
os.environ["WANDB_DISABLED"] = "true"

# === Load and preprocess data only ONCE ===
def preprocess_data(
    n_train=500,
    n_eval=300,
    max_length=40,
    dataset_dir="/content/final_ds",
    tokenizer_file="/content/drive/MyDrive/genomic_bert/4k_vocab_dna.json",
):
    """Build the small tokenized train/eval subsets used for hyperparameter search.

    Both subsets are carved out of the ``train`` split of the dataset saved at
    ``dataset_dir``: rows ``[0, n_train)`` become the training subset and rows
    ``[n_train, n_train + n_eval)`` become the evaluation subset. With the
    defaults this reproduces the original hard-coded slices (rows 0-499 and
    500-799).

    Args:
        n_train: Number of leading ``train`` rows used for training.
        n_eval: Number of rows immediately after those used for evaluation.
        max_length: Token length every example is truncated/padded to.
        dataset_dir: Directory passed to ``load_from_disk``.
        tokenizer_file: Tokenizer JSON passed to ``PreTrainedTokenizerFast``.

    Returns:
        ``(train_dataset, eval_dataset)``, each formatted as torch tensors and
        exposing the columns ``input_ids``, ``attention_mask`` and ``label``.

    Raises:
        ValueError: If the ``train`` split has fewer than ``n_train + n_eval`` rows.
    """
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        special_tokens=["<s>", "</s>", "<unk>", "<pad>", "<mask>"],
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>"
    )

    train_split = load_from_disk(dataset_dir)['train']

    # Fail early with a clear message instead of an index error from select().
    rows_needed = n_train + n_eval
    if rows_needed > len(train_split):
        raise ValueError(
            f"train split has {len(train_split)} rows, but n_train + n_eval = {rows_needed}"
        )

    train_dataset = train_split.select(range(n_train))
    eval_dataset = train_split.select(range(n_train, rows_needed))

    def preprocess(examples):
        # Fixed-length padding so every batch has the same shape.
        tokenized = tokenizer(
            examples["text"], truncation=True, padding='max_length', max_length=max_length
        )
        tokenized["label"] = examples["label"]
        return tokenized

    tensor_columns = ['input_ids', 'attention_mask', 'label']

    train_dataset = train_dataset.map(preprocess, batched=True)
    eval_dataset = eval_dataset.map(preprocess, batched=True)

    train_dataset.set_format(type='torch', columns=tensor_columns)
    eval_dataset.set_format(type='torch', columns=tensor_columns)
    return train_dataset, eval_dataset

# Tokenize once at module scope; objective() reads these two globals on every trial.
train_dataset, eval_dataset = preprocess_data()

# === Metrics Function ===
def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from an evaluation ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element iterable holding the logits array and the
            reference labels.

    Returns:
        A dict with a single key ``"f1"`` holding the macro F1 score.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=1)
    macro_f1 = f1_score(references, predicted, average="macro")
    return {"f1": macro_f1}

# === Objective Function ===
def objective(trial):
    """Optuna objective: fine-tune one model with sampled hyperparameters.

    A fresh model is loaded for every trial, trained on the module-level
    ``train_dataset`` and scored on the module-level ``eval_dataset``.

    Sampled hyperparameters:
        batch_size: one of 4, 8, 16 (used for both training and evaluation).
        epochs: integer in [2, 4].
        lr: float in [1e-5, 5e-5], sampled on a log scale.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The macro F1 (``eval_f1``) reported by ``trainer.evaluate()``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        "/content/genome_bert_base"  # Change to smaller model if needed
    )

    # Sample the batch size once and reuse it, rather than asking the trial
    # for the same parameter name twice.
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])

    training_args = TrainingArguments(
        output_dir=f"./optuna_results/trial_{trial.number}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=trial.suggest_int("epochs", 2, 4),
        learning_rate=trial.suggest_float("lr", 1e-5, 5e-5, log=True),
        logging_dir="./logs",
        logging_steps=10,
        disable_tqdm=False,
        # Tuning trials are throwaway runs: do not write checkpoints to disk.
        save_strategy="no",
        report_to=[]  # disables wandb, tensorboard, etc.
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_f1"]

# === Run Optuna Study ===
def get_best_hyperparams():
    """Run the Optuna search and return the best trial's parameters.

    The study maximizes the value returned by ``objective`` and stops after
    10 trials or 3600 seconds, whichever limit is reached first.

    Returns:
        The parameter dict of the best trial (also printed).
    """
    search = optuna.create_study(direction="maximize")
    search.optimize(objective, n_trials=10, timeout=3600)  # max 1 hour
    winning_params = search.best_trial.params
    print("Best trial params:", winning_params)
    return winning_params

# === Execute ===
# Runs the whole study when the cell executes; the winning parameter dict is
# kept in best_params.
best_params = get_best_hyperparams()

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

[I 2025-05-09 07:31:15,476] A new study created in memory with name: no-name-f8add650-244f-40b6-bd2e-72fcab7e94b0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encode

Step,Training Loss
10,0.5428
20,0.5554
30,0.5816
40,0.4191
50,0.4073
60,0.3277
70,0.2488
80,0.4484
90,0.2421
100,0.2903


[I 2025-05-09 07:31:34,627] Trial 0 finished with value: 0.8152186579214595 and parameters: {'batch_size': 8, 'epochs': 2, 'lr': 2.1548711095947094e-05}. Best is trial 0 with value: 0.8152186579214595.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.

Step,Training Loss
10,0.6589
20,0.4201
30,0.4721
40,0.7026
50,0.6818
60,0.4277
70,0.4608
80,0.3212
90,0.6871
100,0.2473


[I 2025-05-09 07:32:06,809] Trial 1 finished with value: 0.8038079934228965 and parameters: {'batch_size': 4, 'epochs': 3, 'lr': 1.2799664745528377e-05}. Best is trial 0 with value: 0.8152186579214595.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.

Step,Training Loss
10,0.4923
20,0.4677
30,0.4006
40,0.3379
50,0.2555
60,0.3291


[I 2025-05-09 07:32:36,507] Trial 2 finished with value: 0.7603229333109074 and parameters: {'batch_size': 16, 'epochs': 2, 'lr': 1.1705014105740509e-05}. Best is trial 0 with value: 0.8152186579214595.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1

Step,Training Loss
10,0.5552
20,0.4464
30,0.3723
40,0.3001
50,0.3168
60,0.3123
70,0.1837
80,0.2316
90,0.2018


[I 2025-05-09 07:33:19,582] Trial 3 finished with value: 0.8392605987542696 and parameters: {'batch_size': 16, 'epochs': 3, 'lr': 2.674012489939279e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.

Step,Training Loss
10,0.4897
20,0.4574
30,0.3959
40,0.3466
50,0.2421
60,0.306
70,0.2172
80,0.335
90,0.2326


[I 2025-05-09 07:33:46,412] Trial 4 finished with value: 0.8005829114894922 and parameters: {'batch_size': 16, 'epochs': 3, 'lr': 1.399643862542267e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.

Step,Training Loss
10,0.5753
20,0.4619
30,0.3999
40,0.3193
50,0.2896
60,0.2763
70,0.1793
80,0.239
90,0.1981
100,0.2735


[I 2025-05-09 07:34:25,103] Trial 5 finished with value: 0.8271889400921659 and parameters: {'batch_size': 16, 'epochs': 4, 'lr': 3.23288703395671e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.a

Step,Training Loss
10,0.7731
20,0.511
30,0.5233
40,0.6249
50,0.6915
60,0.4942
70,0.5811
80,0.6194
90,0.6277
100,0.3501


[I 2025-05-09 07:35:24,826] Trial 6 finished with value: 0.8117436127297176 and parameters: {'batch_size': 4, 'epochs': 2, 'lr': 4.447840471382692e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.a

Step,Training Loss
10,0.5442
20,0.4772
30,0.4989
40,0.418
50,0.3988
60,0.3291
70,0.2544
80,0.4869
90,0.2344
100,0.2862


[I 2025-05-09 07:35:50,172] Trial 7 finished with value: 0.8225011205737338 and parameters: {'batch_size': 8, 'epochs': 3, 'lr': 1.2006177130527593e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.

Step,Training Loss
10,0.4928
20,0.4493
30,0.3883
40,0.341
50,0.2403
60,0.3047


[I 2025-05-09 07:36:04,706] Trial 8 finished with value: 0.7916625329867657 and parameters: {'batch_size': 16, 'epochs': 2, 'lr': 1.5583157753623068e-05}. Best is trial 3 with value: 0.8392605987542696.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1

Step,Training Loss
10,0.4808
20,0.4415
30,0.3967
40,0.3386
50,0.2534
60,0.3114


[I 2025-05-09 07:36:58,807] Trial 9 finished with value: 0.772499026459798 and parameters: {'batch_size': 16, 'epochs': 2, 'lr': 1.2450710780134038e-05}. Best is trial 3 with value: 0.8392605987542696.


Best trial params: {'batch_size': 16, 'epochs': 3, 'lr': 2.674012489939279e-05}


In [None]:
import os
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from sklearn.metrics import matthews_corrcoef

# Load Dataset
from datasets import load_from_disk

dataset = load_from_disk("/content/final_ds")
train_df = pd.DataFrame(dataset['train'])
print("Train dataset shape:", train_df.shape)
test_df = pd.DataFrame(dataset['test'])
print("Test dataset shape:", test_df.shape)

# Cross-validate on the train rows from position 800 onward; rows 0-799 are the
# ones the hyperparameter-search cell slices out with its default subset sizes.
df = train_df.iloc[800:].reset_index(drop=True)
df['seq_length'] = df['text'].map(len)
print(f"• Max sequence length: {df['seq_length'].max()}")
print(f"• Min sequence length: {df['seq_length'].min()}")

# Tokenizer setup
tokenizer_path = '/content/drive/MyDrive/genomic_bert/4k_vocab_dna.json'
special_token_map = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    sep_token="<sep>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tokenizer_path,
    special_tokens=["<s>", "</s>", "<unk>", "<pad>", "<mask>"],
    **special_token_map,
)

# Model path and compute device (GPU when one is available).
model_path = "/content/genome_bert_base"
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Parameters
# batch_size / num_epochs / learning_rate are copied from the best Optuna trial
# printed by the tuning cell; read them from best_params["batch_size"],
# best_params["epochs"] and best_params["lr"] instead to follow a fresh study.
num_labels = df["label"].nunique()
num_folds, batch_size, num_epochs = 5, 16, 3
learning_rate = 2.674012489939279e-05
max_seq_length = 256

# Dataset Class
class TFBS_Dataset(Dataset):
    """Map-style dataset that tokenizes all texts up front and serves tensors.

    Args:
        texts: Sequence of input strings handed to ``tokenizer`` in one call.
        labels: Sequence of labels, index-aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, max_length=...)``
            and returning a mapping of field name -> per-example sequences.
        max_length: Truncation length forwarded to ``tokenizer``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # The whole list is encoded in a single tokenizer call.
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Cross-validation setup
# Stratified, shuffled folds with a fixed seed; one metric list per score.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
fold_accuracies = []
fold_f1s = []
fold_mccs = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df['text'], df['label'])):
    print(f"\n=== Fold {fold + 1} ===")

    # Split data
    train_texts = df['text'].iloc[train_idx].tolist()
    train_labels = df['label'].iloc[train_idx].tolist()
    val_texts = df['text'].iloc[val_idx].tolist()
    val_labels = df['label'].iloc[val_idx].tolist()

    # Prepare datasets
    train_dataset = TFBS_Dataset(train_texts, train_labels, tokenizer, max_seq_length)
    val_dataset = TFBS_Dataset(val_texts, val_labels, tokenizer, max_seq_length)

    # Loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Load model: reloaded from model_path every fold so no weights carry over.
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}")
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average="macro")
    # Fix: mcc was printed and appended below without ever being computed,
    # which raised NameError at the end of the first fold.
    mcc = matthews_corrcoef(true_labels, preds)
    print(f"Fold {fold + 1} Accuracy: {acc:.4f}, F1 Score: {f1:.4f}, MCC: {mcc:.4f}")
    fold_accuracies.append(acc)
    fold_f1s.append(f1)
    fold_mccs.append(mcc)

# Summary
print("\n=== Cross-validation Results ===")
print(f"Average Accuracy: {sum(fold_accuracies)/num_folds:.4f}")
print(f"Average F1 Score: {sum(fold_f1s)/num_folds:.4f}")
print(f"Average MCC: {sum(fold_mccs)/num_folds:.4f}")

Train dataset shape: (11386, 2)
Test dataset shape: (1266, 2)
• Max sequence length: 9999
• Min sequence length: 200

=== Fold 1 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 2


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch 3


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Fold 1 Accuracy: 0.9721, F1 Score: 0.9621

=== Fold 2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 2


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch 3


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Fold 2 Accuracy: 0.9580, F1 Score: 0.9374

=== Fold 3 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 2


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch 3


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Fold 3 Accuracy: 0.9778, F1 Score: 0.9686

=== Fold 4 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 2


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 3


100%|██████████| 530/530 [05:36<00:00,  1.58it/s]


Fold 4 Accuracy: 0.9839, F1 Score: 0.9773

=== Fold 5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 530/530 [05:37<00:00,  1.57it/s]


Epoch 2


100%|██████████| 530/530 [05:36<00:00,  1.57it/s]


Epoch 3


100%|██████████| 530/530 [05:36<00:00,  1.58it/s]


Fold 5 Accuracy: 0.9882, F1 Score: 0.9834

=== Cross-validation Results ===
Average Accuracy: 0.9760
Average F1 Score: 0.9657


In [None]:
import os
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Load Dataset
from datasets import load_from_disk

dataset = load_from_disk("/content/final_ds")
train_df = pd.DataFrame(dataset['train'])
print("Train dataset shape:", train_df.shape)
test_df = pd.DataFrame(dataset['test'])
print("Test dataset shape:", test_df.shape)

# Combine all for CV: train and test rows are pooled under a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)
df['seq_length'] = df['text'].map(len)
print(f"• Max sequence length: {df['seq_length'].max()}")
print(f"• Min sequence length: {df['seq_length'].min()}")

# Keep a reproducible 80% random sample of the pooled rows for cross-validation
# (the length statistics above were printed for the full pool).
df = df.sample(frac=0.8, random_state=42).reset_index(drop=True)

# Tokenizer setup
tokenizer_path = '/content/drive/MyDrive/genomic_bert/4k_vocab_dna.json'
special_token_map = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    sep_token="<sep>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tokenizer_path,
    special_tokens=["<s>", "</s>", "<unk>", "<pad>", "<mask>"],
    **special_token_map,
)

# Model path and compute device (GPU when one is available).
model_path = "/content/genome_bert_base"
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Parameters (num_labels is counted on the sampled frame).
num_labels = df["label"].nunique()
num_folds, batch_size, num_epochs = 5, 4, 2
learning_rate = 2e-5
max_seq_length = 256

# Dataset Class
class TFBS_Dataset(Dataset):
    """Map-style dataset that tokenizes all texts once and serves tensors per index.

    Args:
        texts: sequences handed to ``tokenizer`` in a single call.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, max_length=...)``;
            its result must offer ``.items()`` yielding per-field sequences
            that can be indexed by sample position.
        max_length: forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Tokenize everything up front; __getitem__ only indexes into the result.
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        self.encodings = encoded
        self.labels = labels

    def __len__(self):
        """Return the number of labels, which is taken as the dataset size."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        The dict holds one tensor per tokenizer output field plus the sample's
        label under the key ``"labels"``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Cross-validation setup
# One seed drives both the fold assignment and (offset by the fold index) the
# torch RNG, so weight initialisation and batch shuffling are repeatable.
cv_seed = 42
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=cv_seed)
fold_accuracies = []
fold_f1s = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df['text'], df['label'])):
    print(f"\n=== Fold {fold + 1} ===")

    # Drop the references to the previous fold's model and optimizer before a
    # new model is created; otherwise the old parameters and AdamW state stay
    # alive (the optimizer references them) while the new model is moved to
    # the device. The last fold's `model` is intentionally left defined after
    # the loop.
    model = None
    optimizer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Seed torch's global RNG per fold: it is used for newly initialised
    # weights and for the shuffling order of the training DataLoader.
    torch.manual_seed(cv_seed + fold)

    # Split data
    train_texts = df['text'].iloc[train_idx].tolist()
    train_labels = df['label'].iloc[train_idx].tolist()
    val_texts = df['text'].iloc[val_idx].tolist()
    val_labels = df['label'].iloc[val_idx].tolist()

    # Prepare datasets (each one tokenizes its texts once, at construction)
    train_dataset = TFBS_Dataset(train_texts, train_labels, tokenizer, max_seq_length)
    val_dataset = TFBS_Dataset(val_texts, val_labels, tokenizer, max_seq_length)

    # Loaders: shuffle only the training data
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Load a fresh model from the checkpoint for every fold so that no weights
    # carry over between folds.
    # NOTE(review): the recorded output of this cell warns that many
    # `bert.encoder.*` weights were "newly initialized" rather than loaded —
    # confirm the checkpoint matches the architecture instantiated here.
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}")
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # The loss is computed here from the logits; labels are not passed
            # to the model.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

    # Evaluation on the held-out fold, without gradient tracking
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per sample
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average="macro")

    print(f"Fold {fold + 1} Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    fold_accuracies.append(acc)
    fold_f1s.append(f1)

# Summary: unweighted mean over the folds that were actually recorded
print("\n=== Cross-validation Results ===")
print(f"Average Accuracy: {sum(fold_accuracies)/len(fold_accuracies):.4f}")
print(f"Average F1 Score: {sum(fold_f1s)/len(fold_f1s):.4f}")

Train dataset shape: (11386, 2)
Test dataset shape: (1266, 2)
• Max sequence length: 9999
• Min sequence length: 200

=== Fold 1 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 2025/2025 [06:38<00:00,  5.08it/s]


Epoch 2


100%|██████████| 2025/2025 [06:45<00:00,  5.00it/s]


Fold 1 Accuracy: 0.9417, F1 Score: 0.9234

=== Fold 2 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 2025/2025 [06:46<00:00,  4.99it/s]


Epoch 2


100%|██████████| 2025/2025 [06:45<00:00,  5.00it/s]


Fold 2 Accuracy: 0.9521, F1 Score: 0.9267

=== Fold 3 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 2025/2025 [06:46<00:00,  4.98it/s]


Epoch 2


100%|██████████| 2025/2025 [06:46<00:00,  4.98it/s]


Fold 3 Accuracy: 0.9644, F1 Score: 0.9476

=== Fold 4 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 2025/2025 [06:46<00:00,  4.98it/s]


Epoch 2


100%|██████████| 2025/2025 [06:45<00:00,  4.99it/s]


Fold 4 Accuracy: 0.9644, F1 Score: 0.9473

=== Fold 5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/genome_bert_base and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.at

Epoch 1


100%|██████████| 2025/2025 [06:46<00:00,  4.98it/s]


Epoch 2


100%|██████████| 2025/2025 [06:45<00:00,  4.99it/s]


Fold 5 Accuracy: 0.9783, F1 Score: 0.9689

=== Cross-validation Results ===
Average Accuracy: 0.9602
Average F1 Score: 0.9428
