In [1]:
import os

# Expose only GPUs 1-7 to this process (device 0 is not listed).
# NOTE(review): presumably this has to run before any CUDA-using library is
# imported, which is why this cell precedes the other imports - confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1,2,3,4,5,6,7"})

## Libraries and Imports

In [2]:
import pandas as pd

from llms.nucl_classifier.bert import NuclBERT
from schemas.train_params import TrainParams
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Params and Files

In [3]:
# Run configuration: the values a reader is most likely to change.
pretrained_model_name = "NuclBERT"  # used as the folder name of the tuned model
csv_path = "nucl-500.csv"  # file name only; the directory is added in the next cell
seed = 12  # shared by the model constructor and the train/test split

In [4]:
# Resolve the configured names into concrete locations under ./storage.
# The prefix check keeps this cell idempotent: re-running it without first
# re-running the configuration cell must not prepend the data directory twice.
processed_dir = "./storage/data/processed/"
if not csv_path.startswith(processed_dir):
  csv_path = f"{processed_dir}{csv_path}"
output_path = f"./storage/models/tuned/{pretrained_model_name}"  # where the fine-tuned model is saved
checkpoint = "storage/models/base/bert"  # base checkpoint handed to the model constructor

## Reading Dataset

In [5]:
# Load the processed dataset. keep_default_na=False switches off pandas'
# default NaN markers, so cell values such as "NA" or "" are kept as text.
df = pd.read_csv(
  filepath_or_buffer=csv_path,
  keep_default_na=False,
)

## Loading the Model

In [6]:
# Build the classifier wrapper from the base checkpoint, passing the shared seed.
llm = NuclBERT(seed=seed, checkpoint=checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at storage/models/base/bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data Processing

In [7]:
# One plain dict per DataFrame row, keyed by column name.
data = df.to_dict("records")

In [8]:
# Turn every CSV record into a model input. "organism" is read with .get(), so
# it falls back to None when the record has no such key.
all_dataset = [
  llm.build_input(
    sequence=row["sequence"],
    target=row["target"],
    organism=row.get("organism"),
  )
  for row in tqdm(data)
]

# Hold out 5% of the examples for evaluation; the shared seed fixes the shuffle.
train_dataset, test_dataset = train_test_split(
  all_dataset, test_size=0.05, shuffle=True, random_state=seed
)

100%|██████████| 57687/57687 [00:00<00:00, 2312388.92it/s]


In [9]:
# Report the split sizes before the training cell is run.
n_train = len(train_dataset)
n_test = len(test_dataset)
print("Train Dataset Len:", n_train)
print("Test Dataset Len:", n_test)

Train Dataset Len: 54802
Test Dataset Len: 2885


In [10]:
# Fine-tuning hyperparameters, gathered in one object before the run starts.
train_params = TrainParams(
  epochs=3,
  batch_size=64,
  gradient_accumulation=1,
  lr=5e-6,
  logging_steps=5000,
)

# Fine-tune on the training split only; the test split is kept for evaluation.
llm.train(dataset=train_dataset, params=train_params)

2025-10-21 10:43:50,769 - INFO - [32mPreparing dataset...[0m
100%|██████████| 54802/54802 [1:26:19<00:00, 10.58it/s]   
2025-10-21 12:26:42,079 - INFO - [32mDataset prepared![0m
2025-10-21 12:26:51,547 - INFO - [32mStarting training...[0m


Step,Training Loss
5000,0.6045
10000,0.522
15000,0.4906
20000,0.471
25000,0.4558
30000,0.443
35000,0.4342
40000,0.4255
45000,0.4164
50000,0.4084


2025-10-21 17:32:04,975 - INFO - [32mTraining complete. You may save the model for later usage.[0m


In [11]:
# Persist the fine-tuned model under output_path for later usage, so the
# training run above does not have to be repeated.
llm.save_pretrained(output_path)

2025-10-21 17:32:05,162 - INFO - [32mAttempting to save model at './storage/models/tuned/NuclBERT'[0m
2025-10-21 17:32:09,426 - INFO - [32mSuccessfully saved at './storage/models/tuned/NuclBERT'[0m


In [12]:
# Run the model on each held-out example and collect, in matching order, the
# predicted labelling and the reference labelling.
refs = []
preds = []

# The loop variable is deliberately not called `data`: that name holds the full
# list of CSV records, and rebinding it here would break any later re-run of
# the dataset-building cell.
for test_example in tqdm(test_dataset):
  preds.append(llm.generate(test_example))
  refs.append(test_example["target"])

100%|██████████| 2885/2885 [1:45:58<00:00,  2.20s/it]


In [13]:
# Flatten the per-example label sequences into two position-aligned lists so
# that every metric below is computed per position.
all_refs = []
all_preds = []
mismatched_pairs = 0
dropped_positions = 0
for ref, pred in zip(refs, preds):
  min_len = min(len(ref), len(pred))
  # Only the overlapping prefix can be compared. Track what gets cut off so the
  # truncation is visible instead of silently shrinking the evaluated set.
  if len(ref) != len(pred):
    mismatched_pairs += 1
    dropped_positions += max(len(ref), len(pred)) - min_len
  all_refs.extend(ref[:min_len])
  all_preds.extend(pred[:min_len])

if mismatched_pairs:
  print(
    f"Warning: {mismatched_pairs} of {len(refs)} prediction/reference pairs differ in length; "
    f"{dropped_positions} trailing positions were excluded from the metrics."
  )

acc = accuracy_score(all_refs, all_preds)

print("Accuracy: ", acc)

labels = ["I", "E", "U"]
# average=None returns one score per entry of `labels`, in that order;
# zero_division=0 reports 0 instead of warning when a score is undefined.
precision, recall, f1, support = precision_recall_fscore_support(
  all_refs, all_preds, labels=labels, average=None, zero_division=0
)

for label, label_precision, label_recall, label_f1 in zip(labels, precision, recall, f1):
  print(f"Class: '{label}'")
  print(f"  - Precision: {label_precision:.4f}")
  print(f"  - Recall:   {label_recall:.4f}")
  print(f"  - F1-Score: {label_f1:.4f}\n")

Accuracy:  0.8183934384838736
Class: 'I'
  - Precision: 0.6455
  - Recall:   0.7164
  - F1-Score: 0.6791

Class: 'E'
  - Precision: 0.8236
  - Recall:   0.9214
  - F1-Score: 0.8698

Class: 'U'
  - Precision: 0.8522
  - Recall:   0.6974
  - F1-Score: 0.7671

