## Libraries and Imports

In [1]:
import os

# Restrict the run to a single GPU. `setdefault` keeps device "6" as the
# fallback but lets a CUDA_VISIBLE_DEVICES value exported by the launcher
# (shell, scheduler, container) take precedence instead of being overwritten.
# This cell stays ahead of the model imports so the variable is in place
# before any GPU library is loaded.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")

In [2]:
import pandas as pd
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from llms.exin_classifier.bert import ExInClassifierBERT
from schemas.train_params import TrainParams

## Params and Files

In [3]:
# Tunable inputs for this run.
pretrained_model_name = "ExInBERTModel"  # folder name used for the fine-tuned output
csv_path = "exin-256.csv"  # dataset file name (bare name, no directory)

seed = 12  # passed to the classifier and to the train/test split

In [4]:
# Expand the bare names from the parameter cell into repository-relative paths.
processed_dir = "./storage/data/processed/"

# csv_path is rebuilt from itself, so guard the prefixing: re-running this
# cell must not prepend the directory a second time.
if not csv_path.startswith(processed_dir):
    csv_path = f"{processed_dir}{csv_path}"

output_path = f"./storage/models/tuned/{pretrained_model_name}"
checkpoint = "./storage/models/base/bert"  # plain literal: nothing to interpolate

## Reading Dataset

In [5]:
# Load the processed CSV. keep_default_na=False switches off pandas' default
# NaN markers, so cell text such as "NA" or an empty field stays a string.
df = pd.read_csv(filepath_or_buffer=csv_path, keep_default_na=False)

## Loading the Model

In [6]:
# Instantiate the classifier wrapper from the local base checkpoint.
# The "newly initialized" classifier.weight / classifier.bias warning printed
# below this cell is emitted at this point, before any fine-tuning has run.
llm = ExInClassifierBERT(seed=seed, checkpoint=checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./storage/models/base/bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data Processing

In [7]:
# One dict per CSV row, keyed by column name.
data = df.to_dict("records")

In [8]:
# Convert every CSV record into a model input. Only "sequence" is mandatory;
# the remaining fields are looked up with .get() and are None when absent.
all_dataset = [
    llm.build_input(
        sequence=row["sequence"],
        target=row.get("target"),
        organism=row.get("organism"),
        gene=row.get("gene"),
        before=row.get("before"),
        after=row.get("after"),
    )
    for row in tqdm(data)
]

# Hold out 5% of the examples for evaluation; the seed fixes the shuffle.
train_dataset, test_dataset = train_test_split(
    all_dataset, test_size=0.05, shuffle=True, random_state=seed
)

100%|██████████| 6600498/6600498 [00:04<00:00, 1519350.55it/s]


In [9]:
# Report how many examples ended up in each split.
for caption, subset in (
    ("Train Dataset Len:", train_dataset),
    ("Test Dataset Len:", test_dataset),
):
    print(caption, len(subset))

Train Dataset Len: 6270473
Test Dataset Len: 330025


In [10]:
# Hyperparameters for the fine-tuning run: one epoch over the training split,
# with the training loss reported every 500 steps.
train_params = TrainParams(
    epochs=1,
    batch_size=128,
    gradient_accumulation=1,
    lr=1e-5,
    logging_steps=500,
)

llm.train(dataset=train_dataset, params=train_params)

2025-10-14 14:07:02,048 - INFO - [32mPreparing dataset...[0m
100%|██████████| 6270473/6270473 [38:43<00:00, 2699.29it/s]  
2025-10-14 14:49:58,445 - INFO - [32mDataset prepared![0m
2025-10-14 14:50:07,411 - INFO - [32mStarting training...[0m


Step,Training Loss
500,0.1731
1000,0.0368
1500,0.0239
2000,0.0186
2500,0.0141
3000,0.0103
3500,0.0087
4000,0.0079
4500,0.0057
5000,0.0069


2025-10-14 22:26:47,369 - INFO - [32mTraining complete. You may save the model for later usage.[0m


In [11]:
# Write the fine-tuned model to output_path so it can be reused later.
llm.save_pretrained(output_path)

2025-10-14 22:26:47,575 - INFO - [32mAttempting to save model at './storage/models/tuned/ExInBERTModel'[0m
2025-10-14 22:26:51,772 - INFO - [32mSuccessfully saved at './storage/models/tuned/ExInBERTModel'[0m


In [12]:
# Run the fine-tuned model over the held-out split one example at a time,
# collecting each prediction alongside its reference label.
#
# The loop variable is deliberately not named `data`: that name holds the
# list of CSV records built earlier, and overwriting it with a single example
# would break a re-run of the dataset-building cell.
y_true = []
y_pred = []

for example in tqdm(test_dataset):
    y_pred.append(llm.generate(example))
    y_true.append(example["target"])

100%|██████████| 330025/330025 [37:29<00:00, 146.74it/s]


In [13]:
# Per-class precision / recall / F1, treating each label in turn as the
# positive class, followed by overall accuracy. The two classes share one
# loop so the report layout cannot drift between them.
for label in ("INTRON", "EXON"):
    print(f"{label} class:")
    print("  Precision :", precision_score(y_true, y_pred, pos_label=label))
    print("  Recall    :", recall_score(y_true, y_pred, pos_label=label))
    print("  F1        :", f1_score(y_true, y_pred, pos_label=label))
    print()

print("  Accuracy  :", accuracy_score(y_true, y_pred))

INTRON class:
  Precision : 0.9994317705470839
  Recall    : 0.9994783224339985
  F1        : 0.9994550459484762

EXON class:
  Precision : 0.9997485112765747
  Recall    : 0.9997260630773446
  F1        : 0.9997372870509461

  Accuracy  : 0.999645481402924
