In [None]:
import pandas as pd

from llms.exin_classifier.gpt import ExInClassifierGPT
from schemas.train_params import TrainParams
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Notebook-wide random seed: handed to the classifier constructor and used as
# `random_state` for the train/test split so both draw from the same seed.
seed: int = 42

In [None]:
# Load the raw examples. With keep_default_na=False pandas does not convert
# its default NA markers (e.g. "NA", "nan", empty fields) into NaN, so every
# cell is kept as the literal text found in the file.
df = pd.read_csv(
    "ExIn-GPT.csv",
    keep_default_na=False,
)

In [None]:
# Build the project's ExInClassifierGPT wrapper from the "gpt2" checkpoint,
# reusing the notebook-wide seed.
llm = ExInClassifierGPT(checkpoint="gpt2", seed=seed)

In [None]:
# Convert every CSV row into an example through the classifier's own input
# builder. `total` is given explicitly because itertuples() yields a plain
# iterator with no length; without it tqdm can only show a running count,
# not a percentage or ETA.
all_dataset = [
    llm.build_input(
        sequence=row.sequence,
        target=row.target,
        organism=row.organism,
        gene=row.gene,
        before=row.flankBefore,
        after=row.flankAfter,
    )
    for row in tqdm(df.itertuples(), total=len(df))
]

# Hold out 5% of the examples for evaluation. The split is shuffled and driven
# by the shared seed so it is reproducible across runs.
# NOTE(review): the split is not stratified by target; consider passing
# `stratify=` if class balance in the small hold-out set matters.
train_dataset, test_dataset = train_test_split(
    all_dataset,
    test_size=0.05,
    random_state=seed,
    shuffle=True,
)


In [None]:
# Training configuration: one epoch, one example per step, no gradient
# accumulation, learning rate 2e-5.
train_params = TrainParams(
    epochs=1,
    batch_size=1,
    gradient_accumulation=1,
    lr=2e-5,
)

# Fine-tune the classifier on the training portion of the split.
llm.train(dataset=train_dataset, params=train_params)

In [None]:
# Persist the trained classifier through the project's save_pretrained(),
# passing "GPT2-ExInClassifier" as the destination.
# NOTE(review): presumably this is a directory relative to the working
# directory — confirm against ExInClassifierGPT.save_pretrained.
llm.save_pretrained("GPT2-ExInClassifier")

In [None]:
# Collect predictions and reference labels for the held-out examples, in the
# same order, so they can be fed pairwise to the sklearn metrics.
# NOTE(review): each example still carries its "target" entry when handed to
# generate(); confirm generate() does not expose it to the model.
y_pred = [llm.generate(sample) for sample in tqdm(test_dataset)]
y_true = [sample["target"] for sample in test_dataset]


In [None]:
# Per-class evaluation report, one section per class treated as "positive".
#
# Each class is scored with `labels=[label], average="macro"` instead of the
# binary `pos_label=` form. For a single label the macro average is simply
# that label's own score, so the numbers match the binary form whenever only
# INTRON/EXON occur. Unlike the binary form, it does not raise
# "Target is multiclass but average='binary'" if the generative model emits
# some other string for one of the test examples.
#
# Accuracy does not depend on the positive class; it is printed in both
# sections to keep the report layout unchanged.
for header, label in (
    ("\nMétricas para INTRON como classe positiva:", "INTRON"),
    ("Métricas para EXON como classe positiva:", "EXON"),
):
    class_only = {"labels": [label], "average": "macro"}
    print(header)
    print("  Precisão :", precision_score(y_true, y_pred, **class_only))
    print("  Recall   :", recall_score(y_true, y_pred, **class_only))
    print("  F1       :", f1_score(y_true, y_pred, **class_only))
    print("  Acurácia :", accuracy_score(y_true, y_pred))