# Libraries and Imports

In [None]:
import os

# Restrict this process to GPUs 0-5. This is done in its own cell, ahead of the
# model/library imports, so the setting is already in the environment when CUDA
# is first initialised (presumably by the model libraries -- confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

In [None]:
import editdistance
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from llms.dna_translator.llama import DNATranslatorLLaMA
from schemas.train_params import TrainParams
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Params and Files

In [None]:
# Seed shared by the model constructor and the train/test split.
seed = 42

# Name of the fine-tuned model; also used as its output directory name.
pretrained_model_name = "DNATranLLaMA"

# File name of the processed dataset inside ./storage/data/processed/.
csv_name = "tran-4000.csv"

# Input dataset and output directory, both relative to the working directory.
csv_path = "./storage/data/processed/" + csv_name
output_path = "./storage/models/tuned/" + pretrained_model_name

# Base model to fine-tune.
# NOTE(review): this path is absolute ("/storage/...") while the two paths above
# are relative ("./storage/...") -- confirm that this is intentional.
checkpoint = "/storage/models/base/llama-3.1-8b-instruct"

# Reading Dataset

In [None]:
# Load the processed dataset. keep_default_na=False stops pandas from turning
# empty cells and marker strings such as "NA" into NaN, so every cell is kept
# as the text that is in the file.
df = pd.read_csv(csv_path, keep_default_na=False)

# Loading Model

In [None]:
# Instantiate the project's LLaMA-based translator from the base checkpoint.
# The same seed is reused for the train/test split further down.
llm = DNATranslatorLLaMA(
  checkpoint=checkpoint,
  seed=seed
)

# Data Processing

In [None]:
# One dict per CSV row, keyed by column name.
data = df.to_dict(orient="records")

# Convert every row into a model input via llm.build_input; tqdm shows progress.
all_dataset = [
    llm.build_input(
        dna_sequence=row["sequence"],
        organism=row["organism"],
        protein_sequence=row["target"],
    )
    for row in tqdm(data)
]

# Hold out 5% of the examples for evaluation; the split is shuffled and seeded
# so it is reproducible.
train_dataset, test_dataset = train_test_split(
    all_dataset,
    test_size=0.05,
    shuffle=True,
    random_state=seed,
)

# Data Analysis

In [None]:
# Report how many examples ended up in each split.
print(f"Train Dataset Len: {len(train_dataset)}")
print(f"Test Dataset Len: {len(test_dataset)}")

In [None]:
def _dna_lengths(dataset):
    """Return len() of the "dna_sequence" entry of every example in *dataset*."""
    return [len(item["dna_sequence"]) for item in dataset]


# Per-split sequence lengths, used for the distribution plot.
train_lengths = _dna_lengths(train_dataset)
test_lengths = _dna_lengths(test_dataset)

In [None]:
# Overlay the train and test sequence-length histograms (with KDE curves) on a
# single set of axes.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.2)

fig, ax = plt.subplots(figsize=(10, 6))
for lengths, color, label in (
    (train_lengths, "skyblue", "Train"),
    (test_lengths, "salmon", "Test"),
):
    sns.histplot(lengths, kde=True, bins=40, color=color, label=label, ax=ax)

ax.set_title("Sequence Length Distribution", fontsize=16, weight="bold")
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Frequency")
ax.legend()
fig.tight_layout()
plt.show()

# Training

In [None]:

# Hyper-parameters for a single fine-tuning epoch.
# NOTE(review): batch_size=1 with gradient_accumulation=4 presumably gives an
# effective batch of 4 -- confirm against TrainParams / llm.train.
train_params = TrainParams(
    epochs=1,
    batch_size=1,
    gradient_accumulation=4,
    lr=4e-5,
    logging_steps=5,
)

# Fine-tune the model on the training split.
llm.train(train_dataset=train_dataset, params=train_params)

# Saving The Model

In [None]:
# Persist the fine-tuned model under output_path
# (./storage/models/tuned/<pretrained_model_name>).
# NOTE(review): what exactly is written (full weights vs. adapter only) depends
# on DNATranslatorLLaMA.save_pretrained -- confirm.
llm.save_pretrained(output_path)

# Test

In [None]:
# Evaluate on the held-out split: generate a prediction for every test example
# and score it against the reference protein sequence with a normalised
# edit-distance similarity (1.0 = identical; 0.0 = distance equals the longer
# length).
results = []

# The loop variable is named `example` (not `data`) so it does not overwrite the
# list of CSV records built in the data-processing cell.
for example in tqdm(test_dataset):
    # Assumes llm.generate returns a sequence comparable with the target
    # (e.g. a string) -- confirm against DNATranslatorLLaMA.generate.
    pred = llm.generate(example)
    target = example["protein_sequence"]

    dist = editdistance.eval(pred, target)

    # Normalise by the longer of the two sequences. If both are empty there is
    # nothing to divide by; the distance is 0, so count it as a perfect match
    # instead of raising ZeroDivisionError.
    longest = max(len(pred), len(target))
    similarity = (1 - dist / longest) if longest else 1.0

    results.append({
        "target": target,
        "pred": pred,
        "edit_dist": dist,
        "similarity": similarity,
    })

# Summary statistics over all test examples.
similarities = [r["similarity"] for r in results]
mean_similarity = np.mean(similarities)
std_similarity = np.std(similarities)

print(f"Mean similarity: {mean_similarity:.4f} ± {std_similarity:.4f}")

# Write the per-example results to the current working directory. The DataFrame
# index is written as the first column (pandas default).
pd.DataFrame(results).to_csv("output.csv")

In [None]:
# Recompute the summary statistics from `results` without re-running generation.
similarities = [entry["similarity"] for entry in results]
mean_similarity, std_similarity = np.mean(similarities), np.std(similarities)

print(f"Mean similarity: {mean_similarity:.4f} ± {std_similarity:.4f}")