# Treinando apenas com os íntrons e éxons

In [1]:
import pickle
import random

import numpy as np
from datasets import Dataset
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments)

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
file = open("./database/col_ac.mod1", "rb")
data = pickle.load(file)

database = data["train"] + data["test"]

In [3]:
introns_data = []
exons_data = []

for sequence in database:
	introns = sequence["introns"]
	exons = sequence["exons"]

	for intron in introns:
		introns_data.append(intron["data"])

	for exon in exons:
		exons_data.append(exon["data"])

introns_data = set(introns_data)
exons_data = set(exons_data)

In [4]:
transformers_input = []

for sequence in introns_data:
  transformers_input.append({
    "prompt": f"what is the classification for this sequence? {sequence}",
    "completion": "[INTRON]"
  })

for sequence in exons_data:
  transformers_input.append({
    "prompt": f"what is the classification for this sequence? {sequence}",
    "completion": "[EXON]"
  })

random.shuffle(transformers_input)

In [5]:
train_proportion = 0.8
dataset_len = len(transformers_input)
crop = int(train_proportion * dataset_len)

train = transformers_input[:crop]
test = transformers_input[crop:]

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
special_tokens = ["A", "C", "G", "T", "[EXON]", "[INTRON]"]
tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

In [8]:
sequence_max_lengths = [len(tokenizer(seq["prompt"])["input_ids"]) for seq in transformers_input]

In [9]:
crop_length = int(np.percentile(sequence_max_lengths, 95))

In [None]:
print(f"Lenght for the sequences crop: {crop_length}")

In [11]:
hf_train = Dataset.from_list(train)
hf_test = Dataset.from_list(test)

In [6]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def to_tokens(example):
  inputs = tokenizer(example["prompt"], truncation=True, padding="max_length", max_length=crop_length)
  outputs = tokenizer(example["completion"], truncation=True, padding="max_length", max_length=crop_length)
  inputs["labels"] = outputs["input_ids"]
  return inputs

tokenized_train = hf_train.map(to_tokens, batched=True, batch_size=32)
tokenized_test = hf_test.map(to_tokens, batched=True, batch_size=32)

In [None]:
training_args = TrainingArguments(
  output_dir="./results",
  evaluation_strategy="epoch",
  learning_rate=5e-5,
  num_train_epochs=3,
  per_device_train_batch_size=4,
  save_steps=10,
  save_total_limit=2
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_train,
  eval_dataset=tokenized_test,
  tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("spliceGPT")
tokenizer.save_pretrained("spliceGPT")

In [3]:
subject = 15
prompt = "Write a story about a dragon who learns to fly."

In [7]:
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
attention_mask = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).attention_mask

In [11]:
output = model.generate(
    input_ids,
    max_length=200,
    num_beams=5,            # Usando busca por feixe para melhor desempenho
    temperature=0.5,        # Experimente uma temperatura maior
    top_k=50,               # Considera apenas os 50 tokens mais prováveis
    top_p=0.95,             # Aplica amostragem com probabilidade acumulada
    do_sample=False,         # Amostragem ativa
    no_repeat_ngram_size=2  # Evita repetições de n-grams
)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [12]:
generated_sequence = tokenizer.decode(output[0], skip_special_tokens=True).strip()
print("Response:", generated_sequence)

Response: Write a story about a dragon who learns to fly.

This is the story of a young girl who discovers that she can fly, but she has to learn how to do it in order to make it to the next level. The story begins with the girl's first encounter with a flying dragon, and it's up to her to find out what it is that makes her so special.
