# CtesibioAI v0.0.2

### CtesibioAI ships with this notebook so that you can train your model on Google Colab and take advantage of its computational power

### Follow these steps to train your model

* Install the requirements
* Provide the training data as question/answer pairs
* Train the model
* Test the model

In [None]:
pip install transformers datasets torch

### Provide the training data as question/answer pairs

In [None]:
# Training examples as (question, answer) pairs. Add more pairs here to
# extend the dataset; each pair becomes one {"pergunta", "resposta"} record.
qa_pairs = (
    ("Qual é a capital do Brasil?", "A capital do Brasil é Brasília."),
    ("Quem descobriu o Brasil?", "O Brasil foi descoberto por Pedro Álvares Cabral."),
    (
        "Qual é a maior floresta tropical do mundo?",
        "A maior floresta tropical do mundo é a Floresta Amazônica.",
    ),
)

datas = [
    {"pergunta": question, "resposta": answer}
    for question, answer in qa_pairs
]

### Train the model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load the pretrained checkpoint and its matching tokenizer.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Markers used to pad sequences and to delimit each training example.
special_tokens = dict(pad_token="<PAD>", bos_token="<BOS>", eos_token="<EOS>")

# Register the markers only when the tokenizer has no padding token yet, and
# resize the embedding matrix so the model has a row for every new token id.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))

# One training string per pair: "<BOS>question answer<EOS>".
formatted_data = [
    {"text": f"<BOS>{pair['pergunta']} {pair['resposta']}<EOS>"}
    for pair in datas
]

dataset = Dataset.from_list(formatted_data)

def tokenize_function(example):
    """Tokenize training text and build language-modeling labels.

    The text is padded/truncated to 128 tokens. Labels are a copy of the
    input ids in which every padded position is replaced by -100, the label
    value the model's loss ignores, so training does not spend capacity on
    learning to predict padding.

    Args:
        example: Mapping with a "text" entry holding either a single string
            or a list of strings (the latter when mapped with batched=True).

    Returns:
        The tokenizer encoding with an added "labels" entry.
    """
    encoding = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    def mask_padding(ids, mask):
        # attention_mask is 0 on padded positions and 1 on real tokens.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one id list per example.
        encoding["labels"] = [
            mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of ids.
        encoding["labels"] = mask_padding(input_ids, attention_mask)
    return encoding

# Tokenize the whole dataset in batches; each batch goes through
# tokenize_function, which also attaches the training labels.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


# Training configuration.
# No evaluation strategy is passed: the library default already disables
# evaluation during training, and the keyword was renamed from
# `evaluation_strategy` to `eval_strategy`, so naming it explicitly would tie
# this notebook to one range of transformers versions.
training_args = TrainingArguments(
    output_dir="./ctesibioAI-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=50,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to=[],  # Avoid integration with wandb or other tracking systems.
)


# The tokenizer is intentionally not handed to the Trainer: that keyword was
# renamed (`tokenizer` -> `processing_class`) across transformers releases,
# every example is already padded to the same length, and the tokenizer is
# saved explicitly once training finishes.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model.
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so they can be
# reloaded together from the same folder.
model.save_pretrained("./ctesibioAI-model")
tokenizer.save_pretrained("./ctesibioAI-model")

### Test the model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned model and tokenizer from the training output folder.
model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")

# Make sure the special tokens match the ones used in the training text.
tokenizer.pad_token = "<PAD>"
tokenizer.bos_token = "<BOS>"
tokenizer.eos_token = "<EOS>"

# The prompt starts with <BOS>, mirroring the "<BOS>question answer<EOS>"
# layout of the training examples.
input_text = "<BOS>capital do brasil?"
encoded = tokenizer(input_text, return_tensors="pt")
inputs = encoded["input_ids"]

output = model.generate(
    inputs,
    attention_mask=encoded["attention_mask"],  # Mark which positions are real tokens.
    max_length=50,
    num_return_sequences=1,
    do_sample=True,  # Required for temperature/top_k to take effect.
    pad_token_id=tokenizer.pad_token_id,  # Keep padding consistent with training.
    eos_token_id=tokenizer.eos_token_id,  # Stop at the <EOS> marker that ends each training example.
    temperature=0.7,  # Randomness control.
    top_k=50,         # Sample only from the 50 most likely tokens.
    repetition_penalty=2.0,  # Penalize repetitions.
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Ctesibio Response:")
print(generated_text)

### Optionally, download the trained model so you can use it anywhere

In [None]:
"""This piece of code is only used to download the model to your machine through Colab"""

from google.colab import files
import shutil

# Folder to archive. It must be the same path the training cell saved the
# model and tokenizer to ("./ctesibioAI-model"); the previous value was
# missing the slash and pointed at a different, non-existent folder.
pasta_para_zipar = './ctesibioAI-model'
file_zip = 'ctesibioAI-model.zip'

# make_archive appends the ".zip" extension itself, so it is stripped from the
# base name here. The call returns the path of the archive it created.
archive_path = shutil.make_archive(
    base_name=file_zip.replace('.zip', ''),
    format='zip',
    root_dir=pasta_para_zipar,
)

# Trigger the browser download of the archive from the Colab runtime.
files.download(archive_path)