# Configurações iniciais

In [None]:
!pip install datasets peft transformers trl pandas

In [None]:
pip freeze > requirements2.txt

SyntaxError: invalid syntax (3384738752.py, line 1)

In [None]:
from datasets import Dataset
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import pandas as pd
from transformers import GenerationConfig
from time import perf_counter

In [None]:
# Identifier of the base checkpoint passed to get_model_and_tokenizer().
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory name handed to TrainingArguments(output_dir=...).
output_model = "tinyllama-papagaio-v10"

def formatted_train(input, response=None):
    """Build a chat-formatted string for one user turn.

    Args:
        input: The user message. (The name shadows the builtin but is kept
            so existing keyword callers keep working.)
        response: The assistant reply. When given, the complete example is
            returned, terminated by ``</s>``. When omitted, only the prompt
            up to and including the ``<|assistant|>`` header is returned,
            which is the form needed to ask the model for a reply.

    Returns:
        The formatted text.
    """
    prompt = f"<|user|>\n{input}</s>\n<|assistant|>\n"
    if response is None:
        # The generation helpers in this file call this function with only
        # the user message; previously that raised TypeError.
        return prompt
    return f"{prompt}{response}</s>"

# Load the raw dataset; prepare_train_data() below reads its 'instruction',
# 'input' and 'output' columns.
df = pd.read_csv('livoxdataset.csv')

def prepare_train_data(df):
    """Turn the raw dataframe into a ``Dataset`` with a chat-formatted ``text`` column.

    Args:
        df: DataFrame with ``instruction``, ``input`` and ``output`` columns.
            The caller's frame is left untouched because ``drop`` returns a
            new frame.

    Returns:
        A ``Dataset`` built from the frame, including the new ``text`` column.
    """
    df = df.drop(columns=['instruction'])
    # Collapse line breaks so each reply is a single line.
    df['output'] = df['output'].apply(lambda x: x.replace('\n', ' '))
    # Build the text with formatted_train() so every example ends with the
    # closing </s> marker; the previous inline f-string omitted it, leaving
    # training examples without an end-of-reply token.
    df["text"] = [
        formatted_train(user_text, reply)
        for user_text, reply in zip(df['input'], df['output'])
    ]
    return Dataset.from_pandas(df)

# Training set; SFTTrainer below consumes it through its "text" column.
data = prepare_train_data(df)

def get_model_and_tokenizer(model_id):
    """Load the tokenizer and the 4-bit quantised causal LM for ``model_id``.

    The tokenizer's pad token is set to its EOS token. On the model config,
    ``use_cache`` is set to False and ``pretraining_tp`` to 1.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    tok.pad_token = tok.eos_token

    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    llm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_settings,
        device_map="auto",
    )

    llm_config = llm.config
    llm_config.use_cache = False
    llm_config.pretraining_tp = 1
    return llm, tok

# Load the quantised base model and its tokenizer at module level.
model, tokenizer = get_model_and_tokenizer(model_id)

# LoRA adapter settings handed to SFTTrainer below.
peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.07, bias="none", task_type="CAUSAL_LM"
)

# NOTE(review): both num_train_epochs and max_steps are set. Per the
# Transformers documentation a positive max_steps takes precedence -- confirm
# that stopping at 1200 steps (the checkpoint name used later in this file)
# is the intent.
training_arguments = TrainingArguments(
    output_dir=output_model,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    learning_rate=1e-3,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=3,
    max_steps=1200,
    fp16=True,
)

# Supervised fine-tuning over the "text" column of `data`, without packing.
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=2048
)

In [None]:
# Run fine-tuning with the arguments configured above (output_dir=output_model).
trainer.train()

# Obtenção de Resultados

In [None]:

def generate_response(user_input):
    """Generate and print the model's output for a single user message.

    Args:
        user_input: The user message text.

    Prints the decoded sequence returned by ``model.generate`` followed by
    the wall-clock time spent in generation. Returns ``None``.
    """
    # Build the inference prompt inline (user turn plus an open assistant
    # header) instead of calling formatted_train() with a single argument,
    # which is not how that helper's two-argument training form is meant to
    # be used and previously raised TypeError here.
    prompt = f"<|user|>\n{user_input}</s>\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    generation_config = GenerationConfig(penalty_alpha=0.9, do_sample=True,
                                         top_k=10, temperature=0.45, repetition_penalty=1.2,
                                         max_new_tokens=100, pad_token_id=tokenizer.eos_token_id
                                         )
    start_time = perf_counter()
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Stop the clock before decoding and printing so that only the
    # generate() call is reported as inference time.
    output_time = perf_counter() - start_time
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print(f"Time taken for inference: {round(output_time, 2)} seconds")

def generate_responses(df, output_column='output_model_without_lora'):
    """Generate a reply for every row of ``df`` and store it in ``output_column``.

    Args:
        df: DataFrame with an ``input`` column of user messages. It is
            modified in place.
        output_column: Name of the column that receives the generated text.
            Defaults to the historical column name so existing callers are
            unaffected.

    Returns:
        The same DataFrame, with ``output_column`` added or overwritten.
    """
    # The generation settings do not depend on the row, so build them once.
    generation_config = GenerationConfig(penalty_alpha=0.9, do_sample=True,
                                         top_k=10, temperature=0.45, repetition_penalty=1.2,
                                         max_new_tokens=100, pad_token_id=tokenizer.eos_token_id
                                         )
    outputs = []
    # Iterate the column directly: iterrows() yields (index, row) tuples, so
    # the previous row['input'] lookup on the tuple raised TypeError.
    for user_input in df['input']:
        # Inference prompt: user turn plus an open assistant header.
        prompt = f"<|user|>\n{user_input}</s>\n<|assistant|>\n"
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        generated = model.generate(**inputs, generation_config=generation_config)
        # For a causal LM the generated sequence starts with the prompt
        # tokens. Keep only the continuation so the stored text is the reply
        # itself and can be scored against the reference output.
        prompt_length = inputs['input_ids'].shape[1]
        reply = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
        outputs.append(reply.strip())
    df[output_column] = outputs
    return df

df = pd.read_csv('livoxdataset.csv')

# Baseline replies from `model` before the adapter checkpoint is loaded.
# NOTE(review): if this runs in the same session as training, SFTTrainer may
# already have attached LoRA layers to `model` -- confirm the baseline really
# is the untouched base model.
df = generate_responses(df)

df.to_csv('livoxdataset_without_lora_outputs.csv', index=False)

model_path = "tinyllama-papagaio-v10/checkpoint-1200"
peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")
model = peft_model.merge_and_unload()

# Write the LoRA replies to their own column: the metric cells read
# 'output_model_with_lora' from this file, and reusing the default column
# would overwrite the baseline replies instead.
df = generate_responses(df, output_column='output_model_with_lora')

df.to_csv('livoxdataset_with_model_lora_outputs.csv', index=False)

# Obtenção de Métricas

# BLEU

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu


# Generated replies (baseline and LoRA) and the reference dataset; the scoring
# code below pairs rows across these frames by index label.
outputs_without_lora = pd.read_csv('livoxdataset_without_lora_outputs.csv')
outputs_with_lora = pd.read_csv('livoxdataset_with_model_lora_outputs.csv')
livoxdataset = pd.read_csv('livoxdataset.csv')

def calculate_bleu(reference, hypothesis):
    """Return the sentence-level BLEU score of ``hypothesis`` against ``reference``.

    Args:
        reference: Reference text, or an already tokenised sequence.
        hypothesis: Generated text, or an already tokenised sequence.

    Returns:
        The score returned by ``sentence_bleu``.
    """
    def _tokens(text):
        # sentence_bleu works on token sequences. A raw string would be
        # iterated character by character, producing a character-level
        # score, so split strings on whitespace first.
        return text.split() if isinstance(text, str) else list(text)

    return sentence_bleu([_tokens(reference)], _tokens(hypothesis))

def _bleu_against_reference(frame, generated_column):
    """Score each row's generated text against the reference row with the same index label."""
    return frame.apply(
        lambda entry: calculate_bleu(livoxdataset.loc[entry.name, 'output'], entry[generated_column]),
        axis=1,
    )


# Per-row BLEU for the baseline and for the LoRA model.
outputs_without_lora['BLEU_without_LORA'] = _bleu_against_reference(
    outputs_without_lora, 'output_model_without_lora')

outputs_with_lora['BLEU_with_LORA'] = _bleu_against_reference(
    outputs_with_lora, 'output_model_with_lora')

# Persist the scored frames.
outputs_without_lora.to_csv('outputs_without_lora.csv', index=False)
outputs_with_lora.to_csv('outputs_with_lora.csv', index=False)

# Corpus-level summary: mean of the per-row scores.
media_com_lora = outputs_with_lora['BLEU_with_LORA'].mean()
media_sem_lora = outputs_without_lora['BLEU_without_LORA'].mean()

print(f"Média com LORA: {media_com_lora:.4f}")
print(f"Média sem LORA: {media_sem_lora:.4f}")

# TER

In [None]:
from nltk.translate.meteor_score import meteor_score
import pandas as pd 
from sacrebleu.metrics import TER

# Generated replies (baseline and LoRA) and the reference dataset.
outputs_without_lora = pd.read_csv('livoxdataset_without_lora_outputs.csv')
# Fixed file name: the generation step in this file saves the LoRA replies as
# 'livoxdataset_with_model_lora_outputs.csv'; the previous name
# 'livoxdataset_with_lora_outputs.csv' is never written anywhere.
outputs_with_lora = pd.read_csv('livoxdataset_with_model_lora_outputs.csv')
livoxdataset = pd.read_csv('livoxdataset.csv')

# Single scorer instance reused for every sentence pair.
ter_metric = TER()


def calculate_ter(reference, hypothesis):
    """Return the sentence-level TER score of ``hypothesis`` against ``reference``."""
    references = [reference]
    result = ter_metric.sentence_score(hypothesis, references)
    return result.score

def _ter_against_reference(frame, generated_column):
    """Score each row's generated text against the reference row with the same index label."""
    return frame.apply(
        lambda entry: calculate_ter(livoxdataset.loc[entry.name, 'output'], entry[generated_column]),
        axis=1,
    )


# Per-row TER for the baseline and for the LoRA model.
outputs_without_lora['TER_without_LORA'] = _ter_against_reference(
    outputs_without_lora, 'output_model_without_lora')

outputs_with_lora['TER_with_LORA'] = _ter_against_reference(
    outputs_with_lora, 'output_model_with_lora')

# Summary: mean of the per-row scores.
media_ter_com_lora = outputs_with_lora['TER_with_LORA'].mean()
media_ter_sem_lora = outputs_without_lora['TER_without_LORA'].mean()

print(f"Média TER com LORA: {media_ter_com_lora:.4f}")
print(f"Média TER sem LORA: {media_ter_sem_lora:.4f}")