# Fine-tuning controlado de RoBERTa (GT vs Consenso LLM)

## 1) Imports e setup

In [1]:
import pandas as pd
from transformers import TrainingArguments
from datasets import Dataset
from pathlib import Path

import sys
from loguru import logger
import pandas as pd

# Remove every handler registered so far so that only the sink below emits output.
logger.remove()

# Single stdout sink: HH:mm:ss timestamp, level padded to 8 characters, INFO and above.
logger.add(
    sys.stdout,
    level="INFO",
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
)

# NOTE(review): this message is emitted before the project imports further down this cell run.
logger.success("✓ Setup completo")

# loaders
from src.utils.data_loader import load_hf_dataset_as_dataframe

# fine-tuning system
from src.fine_tune_system.fine_tune.supervised_fine_tuner import SupervisedFineTuner
from src.fine_tune_system.core.hf_tokenizer import HFTokenizer
from src.fine_tune_system.core.model_factory import ModelFactory
from src.fine_tune_system.training.trainer_builder import TrainerBuilder
from src.fine_tune_system.training.metrics import MetricsComputer
from src.fine_tune_system.training.label_schema import LabelSchema
from src.utils.dataset_aligner import DatasetAligner

[32m13:55:00[0m | [32m[1mSUCCESS [0m | [32m[1m✓ Setup completo[0m


## 2) Configuração do experimento

### - Configurações de dataset

In [2]:
from src.api.schemas.dataset import DatasetConfig

# Training-side configuration: the "train" split with no sub-sampling.
# NOTE(review): not referenced anywhere else in the visible notebook.
DATASET_GLOBAL_CONFIG_TRAIN = DatasetConfig(
    random_state=42,
    sample_size=None,
    combine_splits=[],
    split="train",
)

# Evaluation-side configuration: 5000 examples sampled from the "test" split
# with a fixed seed; later divided into validation and test halves.
DATASET_GLOBAL_CONFIG_EVAL = DatasetConfig(
    random_state=42,
    sample_size=5000,
    combine_splits=[],
    split="test",
)

### - Configurações de cache

In [3]:
# Cache toggle. NOTE(review): not referenced anywhere else in the visible notebook.
CACHE_ENABLED = True
# Cache directory handed to the Hugging Face dataset loader.
# NOTE(review): hard-coded absolute, user-specific Windows path — the notebook will not
# run on another machine without editing this value.
CACHE_DIR = "C:\\Users\\gabri\\Documents\\GitHub\\llm-annotation\\data\\.cache"

### - Resultados

In [4]:
# Root directory holding annotation results (converted to a Path in the next cell).
# NOTE(review): hard-coded absolute, user-specific Windows path — not portable.
RESULTS_DIR = "C:\\Users\\gabri\\Documents\\GitHub\\llm-annotation\\data\\results"

In [5]:
# Dataset identifier and the date-stamped annotation run whose results are loaded.
DATASET_NAME = "sst1"
SPECIFIC_DATE = "2025-12-27"

# Promote the string root to a Path, then descend into <dataset>/<date>.
RESULTS_DIR = Path(RESULTS_DIR)
results_dataset_path = RESULTS_DIR / DATASET_NAME / SPECIFIC_DATE

## 3) Carregar Dados

### - Carregando

In [6]:
# Load the fully annotated dataset produced by the annotation run.
df = pd.read_csv(results_dataset_path / "summary" / "dataset_anotado_completo.csv")

print(f"Anotado: {len(df)}")

Anotado: 5000


### - Dataset ground_truth

In [7]:
# Ground-truth training frame: keep id and text, expose `ground_truth` as `label`.
df_gt_train = df.loc[:, ["text_id", "text", "ground_truth"]].rename(
    columns={"ground_truth": "label"}
)

In [8]:
from src.utils.data_loader import add_label_description

# Attach the `label_description` column for this dataset's label ids.
df_gt_train = add_label_description(df_gt_train, dataset_name=DATASET_NAME)

In [9]:
# Build the label schema from the ground-truth training frame; the same schema is
# later passed to every fine-tuning run.
label_schema = LabelSchema.from_dataframe(df_gt_train)
# Show the id -> description mapping as a sanity check.
print(label_schema.id2label)

{0: 'very negative sentiment', 1: 'negative sentiment', 2: 'neutral sentiment', 3: 'positive sentiment', 4: 'very positive sentiment'}


In [10]:
# Display the ground-truth training frame (text_id, text, label, label_description).
df_gt_train

Unnamed: 0,text_id,text,label,label_description
0,0,impostor has a handful of thrilling moments an...,1,negative sentiment
1,1,the acting in pauline and paulette is good all...,4,very positive sentiment
2,2,"mr. polanski is in his element here : alone , ...",3,positive sentiment
3,3,a cop story that understands the medium amazin...,3,positive sentiment
4,4,the most horrific movie experience i 've had s...,0,very negative sentiment
...,...,...,...,...
4995,4995,remember it .\n,3,positive sentiment
4996,4996,-lrb- woo 's -rrb- most resonant film since th...,3,positive sentiment
4997,4997,it 's a movie that accomplishes so much that o...,3,positive sentiment
4998,4998,a different movie -- sometimes tedious -- by a...,3,positive sentiment


### - Dataset anotado

In [11]:
# LLM-consensus training frame: keep id and text, expose the majority-vote
# annotation (`most_common_annotation`) as `label`.
df_annotations = df.loc[:, ["text_id", "text", "most_common_annotation"]].rename(
    columns={"most_common_annotation": "label"}
)

In [12]:
# Attach the `label_description` column for this dataset's label ids.
df_annotations = add_label_description(df_annotations, dataset_name=DATASET_NAME)

In [13]:
# Display the consensus-labelled frame before problematic instances are removed.
df_annotations

Unnamed: 0,text_id,text,label,label_description
0,0,impostor has a handful of thrilling moments an...,1,negative sentiment
1,1,the acting in pauline and paulette is good all...,3,positive sentiment
2,2,"mr. polanski is in his element here : alone , ...",3,positive sentiment
3,3,a cop story that understands the medium amazin...,3,positive sentiment
4,4,the most horrific movie experience i 've had s...,1,negative sentiment
...,...,...,...,...
4995,4995,remember it .\n,2,neutral sentiment
4996,4996,-lrb- woo 's -rrb- most resonant film since th...,3,positive sentiment
4997,4997,it 's a movie that accomplishes so much that o...,3,positive sentiment
4998,4998,a different movie -- sometimes tedious -- by a...,1,negative sentiment


### - Removendo instâncias problemáticas

In [14]:
# Instances flagged as problematic by the consensus analysis of the same run.
df_problematic = pd.read_csv(results_dataset_path / "consensus" / "problematic_cases.csv")

df_problematic

Unnamed: 0,text_id,text,consensus_score,annotations,entropy
0,5,"massoud 's story is an epic , but also a trage...",0.4,"{1: 2, 3: 1, 4: 1, 2: 1}",1.921928
1,9,if you saw benigni 's pinocchio at a public pa...,0.4,"{0: 2, 1: 2, 4: 1}",1.521928
2,12,meant to reduce blake 's philosophy into a tra...,0.4,"{1: 2, 2: 2, 4: 1}",1.521928
3,16,if the predictability of bland comfort food ap...,0.4,"{3: 2, 2: 2, 4: 1}",1.521928
4,17,what they 're doing is a matter of plumbing ar...,0.4,"{1: 2, 2: 2, 4: 1}",1.521928
...,...,...,...,...,...
1024,4961,"an intelligent , multi-layered and profoundly ...",0.4,"{4: 2, 3: 2, 2: 1}",1.521928
1025,4972,the best part about `` gangs '' was daniel day...,0.4,"{3: 2, 1: 2, 4: 1}",1.521928
1026,4975,any one episode of the sopranos would send thi...,0.4,"{0: 2, 1: 2, 4: 1}",1.521928
1027,4987,strange occurrences build in the mind of the v...,0.4,"{0: 1, 1: 2, 4: 1, 2: 1}",1.921928


In [15]:
# Keep only rows whose text_id is NOT listed among the problematic cases, then
# renumber the index from zero.
# NOTE(review): the ground-truth training frame is not filtered the same way, so the
# two training sets differ in size (3971 vs 5000 rows in the recorded outputs) —
# confirm this is intended for a controlled comparison.
df_annotations = df_annotations.loc[
    ~df_annotations["text_id"].isin(df_problematic["text_id"])
].reset_index(drop=True)

In [16]:
# Display the consensus-labelled frame after problematic instances were removed.
df_annotations

Unnamed: 0,text_id,text,label,label_description
0,0,impostor has a handful of thrilling moments an...,1,negative sentiment
1,1,the acting in pauline and paulette is good all...,3,positive sentiment
2,2,"mr. polanski is in his element here : alone , ...",3,positive sentiment
3,3,a cop story that understands the medium amazin...,3,positive sentiment
4,4,the most horrific movie experience i 've had s...,1,negative sentiment
...,...,...,...,...
3966,4995,remember it .\n,2,neutral sentiment
3967,4996,-lrb- woo 's -rrb- most resonant film since th...,3,positive sentiment
3968,4997,it 's a movie that accomplishes so much that o...,3,positive sentiment
3969,4998,a different movie -- sometimes tedious -- by a...,1,negative sentiment


### - Carrega dataset de teste e validação 

In [17]:
# Load the held-out evaluation sample using the evaluation dataset configuration;
# the second element of the returned pair is not needed here.
df_eval, _ = load_hf_dataset_as_dataframe(
    dataset_global_config=DATASET_GLOBAL_CONFIG_EVAL,
    dataset_name=DATASET_NAME,
    cache_dir=CACHE_DIR,
)

print(f"Avaliação HF: {len(df_eval)}")

[32m13:55:06[0m | [1mCarregando dataset: sst1[0m


Downloading readme:   0%|          | 0.00/965 [00:00<?, ?B/s]

[32m13:55:12[0m | [1mSplit 'test': 11855 exemplos[0m
[32m13:55:12[0m | [1mCategorias extraídas automaticamente: [0, 1, 2, 3, 4][0m
[32m13:55:12[0m | [1mAmostra reduzida para 5000 exemplos (seed=42)[0m
[32m13:55:12[0m | [1mColuna de texto: text[0m
[32m13:55:13[0m | [1mGround truth carregado da coluna 'label'[0m
[32m13:55:13[0m | [1mDataFrame criado com 5000 linhas[0m


Avaliação HF: 5000


In [18]:
from sklearn.model_selection import train_test_split

# Split the evaluation sample 50/50 into validation and test halves, stratified by
# label, with a fixed seed so the split is reproducible.
df_gt_val, df_gt_test = train_test_split(
    df_eval,
    stratify=df_eval["label"],
    test_size=0.5,
    random_state=42,
)

In [19]:
# Display the validation half of the evaluation sample.
df_gt_val

Unnamed: 0,text,label,label_description
173,cox is far more concerned with aggrandizing ma...,2,neutral sentiment
39,"much of the lady and the duke is about quiet ,...",3,positive sentiment
1099,victor rosa is leguizamo 's best movie work so...,4,very positive sentiment
4458,"the year 's happiest surprise , a movie that d...",4,very positive sentiment
874,"the irwins emerge unscathed , but the fictiona...",0,very negative sentiment
...,...,...,...
1669,both deeply weird and charmingly dear .\n,3,positive sentiment
4049,it 's a testament to de niro and director mich...,3,positive sentiment
486,without the dark spookiness of crystal lake ca...,2,neutral sentiment
2561,everything is pegged into the groove of a new ...,2,neutral sentiment


In [20]:
# Display the test half of the evaluation sample.
df_gt_test

Unnamed: 0,text,label,label_description
24,not so much a movie as a picture book for the ...,1,negative sentiment
3023,adolescents will be adequately served by the m...,3,positive sentiment
1136,tsai ming-liang 's ghosts are painfully aware ...,2,neutral sentiment
4956,where tom green stages his gags as assaults on...,1,negative sentiment
4571,"the word that comes to mind , while watching e...",3,positive sentiment
...,...,...,...
1284,labute 's careful handling makes the material ...,4,very positive sentiment
4996,"the town has kind of an authentic feel , but e...",1,negative sentiment
2967,"thanks to ice cube , benjamins feels an awful ...",2,neutral sentiment
235,i have a confession to make : i did n't partic...,1,negative sentiment


## 5) Converter para HuggingFace Dataset

In [21]:
from datasets import Dataset

# Convert every frame to a Hugging Face Dataset.
# `preserve_index=False` drops the pandas index: the validation/test frames carry a
# shuffled index after the split, which would otherwise be stored as an extra
# `__index_level_0__` column.
train_gt = Dataset.from_pandas(df_gt_train, preserve_index=False)
train_consensus = Dataset.from_pandas(df_annotations, preserve_index=False)
test_dataset = Dataset.from_pandas(df_gt_test, preserve_index=False)
eval_dataset = Dataset.from_pandas(df_gt_val, preserve_index=False)

## 6) Fine-tuning 

In [22]:
# Checkpoint name of the base model to fine-tune (a string identifier, not a model object).
model = "roberta-base"

In [23]:
# Directory that receives training checkpoints and the final results CSV.
fine_tune_output_dir = results_dataset_path.joinpath("finetuning")

# Create it up front; safe to re-run because existing directories are tolerated.
fine_tune_output_dir.mkdir(parents=True, exist_ok=True)

# Training configuration shared by every fine-tuning run in this notebook.
training_args = TrainingArguments(
    # `output_dir` is documented as a string, so convert the Path explicitly.
    output_dir=str(fine_tune_output_dir),
    # Evaluate and checkpoint once per epoch (the two strategies must match for
    # `load_best_model_at_end` to work).
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=5e-5,
    num_train_epochs=20,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,

    weight_decay=0.01,

    # After training, reload the checkpoint with the highest macro-F1 on the
    # validation set.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    logging_strategy="epoch",
    # Limit how many checkpoints are retained on disk.
    save_total_limit=1,

    seed=42,
)

In [24]:
def run_fine_tuning(model: str, train_dataset, test_dataset, eval_dataset=None, experiment_name: str = "default"):
    """Fine-tune ``model`` on ``train_dataset`` and evaluate it on ``test_dataset``.

    Uses the notebook-level ``training_args`` and ``label_schema``. Each experiment
    writes its checkpoints to ``<training_args.output_dir>/<experiment_name>`` so that
    consecutive runs do not rotate or delete one another's checkpoints.

    Args:
        model: Name of the pretrained checkpoint to fine-tune.
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for the final evaluation.
        eval_dataset: Dataset evaluated during training. The shared training arguments
            evaluate every epoch, so callers are expected to provide it.
        experiment_name: Tag stored under the ``"source"`` key of the returned metrics
            and used as the checkpoint sub-directory name.

    Returns:
        The metrics mapping returned by the fine-tuner's ``evaluate`` call, with an
        added ``"source"`` entry.
    """
    import copy  # local import keeps this cell self-contained

    # Shallow-copy the shared arguments and point this run at its own directory;
    # the notebook-level `training_args` object is left untouched.
    run_args = copy.copy(training_args)
    run_args.output_dir = str(Path(training_args.output_dir) / experiment_name)

    tokenizer = HFTokenizer(
        model_name=model,
        max_length=256,
    )

    fine_tuner = SupervisedFineTuner(
        model_name=model,
        training_args=run_args,
        label_schema=label_schema,
        tokenizer=tokenizer,
        model_factory=ModelFactory,
        trainer_builder=TrainerBuilder,
        metrics_computer=MetricsComputer(),
    )

    fine_tuner.fit(train_dataset, eval_dataset)
    metrics = fine_tuner.evaluate(test_dataset)

    # Tag the metrics so rows from different experiments can be told apart.
    metrics["source"] = experiment_name

    return metrics

In [25]:
# Experiment 1: train on the LLM-consensus labels (problematic instances removed),
# validate on `eval_dataset` during training and report metrics on `test_dataset`.
metrics_consensus = run_fine_tuning(
    model=model,
    train_dataset=train_consensus,
    test_dataset=test_dataset,
    eval_dataset=eval_dataset,
    experiment_name="consensus_llm",
)

Map:   0%|          | 0/3971 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7486,2.0167,0.4304,0.256347
2,0.4782,2.376189,0.4444,0.289095
3,0.3425,2.767248,0.4644,0.303403
4,0.2845,2.750978,0.4604,0.289646
5,0.2092,3.31691,0.45,0.291615
6,0.1372,3.800683,0.454,0.309901
7,0.0855,4.075778,0.4724,0.317644
8,0.0683,4.279271,0.4728,0.317395
9,0.0574,4.601912,0.4584,0.298529
10,0.0391,4.527915,0.4764,0.324636


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [26]:
# Experiment 2: train on the ground-truth labels, with the same validation and test
# sets as the consensus experiment.
# NOTE(review): the recorded output shows validation macro-F1 stuck near 0.08 for most
# epochs, which suggests this run did not learn — possibly optimisation instability
# with the current hyperparameters; TODO confirm before comparing the two results.
metrics_gt = run_fine_tuning(
    model=model,
    train_dataset=train_gt,
    test_dataset=test_dataset,
    eval_dataset=eval_dataset,
    experiment_name="ground_truth",
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.5826,1.571604,0.2608,0.082741
2,1.5769,1.574779,0.2672,0.084343
3,1.5765,1.575871,0.2672,0.084343
4,1.5401,1.505087,0.3264,0.182228
5,1.5608,1.579022,0.2672,0.084343
6,1.5721,1.579028,0.2608,0.082741
7,1.5713,1.574764,0.2672,0.084343
8,1.5701,1.57349,0.2608,0.082741
9,1.5715,1.581846,0.2672,0.084343
10,1.5705,1.572051,0.2608,0.082741


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [27]:
# One row per experiment; the `source` column identifies which training labels were used.
results = pd.DataFrame([metrics_consensus, metrics_gt])
results

Unnamed: 0,eval_loss,eval_accuracy,eval_f1_macro,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,source
0,5.052509,0.4608,0.299412,18.7065,133.643,4.223,20.0,consensus_llm
1,1.461705,0.3624,0.247437,18.6866,133.785,4.228,20.0,ground_truth


In [28]:
# Persist the comparison table next to the checkpoints.
# Model ids may contain "/" (e.g. "org/name"), which would be interpreted as a
# directory separator in the file name, so it is replaced with "_".
results.to_csv(
    fine_tune_output_dir.joinpath(f"{model.replace('/', '_')}_fine_tuning_results.csv"),
    index=False,
)