In [2]:
from sklearn.model_selection import KFold
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import pandas as pd

In [29]:
dataset = load_dataset(
    'csv',
    data_files={'train': './datas/training_text_personal_data.csv'},
    delimiter=';',
    )


Generating train split: 52 examples [00:00, 642.26 examples/s]


In [30]:
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def preprocess(data):
    return tokenizer(data['text'], padding='max_length', truncation=True, max_length=512)

In [32]:
tokenized_dataset = dataset['train'].map(preprocess, batched=True)


[A

Map: 100%|██████████| 52/52 [00:00<00:00, 771.43 examples/s]


In [33]:
print(type(tokenized_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [34]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [35]:
results = []

In [36]:
for fold, (train_index, test_index) in enumerate(kf.split(tokenized_dataset)):
    print(f"Treinando o Fold {fold + 1}")
    
    train_data = tokenized_dataset.select(train_index)
    test_data = tokenized_dataset.select(test_index)
    
    training_args = TrainingArguments(
        output_dir=f"./datas/results_fold_{fold + 1}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        save_total_limit=1,
        logging_dir=f"./logs_fold_{fold + 1}",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
    )
    
    trainer.train()
    
    eval_result = trainer.evaluate()
    results.append(eval_result)
    print(f"Resultados para o Fold {fold + 1}: {eval_result}")

Treinando o Fold 1


  trainer = Trainer(
  0%|          | 0/18 [22:30<?, ?it/s]
 33%|███▎      | 6/18 [01:49<02:50, 14.22s/it]
[A
[A

[A[A                                       
                                              
 33%|███▎      | 6/18 [01:56<02:50, 14.22s/it]
[A

{'eval_loss': 1.0028258562088013, 'eval_runtime': 6.4798, 'eval_samples_per_second': 1.698, 'eval_steps_per_second': 0.309, 'epoch': 1.0}


 67%|██████▋   | 12/18 [03:33<01:22, 13.72s/it]
[A
[A

[A[A                                       
                                               
 67%|██████▋   | 12/18 [03:39<01:22, 13.72s/it]
[A

{'eval_loss': 0.9608098864555359, 'eval_runtime': 6.4826, 'eval_samples_per_second': 1.697, 'eval_steps_per_second': 0.309, 'epoch': 2.0}


100%|██████████| 18/18 [05:17<00:00, 14.10s/it]
[A
[A

[A[A                                       
                                               
100%|██████████| 18/18 [05:28<00:00, 14.10s/it]
[A
100%|██████████| 18/18 [05:28<00:00, 18.26s/it]


{'eval_loss': 0.9470769762992859, 'eval_runtime': 6.1459, 'eval_samples_per_second': 1.79, 'eval_steps_per_second': 0.325, 'epoch': 3.0}
{'train_runtime': 328.7599, 'train_samples_per_second': 0.374, 'train_steps_per_second': 0.055, 'train_loss': 1.0253802405463324, 'epoch': 3.0}


100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
  trainer = Trainer(


Resultados para o Fold 1: {'eval_loss': 0.9470769762992859, 'eval_runtime': 6.2071, 'eval_samples_per_second': 1.772, 'eval_steps_per_second': 0.322, 'epoch': 3.0}
Treinando o Fold 2


 33%|███▎      | 6/18 [01:39<02:42, 13.54s/it]
[A
[A

[A[A                                       
                                              
 33%|███▎      | 6/18 [01:46<02:42, 13.54s/it]
[A

{'eval_loss': 0.8630594611167908, 'eval_runtime': 6.4214, 'eval_samples_per_second': 1.713, 'eval_steps_per_second': 0.311, 'epoch': 1.0}


 67%|██████▋   | 12/18 [03:26<01:21, 13.61s/it]
[A
[A

[A[A                                       
                                               
 67%|██████▋   | 12/18 [03:32<01:21, 13.61s/it]
[A

{'eval_loss': 0.8196222186088562, 'eval_runtime': 6.5194, 'eval_samples_per_second': 1.687, 'eval_steps_per_second': 0.307, 'epoch': 2.0}


100%|██████████| 18/18 [05:07<00:00, 13.36s/it]
[A
[A

[A[A                                       
                                               
100%|██████████| 18/18 [05:16<00:00, 13.36s/it]
[A
100%|██████████| 18/18 [05:16<00:00, 17.57s/it]


{'eval_loss': 0.8099384307861328, 'eval_runtime': 5.9974, 'eval_samples_per_second': 1.834, 'eval_steps_per_second': 0.333, 'epoch': 3.0}
{'train_runtime': 316.1875, 'train_samples_per_second': 0.389, 'train_steps_per_second': 0.057, 'train_loss': 0.8562533060709635, 'epoch': 3.0}


100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
  trainer = Trainer(


Resultados para o Fold 2: {'eval_loss': 0.8099384307861328, 'eval_runtime': 6.947, 'eval_samples_per_second': 1.583, 'eval_steps_per_second': 0.288, 'epoch': 3.0}
Treinando o Fold 3


 33%|███▎      | 6/18 [01:44<02:53, 14.48s/it]
[A
[A

[A[A                                       
                                              
 33%|███▎      | 6/18 [01:51<02:53, 14.48s/it]
[A

{'eval_loss': 0.61440509557724, 'eval_runtime': 6.3485, 'eval_samples_per_second': 1.575, 'eval_steps_per_second': 0.315, 'epoch': 1.0}


 67%|██████▋   | 12/18 [03:28<01:25, 14.17s/it]
[A
[A

[A[A                                       
                                               
 67%|██████▋   | 12/18 [03:34<01:25, 14.17s/it]
[A

{'eval_loss': 0.5808447003364563, 'eval_runtime': 6.0561, 'eval_samples_per_second': 1.651, 'eval_steps_per_second': 0.33, 'epoch': 2.0}


100%|██████████| 18/18 [05:10<00:00, 14.09s/it]
[A
[A

[A[A                                       
                                               
100%|██████████| 18/18 [05:18<00:00, 14.09s/it]
[A
100%|██████████| 18/18 [05:18<00:00, 17.69s/it]


{'eval_loss': 0.5600932240486145, 'eval_runtime': 5.8576, 'eval_samples_per_second': 1.707, 'eval_steps_per_second': 0.341, 'epoch': 3.0}
{'train_runtime': 318.4166, 'train_samples_per_second': 0.396, 'train_steps_per_second': 0.057, 'train_loss': 0.6745323075188531, 'epoch': 3.0}


100%|██████████| 2/2 [00:01<00:00,  1.78it/s]


Resultados para o Fold 3: {'eval_loss': 0.5600932240486145, 'eval_runtime': 5.6719, 'eval_samples_per_second': 1.763, 'eval_steps_per_second': 0.353, 'epoch': 3.0}
Treinando o Fold 4


  trainer = Trainer(
 33%|███▎      | 6/18 [01:40<02:50, 14.25s/it]
[A
[A

[A[A                                       
                                              
 33%|███▎      | 6/18 [01:46<02:50, 14.25s/it]
[A

{'eval_loss': 0.525206446647644, 'eval_runtime': 5.7654, 'eval_samples_per_second': 1.734, 'eval_steps_per_second': 0.347, 'epoch': 1.0}


 67%|██████▋   | 12/18 [03:23<01:25, 14.30s/it]
[A
[A

[A[A                                       
                                               
 67%|██████▋   | 12/18 [03:29<01:25, 14.30s/it]
[A

{'eval_loss': 0.49075180292129517, 'eval_runtime': 5.4566, 'eval_samples_per_second': 1.833, 'eval_steps_per_second': 0.367, 'epoch': 2.0}


100%|██████████| 18/18 [05:12<00:00, 15.09s/it]
[A
[A

[A[A                                       
                                               
100%|██████████| 18/18 [05:22<00:00, 15.09s/it]
[A
100%|██████████| 18/18 [05:22<00:00, 17.91s/it]


{'eval_loss': 0.47197189927101135, 'eval_runtime': 4.8958, 'eval_samples_per_second': 2.043, 'eval_steps_per_second': 0.409, 'epoch': 3.0}
{'train_runtime': 322.4589, 'train_samples_per_second': 0.391, 'train_steps_per_second': 0.056, 'train_loss': 0.4196605682373047, 'epoch': 3.0}


100%|██████████| 2/2 [00:01<00:00,  1.94it/s]
  trainer = Trainer(


Resultados para o Fold 4: {'eval_loss': 0.47197189927101135, 'eval_runtime': 5.45, 'eval_samples_per_second': 1.835, 'eval_steps_per_second': 0.367, 'epoch': 3.0}
Treinando o Fold 5


 33%|███▎      | 6/18 [01:30<02:32, 12.75s/it]
[A
[A

[A[A                                       
                                              
 33%|███▎      | 6/18 [01:36<02:32, 12.75s/it]
[A

{'eval_loss': 0.251351535320282, 'eval_runtime': 5.563, 'eval_samples_per_second': 1.798, 'eval_steps_per_second': 0.36, 'epoch': 1.0}


 67%|██████▋   | 12/18 [03:07<01:18, 13.15s/it]
[A
[A

[A[A                                       
                                               
 67%|██████▋   | 12/18 [03:12<01:18, 13.15s/it]
[A

{'eval_loss': 0.22464656829833984, 'eval_runtime': 5.2799, 'eval_samples_per_second': 1.894, 'eval_steps_per_second': 0.379, 'epoch': 2.0}


100%|██████████| 18/18 [04:51<00:00, 14.00s/it]
[A
[A

[A[A                                       
                                               
100%|██████████| 18/18 [05:00<00:00, 14.00s/it]
[A
100%|██████████| 18/18 [05:00<00:00, 16.70s/it]


{'eval_loss': 0.20889540016651154, 'eval_runtime': 4.8407, 'eval_samples_per_second': 2.066, 'eval_steps_per_second': 0.413, 'epoch': 3.0}
{'train_runtime': 300.6342, 'train_samples_per_second': 0.419, 'train_steps_per_second': 0.06, 'train_loss': 0.2668522728814019, 'epoch': 3.0}


100%|██████████| 2/2 [00:01<00:00,  1.99it/s]

Resultados para o Fold 5: {'eval_loss': 0.20889540016651154, 'eval_runtime': 5.0383, 'eval_samples_per_second': 1.985, 'eval_steps_per_second': 0.397, 'epoch': 3.0}





In [37]:
print("Resultados de Cross-Validation:", results)

Resultados de Cross-Validation: [{'eval_loss': 0.9470769762992859, 'eval_runtime': 6.2071, 'eval_samples_per_second': 1.772, 'eval_steps_per_second': 0.322, 'epoch': 3.0}, {'eval_loss': 0.8099384307861328, 'eval_runtime': 6.947, 'eval_samples_per_second': 1.583, 'eval_steps_per_second': 0.288, 'epoch': 3.0}, {'eval_loss': 0.5600932240486145, 'eval_runtime': 5.6719, 'eval_samples_per_second': 1.763, 'eval_steps_per_second': 0.353, 'epoch': 3.0}, {'eval_loss': 0.47197189927101135, 'eval_runtime': 5.45, 'eval_samples_per_second': 1.835, 'eval_steps_per_second': 0.367, 'epoch': 3.0}, {'eval_loss': 0.20889540016651154, 'eval_runtime': 5.0383, 'eval_samples_per_second': 1.985, 'eval_steps_per_second': 0.397, 'epoch': 3.0}]


In [38]:
eval_losses = [0.9471, 0.8099, 0.5601, 0.4720, 0.2089]
mean_loss = sum(eval_losses) / len(eval_losses)
print(f"Média do eval_loss: {mean_loss}")

Média do eval_loss: 0.5996


In [44]:
final_training_args = TrainingArguments(
    output_dir="./final_model",
    eval_strategy="no",  # Desativar avaliação, pois a validação cruzada já foi feita
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_total_limit=1,
    logging_dir="./final_logs",
)

In [46]:
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=tokenized_dataset, 
    processing_class=tokenizer,
)


In [47]:
final_trainer.train()

  0%|          | 0/18 [1:08:15<?, ?it/s]
100%|██████████| 21/21 [06:28<00:00, 18.51s/it]

{'train_runtime': 388.7701, 'train_samples_per_second': 0.401, 'train_steps_per_second': 0.054, 'train_loss': 0.12189194134303502, 'epoch': 3.0}





TrainOutput(global_step=21, training_loss=0.12189194134303502, metrics={'train_runtime': 388.7701, 'train_samples_per_second': 0.401, 'train_steps_per_second': 0.054, 'total_flos': 41045693165568.0, 'train_loss': 0.12189194134303502, 'epoch': 3.0})

In [48]:
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

print("Modelo final treinado e salvo em './final_model'")

Modelo final treinado e salvo em './final_model'
