# Importando dados processados

In [1]:
import pickle 
import datasets
import pandas as pd
import os
import torch

In [2]:
# Show the installed version of the `datasets` library used for this run
datasets.__version__

'4.0.0'

In [3]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open("./my_data.pickle", "rb") as pickle_file:
    # Three objects were pickled back-to-back into the same file; read them in order
    split_train, split_eval, split_test = (pickle.load(pickle_file) for _ in range(3))

In [4]:
# Convert every split to a pandas DataFrame
train_df, eval_df, test_df = (
    split.to_pandas() for split in (split_train, split_eval, split_test)
)

# Exploração dos dados

In [5]:
# List the columns available in the training split
train_df.columns

Index(['hash_id', 'tema_id', 'tema', 'palavras_chave', 'uf_tema_info',
       'uf_pesquisador', 'nome_programa', 'sigla_entidade_ensino',
       'nome_producao', 'nome_subtipo_producao', 'nome_area_concentracao',
       'nome_linha_pesquisa', 'nome_projeto', 'descricao_palavra_chave',
       'descricao_resumo', 'descricao_abstract', 'descricao_keyword',
       'data_titulacao', 'nome_grau_academico',
       'nome_grande_area_conhecimento', 'nome_area_conhecimento',
       'nome_subarea_conhecimento', 'nome_especialidade', 'modelo_nivel',
       'modelo_explicacao', '__index_level_0__'],
      dtype='object')

In [6]:
# Number of distinct values in `tema` versus the number of training rows
train_df['tema'].nunique(), len(train_df)

(437, 33620)

In [7]:
# Share of training rows per `modelo_nivel` class (counts divided by the number of rows)
train_df['modelo_nivel'].value_counts()/len(train_df)

modelo_nivel
BAIXA    0.478614
MEDIA    0.308418
ALTA     0.212968
Name: count, dtype: float64

In [8]:
# TODO: remover
# train_df = train_df.sample(5000, random_state=2025) # para teste

# Preparando inputs para o ModernBert

In [9]:
# Drop every training row that has a missing value in any column.
# Only the training split is filtered here; eval_df and test_df are left as loaded.
train_df = train_df.dropna(axis=0, how="any")

In [10]:
def _join_keywords(keywords):
    """Turn one `palavras_chave` cell into a single comma-separated string.

    Values that are already strings are returned unchanged, so re-running this
    cell is harmless (the original version raised AttributeError on a second
    run because `str` has no `.tolist()`).
    """
    if isinstance(keywords, str):
        return keywords
    items = keywords.tolist() if hasattr(keywords, "tolist") else list(keywords)
    return ", ".join(items)


# Same transformation for every split, written once instead of three times
for split_df in (train_df, eval_df, test_df):
    split_df['palavras_chave'] = split_df['palavras_chave'].apply(_join_keywords)

In [11]:
# Wider candidate set of input columns, kept for reference:
# input_cols = ['nome_producao', 'nome_area_concentracao', 'nome_linha_pesquisa', 'nome_projeto', 'descricao_palavra_chave', 'descricao_resumo', 'tema', 'palavras_chave']
input_cols = [
    'nome_projeto',
    'descricao_palavra_chave',
    'descricao_resumo',
    'tema',
    'palavras_chave',
]
# Human-readable label for each column: underscores become spaces, first letter upper-cased
input_cols_mapping = dict(
    zip(input_cols, (name.replace('_', ' ').capitalize() for name in input_cols))
)
input_cols_mapping

{'nome_projeto': 'Nome projeto',
 'descricao_palavra_chave': 'Descricao palavra chave',
 'descricao_resumo': 'Descricao resumo',
 'tema': 'Tema',
 'palavras_chave': 'Palavras chave'}

In [12]:
def create_input_from_row(row: pd.Series, cols_mapping: "dict[str, str] | None" = None) -> str:
  """Build the text fed to the model from one DataFrame row.

  One line is emitted per mapped column, in mapping order, in the form
  ``"<label>: <value>\\n"``. The value goes through ``str.capitalize()``,
  which upper-cases the first character and lower-cases the rest.

  Args:
    row: Row holding at least the columns named in the mapping.
    cols_mapping: Optional ``{column: label}`` mapping. When omitted, the
      module-level ``input_cols_mapping`` is used (the original behaviour).

  Returns:
    The concatenated lines. A missing value (``None``/NaN) contributes an
    empty value instead of raising; other non-string values are converted
    with ``str()``.
  """
  mapping = input_cols_mapping if cols_mapping is None else cols_mapping

  lines = []
  for col, formated_name in mapping.items():
    value = row[col]
    if isinstance(value, str):
      text = value
    elif value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
      # Missing field: keep the label, leave the value empty
      text = ""
    else:
      text = str(value)
    lines.append(f"{formated_name}: {text.capitalize()}\n")

  return "".join(lines)

In [13]:
# Render the model input text for every row of every split
for split_df in (train_df, eval_df, test_df):
    split_df["bert_raw_inputs"] = [
        create_input_from_row(record) for _, record in split_df.iterrows()
    ]

# Splits de treino e teste

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [15]:
# Fit the label encoder on the training labels only, then reuse the fitted
# encoder to transform the evaluation and test labels.
label_encoder = LabelEncoder()
train_df['modelo_nivel'] = label_encoder.fit_transform(train_df['modelo_nivel'])
for held_out_df in (eval_df, test_df):
    held_out_df['modelo_nivel'] = label_encoder.transform(held_out_df['modelo_nivel'])


# Hugging Face

In [16]:
from datasets import Dataset
from transformers import AutoTokenizer, ModernBertForSequenceClassification
from transformers import DataCollatorWithPadding

In [17]:
# One Hugging Face Dataset per split, each with a "text" and a "label" column
train_ds, eval_ds, test_ds = (
    Dataset.from_dict(
        {
            "text": frame['bert_raw_inputs'].tolist(),
            "label": frame['modelo_nivel'].tolist(),
        }
    )
    for frame in (train_df, eval_df, test_df)
)

In [18]:
# Baseline GPU measurement taken before the model is loaded.
# Statement order matters: clear the cache and peak counters first, then read.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
memory_before = torch.cuda.memory_allocated()  # bytes; divided by 1024**2 below to print MB
print(f"Memória usada antes de carregar modelo: {memory_before/1024**2:.2f} MB")

Memória usada antes de carregar modelo: 0.00 MB


In [19]:
MAX_LEN = 1024                               # maximum number of tokens per input (maybe try 2048)
FREEZE_ENCODER = False                       # when True, a later cell freezes the encoder parameters
MODEL_NAME = "answerdotai/ModernBERT-base"   # checkpoint id; kept separate from the `model` object
NUM_LABELS = len(label_encoder.classes_)     # number of classes seen by the fitted label encoder
# Same device rule as the one used for the class-weight tensor further down
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# truncation_side='left': over-long inputs lose tokens from the beginning, keeping the end
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, truncation_side='left')
model = ModernBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    max_position_embeddings=MAX_LEN,
).to(DEVICE)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Measured right after loading the model (no batch or epoch has run at this point)
memory_after = torch.cuda.memory_allocated()
print(f"Memória atualmente alocada: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"Máxima memória usada até agora: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
# Difference against the baseline captured before the model was loaded
print(f"Memória usada pelo modelo: {(memory_after-memory_before)/1024**2:.2f} MB")

Memória atualmente alocada: 590.40 MB
Máxima memória usada até agora: 590.40 MB
Memória usada pelo modelo: 590.40 MB


In [21]:
# Show which device the model parameters live on
model.device

device(type='cuda', index=0)

In [22]:
# Display the module tree of the loaded classifier
model

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [23]:
# Optionally freeze the encoder (`model.model`): its parameters stop receiving gradients
if FREEZE_ENCODER:
    print(f"- Freezing encoder layers")
    model.model.requires_grad_(False)

# Analisando número de tokens no conjunto de dados inteiro

In [24]:
def get_num_tokens(text: str) -> int:
  """Return how many token ids `tokenizer.encode` produces for `text`."""
  token_ids = tokenizer.encode(text)
  return len(token_ids)

In [25]:
# Fixed-seed random sample of 1000 training rows, used only for the token-length statistics
sampled_df = train_df.sample(n=1000, random_state=2025)
# sampled_df = dados_df

In [26]:
# Token count of every sampled input text
sampled_df["num_tokens"] = sampled_df["bert_raw_inputs"].map(get_num_tokens)

In [27]:
# Summary statistics of the token counts in the sample
sampled_df["num_tokens"].describe()

count    1000.000000
mean      910.950000
std       270.799701
min       241.000000
25%       731.750000
50%       876.000000
75%      1068.500000
max      4304.000000
Name: num_tokens, dtype: float64

In [28]:
# NOTE(review): this cell repeats the previous one and shows the same summary
sampled_df["num_tokens"].describe()

count    1000.000000
mean      910.950000
std       270.799701
min       241.000000
25%       731.750000
50%       876.000000
75%      1068.500000
max      4304.000000
Name: num_tokens, dtype: float64

In [29]:
# 70th percentile of the sampled token counts, for comparison against MAX_LEN
sampled_df["num_tokens"].quantile(0.70)

np.float64(1021.3)

# Pré-processamento de Datasets

In [30]:
from transformers import DataCollatorWithPadding

In [31]:
def tokenizer_function(example):
  """Tokenize `example["text"]`, truncating to at most MAX_LEN tokens.

  No padding argument is passed here; padding is left to the data collator.
  For a text-pair classification task, change what is passed to the tokenizer.
  To truncate AND pad to a fixed size instead, use
  truncation=True, padding="max_length", max_length=<n>.
  """
  texts = example["text"]
  return tokenizer(texts, truncation=True, max_length=MAX_LEN)

In [32]:
# Tokenize every example of every split, in batches
train_ds, eval_ds, test_ds = (
    split.map(tokenizer_function, batched=True)
    for split in (train_ds, eval_ds, test_ds)
)

Map:   0%|          | 0/33620 [00:00<?, ? examples/s]

Map:   0%|          | 0/4203 [00:00<?, ? examples/s]

Map:   0%|          | 0/4203 [00:00<?, ? examples/s]

In [33]:
# Keep only columns whose names match the arguments of the transformer's forward method:
# drop the raw text, rename "label" to "labels", and return torch tensors.
train_ds, eval_ds, test_ds = (
    split.remove_columns(column_names=["text"])
    .rename_column("label", "labels")
    .with_format("torch")
    for split in (train_ds, eval_ds, test_ds)
)

In [34]:
# Inspect the remaining features and the row count of the training dataset
train_ds

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 33620
})

In [35]:
# Collator for dynamic padding: padding="longest" pads to the longest sequence in each batch
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# Implementando trainer com loss customizada

In [36]:
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
from typing import Any, Optional, Union

In [37]:
# Training rows per encoded class, ordered by class id 0, 1, 2
train_df['modelo_nivel'].value_counts().loc[[0,1,2]]

modelo_nivel
0     7160
1    16091
2    10369
Name: count, dtype: int64

In [38]:
# Inverse-frequency class weights (written for exactly the 3 encoded classes 0, 1, 2)
weights_device = "cuda" if torch.cuda.is_available() else "cpu"
per_class_counts = train_df['modelo_nivel'].value_counts().loc[[0,1,2]].values
class_counts = torch.tensor(per_class_counts, dtype=torch.float, device=weights_device)
inverse_frequency = 1.0 / class_counts
# Rescale so that the weights add up to the number of classes
class_weights = inverse_frequency / inverse_frequency.sum() * len(class_counts)

In [39]:
# Assuming the examples per class have already been counted
# class_counts = [100, 300, 50]  # example for 3 classes
# class_weights = [1.0 / c for c in class_counts]  # inversely proportional to the frequency
# class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda")

# Cross-entropy weighted per class with the `class_weights` tensor
loss_fn = nn.CrossEntropyLoss(weight=class_weights)


class WeightedTrainer(Trainer):
    """Trainer whose loss is computed with the module-level `loss_fn`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch: Optional[torch.Tensor] = None):
        """Compute the loss for one batch with `loss_fn`.

        The "labels" entry is removed from `inputs` before the forward pass, so
        the model is called without labels and the loss comes from `loss_fn`
        applied to the logits. `num_items_in_batch` is accepted but not used.

        Returns `(loss, outputs)` when `return_outputs` is true, else `loss`.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)
        weighted_loss = loss_fn(model_outputs.logits, targets)  # loss_fn is defined at module level

        if return_outputs:
            return (weighted_loss, model_outputs)
        return weighted_loss

# Configurando Trainer

In [40]:
from transformers import get_cosine_schedule_with_warmup, get_constant_schedule
from torch.optim import AdamW
from transformers import EarlyStoppingCallback
from codecarbon import EmissionsTracker # para calcular emissões de CO2
from transformers import set_seed
import math
import torch
import time

In [41]:
# Run configuration. Several of these values are interpolated into `output_dir`,
# so changing them also changes where checkpoints are written.
balanced_loss = False  # True -> WeightedTrainer (class-weighted loss); False -> plain Trainer
bs = 32                # per-device batch size (train and eval)
acc_steps = 1          # gradient accumulation steps
epochs = 1
lr = 2e-4
wd = 1e-3              # weight decay
patience = 2           # early-stopping patience; also drives save_total_limit
seed = 42
output_dir = f"./models_outputs/modern_bert_{epochs}epochs_{bs}bs_{lr}lr_{wd}wd_{acc_steps}accsteps/"
train_code_carbon_out = 'modernbert_fine_tuning_emissions_training.csv'
inference_code_carbon_out = 'modernbert_fine_tuning_emissions_inference.csv'

# NOTE(review): in the cells shown, warmup_steps and train_steps are only referenced by the
# commented-out optimizer/scheduler code; TrainingArguments uses warmup_ratio instead.
warmup_steps = math.ceil((len(train_ds)/bs) * epochs * 0.1) # 10% of the training steps for warm-up
train_steps = int(epochs * len(train_ds)/bs)
es_callback = EarlyStoppingCallback(early_stopping_patience=patience)

In [42]:
# Show the computed warm-up and total step counts
warmup_steps, train_steps

(106, 1050)

In [43]:
import numpy as np
from sklearn.metrics import classification_report
# from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are the macro averages from scikit-learn's
        classification report.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields the same numbers as the default ("warn"), which also
    # scores undefined metrics as 0, but without emitting a warning for every
    # class that is never predicted.
    metrics_dict = classification_report(
        labels, predictions, output_dict=True, zero_division=0
    )
    macro_avg = metrics_dict["macro avg"]

    return {
        "accuracy": metrics_dict["accuracy"],
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f1": macro_avg["f1-score"]
    }


In [44]:
# optimizer = AdamW(model.parameters(), lr=lr)
# # scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps, num_cycles=0.5)
# scheduler = get_constant_schedule(optimizer) # testar este tbm

In [45]:
# Seed via transformers.set_seed.
# NOTE(review): the model, including its newly initialised classifier head, was created in an
# earlier cell, before this call, so this seed does not cover that initialisation.
set_seed(seed)

In [46]:
training_args = TrainingArguments(
    # --- outputs and checkpoints ---
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",       # directory for storing logs
    save_strategy="epoch",                  # save a checkpoint at the end of every epoch
    save_total_limit=patience+1,            # maximum number of checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,                # lower loss is better
    # --- training / evaluation loop ---
    num_train_epochs=epochs,                # total number of training epochs
    per_device_train_batch_size=bs,         # batch size per device during training
    per_device_eval_batch_size=bs,          # batch size for evaluation
    gradient_accumulation_steps=acc_steps,
    eval_strategy="epoch",                  # evaluate at the end of every epoch
    # --- optimisation ---
    learning_rate=lr,
    weight_decay=wd,
    warmup_ratio=0.03,                      # used instead of warmup_steps=warmup_steps
    bf16=True,
    # --- logging and reproducibility ---
    logging_strategy='steps',
    logging_steps=100,
    log_level="info",
    report_to='none',
    seed=seed,
)


In [47]:
# Fraction of the train/eval datasets handed to the Trainer.
# 0.02 reproduces the quick smoke-test run recorded in this notebook; set it to 1.0
# for a full training run. The same fraction now applies to BOTH loss variants
# (previously only the standard-loss branch was silently cut down to 2%).
SUBSET_FRACTION = 0.02


def _take_fraction(dataset, fraction):
    """Return the first `fraction` of `dataset`, or `dataset` itself when fraction >= 1."""
    if fraction >= 1:
        return dataset
    return dataset.select(range(int(len(dataset) * fraction)))


# Only the Trainer class differs between the two variants
if balanced_loss:
    print(f"Using balanced loss with class weights: {class_weights}")
    trainer_cls = WeightedTrainer
else:
    print(f"Using standard loss")
    trainer_cls = Trainer

trainer = trainer_cls(
    model,
    args=training_args,
    train_dataset=_take_fraction(train_ds, SUBSET_FRACTION),
    eval_dataset=_take_fraction(eval_ds, SUBSET_FRACTION),
    compute_metrics=compute_metrics,
    # optimizers=(optimizer, scheduler),
    data_collator=collator
)

trainer.add_callback(es_callback)

Using auto half precision backend


Using standard loss


In [48]:
# Confirm which device the model is on before training starts
print( model.device )

cuda:0


In [49]:
# Reset the cache and the peak counters so the numbers below refer to this training run
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
memory_before = torch.cuda.memory_allocated()
print(f"Memória ANTE DO treino: {memory_before/1024**2:.2f} MB")
trainer.train()
# Allocation still held after training; the peak is reported separately below
memory_after = torch.cuda.memory_allocated()
print(f"Memória atualmente alocada: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"Máxima memória usada até agora: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
print(f"Memória usada no treinamento: {(memory_after-memory_before)/1024**2:.2f} MB")



***** Running training *****
  Num examples = 672
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 21
  Number of trainable parameters = 149,607,171


Memória ANTE DO treino: 590.41 MB


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.00558,0.52381,0.174603,0.333333,0.229167



***** Running Evaluation *****
  Num examples = 84
  Batch size = 32
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Saving model checkpoint to ./models_outputs/modern_bert_1epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-21
Configuration saved in ./models_outputs/modern_bert_1epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-21/config.json
Model weights saved in ./models_outputs/modern_bert_1epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-21/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./models_outputs/modern_bert_1epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-21/tokenizer_config.json
Special tokens file saved in ./models_outputs/modern_bert_1epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-21/special_tok

Memória atualmente alocada: 1758.57 MB
Máxima memória usada até agora: 21306.83 MB
Memória usada no treinamento: 1168.16 MB


# Avaliação final

In [50]:
from sklearn.metrics import classification_report

In [51]:
# trainer._load_from_checkpoint("/exp/kenzosaki/data/LeanDL-HPC/models/modern_bert_10epochs_32bs_0.0002lr_0.001wd_1accsteps/checkpoint-6306")

In [53]:
# Reset the cache and the peak counters so the numbers below refer to this prediction run
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
memory_before = torch.cuda.memory_allocated()
print(f"Memória usada antes: {memory_before/1024**2:.2f} MB")

# Only the first 100 test examples are scored here
preds = trainer.predict(test_ds.select(range(100)))

# Re-read the allocation AFTER predicting. The original reused the `memory_after`
# left over from the training cell, so the delta below did not measure this
# prediction run, and the label said "treinamento" for an inference measurement.
memory_after = torch.cuda.memory_allocated()
print(f"Memória atualmente alocada: {memory_after/1024**2:.2f} MB")
print(f"Máxima memória usada até agora: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
print(f"Memória usada na inferência: {(memory_after-memory_before)/1024**2:.2f} MB")



***** Running Prediction *****
  Num examples = 100
  Batch size = 32


Memória usada antes: 1758.57 MB


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Memória atualmente alocada: 1758.57 MB
Máxima memória usada até agora: 2665.08 MB
Memória usada no treinamento: 0.00 MB
