# Importando dados processados

In [1]:
import pickle 
import datasets
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasets.__version__

'4.0.0'

In [3]:
with open("/exp/kenzosaki/data/LeanDL-HPC/my_data.pickle", "rb") as file:
    split_train = pickle.load(file)
    split_eval = pickle.load(file)
    split_test = pickle.load(file)

In [4]:
train_df = split_train.to_pandas()
eval_df = split_eval.to_pandas()
test_df = split_test.to_pandas()

# Exploração dos dados

In [5]:
train_df.columns

Index(['hash_id', 'tema_id', 'tema', 'palavras_chave', 'uf_tema_info',
       'uf_pesquisador', 'nome_programa', 'sigla_entidade_ensino',
       'nome_producao', 'nome_subtipo_producao', 'nome_area_concentracao',
       'nome_linha_pesquisa', 'nome_projeto', 'descricao_palavra_chave',
       'descricao_resumo', 'descricao_abstract', 'descricao_keyword',
       'data_titulacao', 'nome_grau_academico',
       'nome_grande_area_conhecimento', 'nome_area_conhecimento',
       'nome_subarea_conhecimento', 'nome_especialidade', 'modelo_nivel',
       'modelo_explicacao', '__index_level_0__'],
      dtype='object')

In [6]:
train_df['tema'].nunique(), len(train_df)

(437, 33620)

In [7]:
train_df['modelo_nivel'].value_counts()/len(train_df)

modelo_nivel
BAIXA    0.478614
MEDIA    0.308418
ALTA     0.212968
Name: count, dtype: float64

In [8]:
# TODO: remover
# train_df = train_df.sample(1000, random_state=2025) # para teste

# Preparando inputs para o ModernBert

In [9]:
train_df = train_df.dropna()

In [10]:
train_df['palavras_chave'] = train_df['palavras_chave'].apply(lambda array: ", ".join(array.tolist()))
eval_df['palavras_chave'] = eval_df['palavras_chave'].apply(lambda array: ", ".join(array.tolist()))
test_df['palavras_chave'] = test_df['palavras_chave'].apply(lambda array: ", ".join(array.tolist()))

In [11]:
# input_cols = ['nome_producao', 'nome_area_concentracao', 'nome_linha_pesquisa', 'nome_projeto', 'descricao_palavra_chave', 'descricao_resumo', 'tema', 'palavras_chave']
input_cols = ['nome_projeto', 'descricao_palavra_chave', 'descricao_resumo', 'tema', 'palavras_chave']
input_cols_mapping = {col: col.replace('_', ' ').capitalize() for col in input_cols}
input_cols_mapping

{'nome_projeto': 'Nome projeto',
 'descricao_palavra_chave': 'Descricao palavra chave',
 'descricao_resumo': 'Descricao resumo',
 'tema': 'Tema',
 'palavras_chave': 'Palavras chave'}

In [12]:
def create_input_from_row(row: pd.Series) -> str:
  input_str = ""

  for col, formated_name in input_cols_mapping.items():
    input_str += f"{formated_name}: {row[col].capitalize()}\n"

  return input_str

In [13]:
train_df["bert_raw_inputs"] = [create_input_from_row(row) for _, row in train_df.iterrows()]
eval_df["bert_raw_inputs"] = [create_input_from_row(row) for _, row in eval_df.iterrows()]
test_df["bert_raw_inputs"] = [create_input_from_row(row) for _, row in test_df.iterrows()]

# Splits de treino e teste

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [15]:
label_encoder = LabelEncoder()
train_df['modelo_nivel'] = label_encoder.fit_transform(train_df['modelo_nivel'])
eval_df['modelo_nivel'] = label_encoder.transform(eval_df['modelo_nivel'])
test_df['modelo_nivel'] = label_encoder.transform(test_df['modelo_nivel'])


# Hugginface

In [16]:
from datasets import Dataset
from transformers import AutoTokenizer, ModernBertForSequenceClassification
from transformers import DataCollatorWithPadding

In [17]:
train_ds = Dataset.from_dict(
    {
        "text": train_df['bert_raw_inputs'].tolist(),
        "label": train_df['modelo_nivel'].tolist()
    }
)

eval_ds = Dataset.from_dict(
    {
        "text": eval_df['bert_raw_inputs'].tolist(),
        "label": eval_df['modelo_nivel'].tolist()
    }
)

test_ds = Dataset.from_dict(
    {
        "text": test_df['bert_raw_inputs'].tolist(),
        "label": test_df['modelo_nivel'].tolist()
    }
)

In [18]:
MAX_LEN = 1024
model = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model, truncation_side='left')
model = ModernBertForSequenceClassification.from_pretrained(model, num_labels=3, max_position_embeddings=MAX_LEN) # talvez 2048

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

# Analisando número de tokens no conjunto de dados inteiro

In [20]:
def get_num_tokens(text: str) -> int:
  return len(tokenizer.encode(text))

In [21]:
sampled_df = train_df.sample(1000, random_state=2025)
# sampled_df = dados_df

In [22]:
sampled_df["num_tokens"] = sampled_df["bert_raw_inputs"].apply(get_num_tokens)

In [23]:
sampled_df["num_tokens"].describe()

count    1000.000000
mean      910.950000
std       270.799701
min       241.000000
25%       731.750000
50%       876.000000
75%      1068.500000
max      4304.000000
Name: num_tokens, dtype: float64

In [24]:
sampled_df["num_tokens"].describe()

count    1000.000000
mean      910.950000
std       270.799701
min       241.000000
25%       731.750000
50%       876.000000
75%      1068.500000
max      4304.000000
Name: num_tokens, dtype: float64

In [25]:
sampled_df["num_tokens"].quantile(0.70)

np.float64(1021.3)

# Pre-processamento de Datasets

In [26]:
from transformers import DataCollatorWithPadding

In [27]:
def tokenizer_function(example):
  # Em caso de uma tarefa de classificação de pares de texto, modificar este valor de retorno
  # truncation=True, padding="max_length", max_length=123 para truncar e padronizar os tamanhos de tokens!!!
  return tokenizer(
      example["text"], truncation=True, max_length=MAX_LEN
  )

In [28]:
# Tokenizando todos os elementos do conjunto de dados em batches
train_ds = train_ds.map(tokenizer_function, batched=True)
eval_ds = eval_ds.map(tokenizer_function, batched=True)
test_ds = test_ds.map(tokenizer_function, batched=True)

Map: 100%|██████████| 33620/33620 [00:06<00:00, 5174.78 examples/s]
Map: 100%|██████████| 4203/4203 [00:00<00:00, 5172.55 examples/s]
Map: 100%|██████████| 4203/4203 [00:00<00:00, 5345.13 examples/s]


In [29]:
# Mantendo apenas colunas cujos nomes coincidam com os métodos forward dos transformers
train_ds = train_ds.remove_columns(column_names=["text"])
train_ds = train_ds.rename_column("label", "labels")
train_ds = train_ds.with_format("torch")

eval_ds = eval_ds.remove_columns(column_names=["text"])
eval_ds = eval_ds.rename_column("label", "labels")
eval_ds = eval_ds.with_format("torch")

test_ds = test_ds.remove_columns(column_names=["text"])
test_ds = test_ds.rename_column("label", "labels")
test_ds = test_ds.with_format("torch")

In [30]:
train_ds

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 33620
})

In [31]:
# Collator para Dynamic Padding
collator = DataCollatorWithPadding(tokenizer, padding="longest")

In [32]:
tokenizer.decode(train_ds[0]['input_ids'])

'[CLS]idismo cultural. o objetivo da pesquisa foi elaborar heurísticas para a criação do produto de moda para sustentabilidade com preceitos do hibridismo cultural. para isso, pesquisou-se por meio de revisões narrativas e sistemáticas a fim de encontrar os principais conceitos, ênfases e lacunas das temáticas que permeiam o estudo. os resultados obtidos das revisões foram trabalhos que predominantemente discutem as temáticas de maneira separada, ou enfocam apenas em análises de construção do vestuário, ou discorrem sobre a percepção do hibridismo na moda, ou, ainda, expõem cenários para aplicação de projetos sustentáveis pela moda. portanto, encontrou-se uma interseção entre as temáticas a serem exploradas. para tanto, utilizou-se o método heurístico, que busca no conhecimento tácito os indícios para a emersão do conhecimento sistematizado. as etapas do método constituem os ciclos heurísticos, onde na observação de processos criativos da moda autoral buscou-se os dados a serem analisa

# Implementando trainer com loss customizada

In [33]:
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
from typing import Any, Optional, Union

In [34]:
train_df['modelo_nivel'].value_counts().loc[[0,1,2]]

modelo_nivel
0     7160
1    16091
2    10369
Name: count, dtype: int64

In [35]:
# inverso da frequência (ajustado para só 3 classes)
class_counts = torch.tensor(train_df['modelo_nivel'].value_counts().loc[[0,1,2]].values, dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum() * len(class_counts)

In [36]:
# Supondo que você já tenha contado os exemplos por classe
# class_counts = [100, 300, 50]  # Exemplo para 3 classes
# class_weights = [1.0 / c for c in class_counts]  # inversamente proporcional à frequência
# class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda")

loss_fn = nn.CrossEntropyLoss(weight=class_weights)


class WeightedTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = loss_fn(logits, labels) # usa a loss_fn definida acima

        return (loss, outputs) if return_outputs else loss

# Configurando Trainer

In [37]:
from transformers import get_cosine_schedule_with_warmup, get_constant_schedule
from torch.optim import AdamW
from transformers import EarlyStoppingCallback
from codecarbon import EmissionsTracker # para calcular emissões de CO2
from transformers import set_seed
import math
import torch
import time

In [None]:
balanced_loss = True
bs = 32
acc_steps = 1
epochs = 10
lr = 2e-4
wd = 1e-3
patience = 2
seed = 42
crit = 'f1'
evals_per_epoch = 1
output_dir = f"/exp/kenzosaki/data/LeanDL-HPC/models/balanced_modern_bert_{epochs}epochs_{bs}bs_{lr}lr_{wd}wd_{acc_steps}accsteps/"
train_code_carbon_out = 'balanced_modernbert_fine_tuning_emissions_training.csv'
inference_code_carbon_out = 'balanced_modernbert_fine_tuning_emissions_inference.csv'

warmup_steps = math.ceil((len(train_ds)/bs) * epochs * 0.1) #10% of train data for warm-up
train_steps = int(epochs * len(train_ds)/bs)
es_callback = EarlyStoppingCallback(early_stopping_patience=patience)

final_bs = bs * acc_steps
epoch_steps = math.ceil(len(train_ds)/final_bs)
train_steps = int(epochs * epoch_steps)
eval_steps = int(epoch_steps / evals_per_epoch) # evaluating every 10% of the train data

In [39]:
warmup_steps, train_steps

(1051, 10510)

In [40]:
import numpy as np
from sklearn.metrics import classification_report
# from datasets import load_metric

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metrics_dict = classification_report(labels, predictions, output_dict=True)

    return {
        "accuracy": metrics_dict["accuracy"],
        "precision": metrics_dict["macro avg"]["precision"],
        "recall": metrics_dict["macro avg"]["recall"],
        "f1": metrics_dict["macro avg"]["f1-score"]
    }


In [41]:
# optimizer = AdamW(model.parameters(), lr=lr)
# # scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps, num_cycles=0.5)
# scheduler = get_constant_schedule(optimizer) # testar este tbm

In [None]:
set_seed(seed)

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,                                                                              # total number of training epochs
    per_device_train_batch_size=bs,                                                                       # batch size per device during training
    per_device_eval_batch_size=bs,                                                                        # batch size for evaluation
    # warmup_steps=warmup_steps,                                                                            # number of warmup steps for learning rate scheduler
    eval_strategy="steps",
    save_strategy="steps",                                                                                # evaluation interval
    logging_dir=f"{output_dir}/logs",                                                                                 # directory for storing logs
    save_total_limit=patience+1,                                                                              # checkpoint save interval
    report_to='none',
    gradient_accumulation_steps=acc_steps,
    metric_for_best_model=crit,
    greater_is_better=False,
    load_best_model_at_end=True,
    bf16=True,
    learning_rate=lr,
    weight_decay=wd,
    warmup_ratio=0.03,
    logging_steps=100,
    logging_strategy='steps',
    log_level="info",
    eval_steps=eval_steps,
    save_steps=eval_steps,
    seed=seed
)


In [44]:
if balanced_loss:
    print(f"Using balanced loss with class weights: {class_weights}")
    trainer = WeightedTrainer(
        model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
        # optimizers=(optimizer, scheduler),
        data_collator=collator
    )
else:
    print(f"Using standard loss")
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
        # optimizers=(optimizer, scheduler),
        data_collator=collator
    )

trainer.add_callback(es_callback)

Using auto half precision backend


Using balanced loss with class weights: tensor([1.4048, 0.6251, 0.9701], device='cuda:0')


In [45]:
train_df['modelo_nivel'].value_counts().loc[[0,1,2]]

modelo_nivel
0     7160
1    16091
2    10369
Name: count, dtype: int64

In [None]:
start_time = time.time()
tracker = EmissionsTracker(output_file=train_code_carbon_out)
tracker.start()

trainer.train()

emissions = tracker.stop()
print("\n\nTotal de emissões (detalhes em emissions.csv): ",emissions)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\nTempo total de execução: {elapsed_time:.2f} segundos")

[codecarbon INFO @ 13:19:30] [setup] RAM Tracking...
[codecarbon INFO @ 13:19:30] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 13:19:31] CPU Model on constant consumption mode: Intel(R) Core(TM) i9-14900KF
[codecarbon INFO @ 13:19:31] [setup] GPU Tracking...
[codecarbon INFO @ 13:19:31] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 13:19:31] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 13:19:31] >>> Tracker's metadata:
[codecarbon INFO @ 13:19:31]   Platform system: Linux-6.14.0-28-generic-x86_64-with-glibc2.39
[codecarbon INFO @ 13:19:31]   Python version: 3.13.5
[codecarbon INFO @ 13:19:31]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 13:19:31]   Available RAM : 125.634 GB
[codecarbon INF

Step,Training Loss,Validation Loss


[codecarbon INFO @ 13:19:50] Energy consumed for RAM : 0.000164 kWh. RAM Power : 38.0 W
[codecarbon INFO @ 13:19:50] Delta energy consumed for CPU with cpu_load : 0.000055 kWh, power : 12.805211970560002 W
[codecarbon INFO @ 13:19:50] Energy consumed for All CPU : 0.000055 kWh
[codecarbon INFO @ 13:19:50] Energy consumed for all GPUs : 0.000909 kWh. Total GPU Power : 204.37357853564714 W
[codecarbon INFO @ 13:19:50] 0.001127 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:20:05] Energy consumed for RAM : 0.000317 kWh. RAM Power : 38.0 W
[codecarbon INFO @ 13:20:05] Delta energy consumed for CPU with cpu_load : 0.000052 kWh, power : 12.8053388792 W
[codecarbon INFO @ 13:20:05] Energy consumed for All CPU : 0.000107 kWh
[codecarbon INFO @ 13:20:05] Energy consumed for all GPUs : 0.001842 kWh. Total GPU Power : 224.02428039356613 W
[codecarbon INFO @ 13:20:05] 0.002265 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:20:20] Energy consumed for RAM : 0.

KeyboardInterrupt: 

[codecarbon INFO @ 13:25:05] Delta energy consumed for CPU with cpu_load : 0.000052 kWh, power : 12.8119196864 W
[codecarbon INFO @ 13:25:05] Energy consumed for All CPU : 0.001138 kWh
[codecarbon INFO @ 13:25:05] Energy consumed for all GPUs : 0.020709 kWh. Total GPU Power : 226.8982889683421 W
[codecarbon INFO @ 13:25:05] 0.025224 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:25:20] Energy consumed for RAM : 0.003530 kWh. RAM Power : 38.0 W
[codecarbon INFO @ 13:25:20] Delta energy consumed for CPU with cpu_load : 0.000052 kWh, power : 12.801489032000001 W
[codecarbon INFO @ 13:25:20] Energy consumed for All CPU : 0.001190 kWh
[codecarbon INFO @ 13:25:20] Energy consumed for all GPUs : 0.021044 kWh. Total GPU Power : 80.40982664245486 W
[codecarbon INFO @ 13:25:20] 0.025763 kWh of electricity used since the beginning.
[codecarbon INFO @ 13:25:35] Energy consumed for RAM : 0.003683 kWh. RAM Power : 38.0 W
[codecarbon INFO @ 13:25:35] Delta energy consumed for CPU 

: 

# Avaliação final

In [None]:
from sklearn.metrics import classification_report

In [None]:
tracker = EmissionsTracker(output_file=inference_code_carbon_out)
tracker.start()
preds = trainer.predict(test_ds)
emissions = tracker.stop()
print(f"- Emissões durante a inferência: {emissions}")


***** Running Prediction *****
  Num examples = 4203
  Batch size = 32


In [None]:
label_encoder.classes_

array(['ALTA', 'BAIXA', 'MEDIA'], dtype=object)

In [None]:
print(classification_report(test_ds['labels'], preds.predictions.argmax(axis=-1), target_names=label_encoder.classes_))

              precision    recall  f1-score   support

        ALTA       0.41      0.70      0.51       863
       BAIXA       0.65      0.53      0.58      2056
       MEDIA       0.36      0.28      0.32      1284

    accuracy                           0.49      4203
   macro avg       0.47      0.51      0.47      4203
weighted avg       0.51      0.49      0.49      4203



In [None]:
print(classification_report(test_ds['labels'], preds.predictions.argmax(axis=-1)))

              precision    recall  f1-score   support

           0       0.41      0.70      0.51       863
           1       0.65      0.53      0.58      2056
           2       0.36      0.28      0.32      1284

    accuracy                           0.49      4203
   macro avg       0.47      0.51      0.47      4203
weighted avg       0.51      0.49      0.49      4203

