In [1]:
!pip freeze > requirements.txt

In [2]:
!pip install -r requirements.txt
!pip install datasets==2.15.0

Collecting cudf-cu12@ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (from -r requirements.txt (line 71))
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en_core_web_sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 (from -r requirements.txt (line 110))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hProcessing /colabtools/dist/google_colab-1.0.

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [4]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch
import os

In [5]:
dataset = load_dataset("json", data_files={"treino": "/content/treino.jsonl", "teste": "/content/teste.jsonl"})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating treino split: 0 examples [00:00, ? examples/s]

Generating teste split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    treino: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 100
    })
    teste: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 100
    })
})

In [7]:
checkpoint = "bert-base-uncased"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

mapDict = {
    "suporte": 0,
    "venda": 1
}

def transform_labels(label):
  label = label["completion"]
  result = [] # Use .get() with a default to handle potential missing keys
  for l in label:
    result.append(mapDict[l])
  return {"label": result}


def tokenize_data(example):
  # Assuming the text to be classified is in the 'completion' column
  return tokenizer(example["prompt"], padding=True, truncation=True)

In [13]:
tokenized_datasets = dataset.map(tokenize_data, batched=True)
tokenized_datasets = tokenized_datasets.map(transform_labels, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
output_dir = "./bert-Sales-Challenge-Model-Test"

training_args = TrainingArguments(
    output_dir=output_dir,
    report_to='none'
)

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) # Ajustado para 2 classes

# Adicionar o mapeamento de ID para rótulo à configuração do modelo
# Invertemos o mapDict para ter {id: label_name}
id2label = {v: k for k, v in mapDict.items()}
model.config.id2label = id2label
model.config.label2id = mapDict # Opcional, mas útil ter o mapeamento inverso também

os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [17]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["treino"],
    eval_dataset=tokenized_datasets["teste"], # Alterado para usar o conjunto de teste para avaliação
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=39, training_loss=0.39418641114846253, metrics={'train_runtime': 135.2291, 'train_samples_per_second': 2.218, 'train_steps_per_second': 0.288, 'total_flos': 4008332484000.0, 'train_loss': 0.39418641114846253, 'epoch': 3.0})

In [19]:
trainer.evaluate()



{'eval_loss': 0.1111655980348587,
 'eval_accuracy': 0.99,
 'eval_runtime': 9.192,
 'eval_samples_per_second': 10.879,
 'eval_steps_per_second': 1.414,
 'epoch': 3.0}

In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
trainer.push_to_hub("LuaxSantos/SalesChallengeModel-Finetuning")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...llenge-Model-Test/model.safetensors:   1%|1         | 6.54MB /  438MB            

  ...llenge-Model-Test/training_args.bin:  15%|#5        |   892B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/LuaxSantos/bert-Sales-Challenge-Model-Test/commit/14a412ba8c155af15482e119adce9ed2ef8edbc3', commit_message='LuaxSantos/SalesChallengeModel-Finetuning', commit_description='', oid='14a412ba8c155af15482e119adce9ed2ef8edbc3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/LuaxSantos/bert-Sales-Challenge-Model-Test', endpoint='https://huggingface.co', repo_type='model', repo_id='LuaxSantos/bert-Sales-Challenge-Model-Test'), pr_revision=None, pr_num=None)

In [22]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="LuaxSantos/bert-Sales-Challenge-Model-Test")

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [25]:
pipe("quero comprar uma nova TV")

[{'label': 'venda', 'score': 0.8323453068733215}]