In [None]:
!pip install transformers datasets

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import classification_report


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00

In [None]:
# Load the pretrained BERT tokenizer and a sequence-classification head on top
# of the same checkpoint. The classification head is freshly initialized, so
# the model must be fine-tuned before its predictions are meaningful.
# NOTE(review): this is an English, uncased checkpoint; if the lyrics are not
# in English, a multilingual or language-specific checkpoint may fit better.
MODEL_NAME = "bert-large-uncased"
NUM_LABELS = 5  # one class per liturgical season
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the liturgical-songs dataset.
import json
from google.colab import files

# Upload the dataset file through the Colab file picker.
uploaded = files.upload()
# JSON is UTF-8 by specification; be explicit so accented characters in the
# lyrics are decoded correctly regardless of the runtime's default locale.
with open('/content/dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Split into training and validation sets. Stratifying on the label keeps the
# class proportions equal in both splits, which matters for a small dataset
# (requires at least two examples per class).
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=[c["tempo"] for c in dataset],
)


def to_hf_dataset(records):
    """Build a Hugging Face ``Dataset`` with ``text``/``label`` columns.

    Each record is expected to be a mapping with a ``"lyric"`` entry (the
    song text) and a ``"tempo"`` entry (the liturgical-season label).
    """
    return Dataset.from_dict({
        "text": [c["lyric"] for c in records],
        "label": [c["tempo"] for c in records],
    })


# Convert both splits to the Hugging Face Dataset format.
train_dataset = to_hf_dataset(train_data)
val_dataset = to_hf_dataset(val_data)


Saving dataset.json to dataset.json


In [None]:
# Map the liturgical seasons to integer class ids.
label_mapping = {"natal": 0, "pascoa": 1, "quaresma": 2, "comum": 3, "advento": 4}


def encode_label(example):
    """Replace an example's string label with its integer class id.

    Labels that are already valid class ids are passed through unchanged so
    that re-running this cell on already-encoded datasets does not fail.

    Raises:
        KeyError: if the label is neither a known season name nor a valid id.
    """
    label = example["label"]
    if isinstance(label, int) and label in label_mapping.values():
        return {"label": label}
    return {"label": label_mapping[label]}


train_dataset = train_dataset.map(encode_label)
val_dataset = val_dataset.map(encode_label)

# Tokenization step applied to the datasets.
def preprocess_function(examples, max_length=128):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: mapping with a ``"text"`` entry holding the text(s) to
            tokenize.
        max_length: sequence length every text is truncated and padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize both splits, processing the examples in batches.
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",            # evaluate at the end of every epoch
    save_strategy="epoch",            # must match eval_strategy for load_best_model_at_end
    # Log once per epoch as well: with the default step-based logging interval
    # an epoch finishes before any training loss is logged, which is why the
    # progress table showed "No log" in the Training Loss column.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=8,
    # NOTE(review): 150 epochs is a lot for a small dataset — confirm this is
    # intentional and watch the validation loss for overfitting.
    num_train_epochs=150,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=3,               # keep only a few checkpoints on disk
    load_best_model_at_end=True,
)



In [None]:
# Metric callback used by the Trainer during evaluation.
def compute_metrics(pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        pred: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max class matches>}``.
    """
    predicted_ids = np.argmax(pred.predictions, axis=1)
    hits = predicted_ids == pred.label_ids
    return {"accuracy": hits.mean()}

# Assemble the Trainer from the model, the training configuration, both
# tokenized splits, the tokenizer and the accuracy callback.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning.
trainer.train()

# Evaluate the model on the validation split and show the metrics.
eval_results = trainer.evaluate()
print(eval_results)

# Save the trained model to disk (the directory is reloaded later for
# predictions), then release cached GPU memory held by PyTorch.
trainer.save_model("./modelo_finetuned_bert")
torch.cuda.empty_cache()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.698107,0.3
2,No log,1.651453,0.1
3,No log,1.618146,0.2
4,No log,1.59769,0.25
5,No log,1.546707,0.35
6,No log,1.525814,0.35
7,No log,1.707545,0.25
8,No log,1.463222,0.35
9,No log,1.406208,0.35
10,No log,1.456278,0.35


{'eval_loss': 1.2586781978607178, 'eval_accuracy': 0.55, 'eval_runtime': 0.5737, 'eval_samples_per_second': 34.861, 'eval_steps_per_second': 5.229, 'epoch': 150.0}


In [None]:
# Reload the fine-tuned model and tokenizer from disk to run predictions.
model_path = "./modelo_finetuned_bert"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()  # make sure dropout is disabled for inference

# Predict the label of every song in the validation split.
predictions, true_labels = [], []
# No gradients are needed for inference; skipping autograd bookkeeping saves
# memory and time on every forward pass.
with torch.no_grad():
    for example in val_dataset:
        inputs = tokenizer(
            example["text"],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128,
        )
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()

        predictions.append(predicted_class)
        true_labels.append(example["label"])

# Report per-class metrics. Passing `labels` explicitly ties each name to its
# class id, so the report stays correct (instead of failing or mislabelling
# rows) when some class is absent from the validation split.
print(classification_report(
    true_labels,
    predictions,
    labels=list(label_mapping.values()),
    target_names=list(label_mapping.keys()),
))


              precision    recall  f1-score   support

       natal       0.50      0.33      0.40         3
      pascoa       0.67      0.67      0.67         3
    quaresma       0.38      0.75      0.50         4
       comum       0.67      0.40      0.50         5
     advento       0.75      0.60      0.67         5

    accuracy                           0.55        20
   macro avg       0.59      0.55      0.55        20
weighted avg       0.60      0.55      0.55        20



In [None]:
# Package the saved model directory into a zip archive and trigger a browser
# download of that archive from the Colab runtime.
!zip -r modelo_finetuned_bert.zip ./modelo_finetuned_bert
from google.colab import files
files.download('modelo_finetuned_bert.zip')

  adding: modelo_finetuned_bert/ (stored 0%)
  adding: modelo_finetuned_bert/model.safetensors (deflated 7%)
  adding: modelo_finetuned_bert/vocab.txt (deflated 53%)
  adding: modelo_finetuned_bert/training_args.bin (deflated 51%)
  adding: modelo_finetuned_bert/special_tokens_map.json (deflated 42%)
  adding: modelo_finetuned_bert/tokenizer_config.json (deflated 75%)
  adding: modelo_finetuned_bert/config.json (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>