**Nina Dobša, zadnje uređivano 7.7.2025.** 
Kod je pisan i pokretan u Google Colabu radi jačeg procesora.

# Imports

In [None]:
!pip install datasets

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, AdamW, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
import pandas as pd
from google.colab import files, drive
import os
import shutil

# Emotion Dataset

In [None]:
# Location of the unsplit emotion corpus (a single parquet file addressed via the hf:// scheme)
emotion_parquet_uri = "hf://datasets/dair-ai/emotion/unsplit/train-00000-of-00001.parquet"

# Read the parquet file into a DataFrame, then wrap it in a `datasets.Dataset`
# so it can be mapped over by the tokenizer later on.
emotion_df = pd.read_parquet(emotion_parquet_uri)
emotion_dataset = Dataset.from_pandas(emotion_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Tokenization

In [None]:
# The tokenizer and the masked-language-model head are both restored from the
# same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForMaskedLM.from_pretrained(checkpoint_name)


In [None]:
# Tokenization of a dataset
def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: A batch handed over by `Dataset.map(..., batched=True)`;
            only its "text" field is read.

    Returns:
        The tokenizer output for the batch, with every text truncated to at
        most 128 tokens.

    Note:
        No padding is applied here on purpose. The
        `DataCollatorForLanguageModeling` used for batching pads each batch
        to the length of its longest sequence, so padding every example up to
        128 tokens in advance would only add pad tokens for the model to
        process.
    """
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = emotion_dataset.map(tokenize_function, batched=True)
# Return only the two columns the model consumes, as torch tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Pretraining settings

In [None]:
# Collator that turns a list of tokenized examples into a masked-language-modelling
# batch; with mlm_probability=0.15, 15% of the tokens will be masked.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Iterable over shuffled batches of 16 examples, each assembled by the collator
dataloader = DataLoader(
    dataset=tokenized_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Optimizer setup
# The AdamW class shipped with `transformers` is deprecated and is no longer
# available in recent releases, so PyTorch's own implementation is used instead.
# `eps` and `weight_decay` are set explicitly to the values the transformers
# class used by default (PyTorch defaults differ), keeping the hyperparameters
# as they were.
# NOTE(review): the `AdamW` name imported from `transformers` is now unused and
# can be dropped from the import cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's parameters onto the selected device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

# Fine tuning the model

In [None]:
# Put the model into training mode before optimising
model.train()

for epoch in range(3):  # three passes over the whole dataloader
    # Running totals used to report the mean loss of this epoch
    epoch_loss, num_batches = 0, 0

    for batch in dataloader:
        # Start every step from zeroed gradients
        optimizer.zero_grad()

        # Transfer the three tensors the model needs onto the device
        tensors = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        }

        # Forward pass; passing `labels` makes the model return the MLM loss
        outputs = model(**tensors)
        loss = outputs.loss

        loss.backward()   # compute gradients
        optimizer.step()  # apply the parameter update

        epoch_loss += loss.item()
        num_batches += 1

    # Mean loss over all batches seen in this epoch
    avg_epoch_loss = epoch_loss / num_batches
    print(f"Epoch: {epoch + 1}, Average Loss: {avg_epoch_loss:.4f}")

Epoch: 1, Average Loss: 2.3754
Epoch: 2, Average Loss: 2.2510
Epoch: 3, Average Loss: 2.2069


Note: fine-tuning the BERT model took around 2 hours to run.

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("/content/fine_tuned_bert")

# Zip that directory and hand the archive to the browser for download
shutil.make_archive(
    base_name="/content/fine_tuned_bert",
    format='zip',
    root_dir="/content/fine_tuned_bert",
)
files.download("/content/fine_tuned_bert.zip")