In [None]:
# Cell 1: Install necessary libraries for Hugging Face Transformers, Datasets, and Accelerate
!pip install -q --upgrade transformers datasets evaluate accelerate

In [None]:
# Cell 2: Import libraries, load dataset, initialize tokenizer, prepare data, and initialize model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import os

# Load the 'ag_news' dataset from Hugging Face Datasets
dataset = load_dataset("ag_news")

# Define the pre-trained model name to use
model_name = "distilbert-base-uncased"

# Initialize the tokenizer for the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a function to tokenize the text in batches
def tokenize(batch):
    # Tokenize the 'text' column, padding to 'max_length' and truncating if longer
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# Apply the tokenization function to the entire dataset
tokenized = dataset.map(tokenize, batched=True)

# Rename the 'label' column to 'labels' as required by Hugging Face models for training
tokenized = tokenized.rename_column("label", "labels")

# Set the format of the dataset to PyTorch tensors for model input
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Create smaller subsets for training and testing to speed up experimentation
small_train = tokenized["train"].shuffle(seed=42).select(range(5000))
small_test = tokenized["test"].shuffle(seed=42).select(range(1000))

# Initialize the AutoModelForSequenceClassification with the pre-trained model and 4 output labels
# The warning about newly initialized weights is expected as we are adapting a pre-trained model
# for a new classification task (fine-tuning).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cell 3: Set the Weights & Biases API key as an environment variable
# This prevents interactive prompts for the API key during training.
os.environ["WANDB_API_KEY"] = "2e2f0e240c964487d3c54e115b1b572970a6da46"

In [None]:
# Cell 4: Define the TrainingArguments for the Trainer
training_args = TrainingArguments(
    output_dir="./results", # Directory where model predictions and checkpoints will be stored
    save_strategy="epoch",  # Save checkpoint every epoch
    logging_steps=50,       # Log training metrics every 50 steps
    per_device_train_batch_size=8, # Batch size per device during training
    num_train_epochs=2      # Total number of training epochs to perform
    # Note: 'evaluation_strategy' was removed due to a `TypeError` encountered in previous runs.
    # For the latest transformers versions, 'evaluation_strategy="epoch"' is typically valid.
    # If you need evaluation during training, consider restarting the runtime and re-adding this argument
    # after all library installations are confirmed to be active.
)

In [None]:
# Cell 5: Initialize the Hugging Face Trainer
trainer = Trainer(
    model=model,            # The model to be trained
    args=training_args,     # The training arguments defined above
    train_dataset=small_train, # The dataset for training
    eval_dataset=small_test # The dataset for evaluation (used if evaluation_strategy is set)
)

In [None]:
# Cell 6: Start the model training process
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmed-tabka[0m ([33mmed-tabka-ibs-international-business-school-budapest-vienna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
50,0.8028
100,0.4849
150,0.4173
200,0.4378
250,0.4254
300,0.3919
350,0.3177
400,0.378
450,0.4018
500,0.3462




TrainOutput(global_step=1250, training_loss=0.3270040199279785, metrics={'train_runtime': 7003.4093, 'train_samples_per_second': 1.428, 'train_steps_per_second': 0.178, 'total_flos': 331180308480000.0, 'train_loss': 0.3270040199279785, 'epoch': 2.0})

In [None]:
# Cell 7: Evaluate the trained model on the evaluation dataset and print the results
results = trainer.evaluate()
print(results)



{'eval_loss': 0.3506878614425659, 'eval_runtime': 189.9838, 'eval_samples_per_second': 5.264, 'eval_steps_per_second': 0.658, 'epoch': 2.0}


In [None]:
# Cell 8: Print the installed Transformers library version for reference
import transformers
print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.57.3
