In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load a three-category subset of 20 Newsgroups (fewer classes -> faster training).
# NOTE(review): posts are loaded with their headers intact (the tokenized sample
# printed further down starts with "from: ... subject: ..."). scikit-learn's docs
# suggest remove=('headers', 'footers', 'quotes') for a more realistic
# evaluation -- confirm whether that matters for this demo.
categories = ['sci.space', 'rec.autos', 'comp.graphics']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
data = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})

# Split 70/15/15 into train / validation / test sets.
# Stratifying on the label keeps the class proportions the same in every split,
# which matters here because the validation and test sets are small.
train_data, temp_data = train_test_split(
    data, test_size=0.3, random_state=42, stratify=data['label']
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data['label']
)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 2065
Validation set size: 442
Test set size: 443


In [None]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples with the BERT tokenizer.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text and left unpadded.
    """
    # No padding here: the training cell builds a DataCollatorWithPadding that
    # pads each batch on the fly, so padding every example to 512 tokens up
    # front would only inflate the stored dataset and make every batch run at
    # full length.
    return tokenizer(examples['text'],
                     truncation=True,
                     max_length=512)

# Wrap each pandas split in a Dataset and tokenize it in batches
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    for frame in (train_data, val_data, test_data)
)

# Drop every column except the ones kept for training
# (the list is derived from the training split and reused for all three)
keep_columns = ('input_ids', 'attention_mask', 'label')
columns_to_remove = [name for name in train_dataset.column_names
                     if name not in keep_columns]
train_dataset, val_dataset, test_dataset = (
    split.remove_columns(columns_to_remove)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Take a small shuffled subset for demonstration (2.5% of training data)
sample_size = int(0.025 * len(train_dataset))
train_sample = train_dataset.shuffle(seed=42).select(range(sample_size))

print("\nDataset format:")
print(train_sample[0])
print(f"\nSample size: {len(train_sample)} examples")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/2065 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/443 [00:00<?, ? examples/s]


Dataset format:
{'label': 0, 'input_ids': [101, 2013, 1024, 11320, 10286, 4305, 1030, 20116, 1012, 6901, 1012, 3968, 2226, 1006, 5696, 11320, 10286, 4305, 1007, 3395, 1024, 2559, 2005, 2720, 1012, 10958, 5280, 3029, 1024, 2110, 2118, 1997, 2047, 2259, 2012, 6901, 1013, 4012, 2361, 16596, 3210, 1024, 1023, 1050, 3372, 2361, 1011, 14739, 1011, 3677, 1024, 7570, 2863, 2213, 1012, 20116, 1012, 6901, 1012, 3968, 2226, 2515, 3087, 2031, 1037, 10958, 5280, 10938, 1999, 1039, 2008, 2027, 2071, 4604, 2033, 1029, 2151, 2393, 3970, 1010, 1011, 1011, 1064, 1012, 1011, 1010, 1001, 1001, 1001, 1064, 2005, 1037, 2843, 1997, 1012, 8740, 2189, 1024, 3027, 2361, 4165, 1012, 17371, 6342, 1012, 3968, 2226, 1064, 1013, 1013, 1035, 1035, 1010, 1035, 1001, 1001, 1001, 1064, 2059, 4937, 5371, 1012, 8740, 1028, 1013, 16475, 1013, 5746, 1064, 1032, 1035, 1028, 1013, 1028, 1035, 1013, 1006, 1035, 1013, 1032, 1035, 1013, 1026, 1028, 1035, 1064, 1057, 2497, 3075, 12105, 1024, 10093, 7159, 22285, 1012, 9353, 6342,

In [None]:
#!pip install peft
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
import torch
from transformers import DataCollatorWithPadding

# Initialize BERT model with a classification head.
# The number of labels is derived from the `categories` list used to load the
# data, so the head size cannot drift from the dataset if that list changes.
base_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(categories)
)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                     # Rank of the update matrices
    lora_alpha=16,          # Alpha scaling factor
    target_modules=["query", "key", "value"],  # Apply LoRA to attention layers
    lora_dropout=0.1,      # Dropout probability for LoRA layers
    bias="none",           # Don't train bias terms
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
)

# Create LoRA model: wraps the base model according to lora_config, then
# reports how many of its parameters are trainable
model = get_peft_model(base_model, lora_config)
print("Trainable parameters:")
model.print_trainable_parameters()

# Use num_train_epochs=1 to save time although results may be affected
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_lora',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs_lora',
    # The 2.5% training sample is only ~51 examples, i.e. about 7 optimizer
    # steps per epoch at batch size 8 on one device. Logging every 10 steps
    # would therefore never emit a single loss value, so log every step.
    logging_steps=1,
    learning_rate=2e-4,  # Slightly higher learning rate for LoRA
    # The PEFT wrapper hides the base model's forward signature, so Trainer
    # cannot infer the label column on its own; name it explicitly so that
    # evaluation can compute a loss.
    label_names=["labels"],
    report_to="none"    # Disable wandb logging
)

# Collator that pads the examples of each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Gather the Trainer settings in one place: the LoRA-wrapped model, the small
# training sample, and the validation split as the evaluation set
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_sample,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning and keep the returned summary for reporting
train_result = trainer.train()

print("\nLoRA Training completed!")
print(f"Training loss: {train_result.training_loss}")

# Write the LoRA model to disk
model.save_pretrained("./lora_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable parameters:
trainable params: 444,675 || all params: 109,929,222 || trainable%: 0.4045


Step,Training Loss



LoRA Training completed!
Training loss: 1.1802357264927454
