In [None]:
from transformers import BertModel, AutoTokenizer, AutoConfig, DistilBertForSequenceClassification
import torch

# 1. Load the Tokenizer (shared or compatible for both)
# It's common to use the teacher's tokenizer or a tokenizer compatible with both.
# For DistilBERT, the tokenizer is often the same as BERT's.
tokenizer_name = 'bert-base-uncased' # Or 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# 2. Load the Teacher Model (BERT-base)
teacher_model_name = 'bert-base-uncased'
teacher_model = BertModel.from_pretrained(teacher_model_name)
teacher_model.eval() # Set to evaluation mode, as we're not training it here.

# 3. Load the Student Model (DistilBERT-wiki-small) - our model
dump_path = "./serialization_dir/distilBERT_small_4_layer_2_epochs"
distilBERT_wiki_weights_path = dump_path + "/model_epoch_0.pth"

config = AutoConfig.from_pretrained(dump_path)
# Ensure num_labels is set in the config, as this drives the classification head size
config.num_labels = 2
config.return_dict = True # Make sure the config explicitly sets return_dict to True
config.output_hidden_states = False

distilBERT_wiki_small = DistilBertForSequenceClassification(config)

# Load the base model state_dict (only the distilbert part)
base_model_state_dict = torch.load(distilBERT_wiki_weights_path, map_location='cpu')

distilBERT_wiki_small.distilbert.load_state_dict(base_model_state_dict, strict=False)

print(distilBERT_wiki_small)
# Let's check the number of parameters to see the size difference
teacher_params = sum(p.numel() for p in teacher_model.parameters())
student_params = sum(p.numel() for p in distilBERT_wiki_small.parameters())

print(f"\nTeacher model parameters: {teacher_params:,}")
print(f"Student model parameters: {student_params:,}")
print(f"Student model is roughly {student_params/teacher_params:.2%} the size of the teacher model.")

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-3): 4 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [3]:
from datasets import load_dataset

# Load the SST-2 dataset from Hugging Face Datasets
sst2_dataset = load_dataset('glue', 'sst2')

# The dataset object contains train, validation (which we'll use as development), and test splits
print(sst2_dataset)

# Define a function to tokenize the text data with padding and truncation
max_length = 128
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding='max_length', max_length=max_length)

# Apply the tokenization to all splits of the dataset
tokenized_sst2 = sst2_dataset.map(tokenize_function, batched=True)

# The 'label' column is already numerical, so we don't need to process it further for SST-2.
# We will, however, remove the original 'sentence' column as the model will use the tokenized inputs.
tokenized_sst2 = tokenized_sst2.remove_columns(["sentence", "idx"])

# Rename the 'label' column to 'labels' as that's the expected name by PyTorch models
tokenized_sst2 = tokenized_sst2.rename_column("label", "labels")

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [4]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

repository_name = "LukasXperiaZ/distilBERT_wiki_small_2_epochs_SST-2"

# 1. Define Training Arguments
output_dir = './distilbert-wiki-small-2-epochs-sst2' # Directory to save the trained model and logs
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=512,    # 512 good for 24 GB VRAM
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch", # Save the model at the end of each epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric
    metric_for_best_model="accuracy",
    report_to="none", # Or "wandb" if you want to use Weights & Biases for logging
    hub_model_id=repository_name
)

# 2. Load the Pre-trained Model for Sequence Classification
model = distilBERT_wiki_small

# 3. Load the Evaluation Metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 4. Create a Trainer Instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_sst2["train"],
    eval_dataset=tokenized_sst2["validation"], # Using the 'validation' split as the development set
    compute_metrics=compute_metrics,
)

print("Trainer and model are ready for training!")

Trainer and model are ready for training!


In [5]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.697585,0.474771
2,No log,0.696239,0.480505
3,No log,0.696545,0.506881
4,0.683000,0.696301,0.50344
5,0.683000,0.69651,0.483945


TrainOutput(global_step=660, training_loss=0.6825483148748225, metrics={'train_runtime': 237.9087, 'train_samples_per_second': 1415.438, 'train_steps_per_second': 2.774, 'total_flos': 7485800358704640.0, 'train_loss': 0.6825483148748225, 'epoch': 5.0})

In [6]:
# Upload the model to huggingface
trainer.push_to_hub(commit_message="Add best DistilBERT wiki_small_epoch_2 model trained on SST-2")
tokenizer.push_to_hub(
    repo_id=repository_name,
    commit_message="Upload BERT tokenizer for DistilBERT model",
    blocking=True,  # Wait until the push is complete
)

model.safetensors:   0%|          | 0.00/211M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.78k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LukasXperiaZ/distilBERT_wiki_small_2_epochs_SST-2/commit/3c80cc91b13d6ea20077629b6639740a788602a7', commit_message='Upload BERT tokenizer for DistilBERT model', commit_description='', oid='3c80cc91b13d6ea20077629b6639740a788602a7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/LukasXperiaZ/distilBERT_wiki_small_2_epochs_SST-2', endpoint='https://huggingface.co', repo_type='model', repo_id='LukasXperiaZ/distilBERT_wiki_small_2_epochs_SST-2'), pr_revision=None, pr_num=None)

Validation
===
Execute from here on if you just want to evaluate

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
import evaluate
import torch

# 1. Specify your repository name on Hugging Face Hub
repository_name = 'LukasXperiaZ/distilBERT_wiki_small_2_epochs_SST-2'

# 2. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(repository_name)

# 3. Load the model from Hugging Face Hub
model = AutoModelForSequenceClassification.from_pretrained(repository_name)

# 4. Load the SST-2 validation dataset and tokenize it (if not already done)
sst2_validation_dataset = load_dataset('glue', 'sst2', split='validation')

def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding='max_length', max_length=128)

tokenized_sst2_validation = sst2_validation_dataset.map(tokenize_function, batched=True)
tokenized_sst2_validation = tokenized_sst2_validation.remove_columns(["sentence", "idx"])
tokenized_sst2_validation = tokenized_sst2_validation.rename_column("label", "labels")
tokenized_sst2_validation.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 5. Create a DataLoader for the validation set
eval_dataloader = DataLoader(tokenized_sst2_validation, batch_size=32) # Use the same eval batch size

# 6. Define the evaluation metric
metric = evaluate.load("accuracy")

# 7. Move the model to the appropriate device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# 8. Evaluation loop
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

# 9. Print the evaluation results
eval_metric = metric.compute()
print(f"Validation set results (loaded from Hugging Face Hub): {eval_metric}")

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/211M [00:00<?, ?B/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Validation set results (loaded from Hugging Face Hub): {'accuracy': 0.5068807339449541}
