# 1. Setup and Install Dependencies


*   Install required libraries such as datasets, evaluate, and seqeval for  
    model training and evaluation.
*   Import necessary Python libraries including PyTorch, Transformers, and datasets.





In [None]:
!pip install datasets evaluate seqeval



# 2. Load Required Modules


*  Import PyTorch components for model building and training.
*  Load Transformers' modules for tokenization, model loading, and data processing.
*  Load dataset handling utilities from datasets.





In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from torch.optim import AdamW
from transformers import (
    AutoProcessor, AutoTokenizer, DistilBertTokenizer,
    DistilBertForTokenClassification, LayoutLMv3ForTokenClassification,
    default_data_collator, AutoModelForSequenceClassification, TrainingArguments, Trainer
)
from datasets import load_dataset, Features, Sequence, Value, Array2D, Array3D


# 3. Set Device

Determine if a GPU is available; otherwise, use the CPU.

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4. Load Teacher and Student Tokenizers

Load the processor and tokenizer for the teacher model (LayoutLMv3) and the tokenizer for the student model (DistilBERT).

In [None]:
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
teacher_tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlmv3-base")
student_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# 5. Load and Process Dataset
* Load the FUNSD dataset for layout-aware token classification.

* Extract label mappings from the dataset.

* Define column names.

In [None]:
dataset = load_dataset("nielsr/funsd-layoutlmv3")
label_list = dataset['train'].features['ner_tags'].feature.names
id2label = {k: v for k, v in enumerate(label_list)}
label2id = {v: k for k, v in enumerate(label_list)}
num_labels = len(label_list)

image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"
column_names = dataset["train"].column_names

# 6. Define Example Preparation Functions

* Create functions for tokenizing teacher and student data.

* The teacher model uses images, text, and bounding boxes.

* The student model uses only text and includes dummy bounding boxes and pixel values.

In [None]:
def prepare_examples(examples):
    encoding = processor(
        examples[image_column_name],
        examples[text_column_name],
        max_length=teacher_tokenizer.model_max_length,
        boxes=examples[boxes_column_name],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length"
    )
    return encoding

def student_prepare_examples(examples):
    encoding = student_tokenizer(
        examples[text_column_name],
        is_split_into_words=True,
        max_length=teacher_tokenizer.model_max_length,
        truncation=True,
        padding="max_length"
    )
    encoding["bbox"] = [[[0, 0, 0, 0]] * teacher_tokenizer.model_max_length] * len(encoding["input_ids"])
    encoding["pixel_values"] = [np.zeros((3, 224, 224), dtype="float32").tolist()] * len(encoding["input_ids"])
    encoding["labels"] = [label + [-100] * (teacher_tokenizer.model_max_length - len(label)) for label in examples[label_column_name]]
    return encoding

# 7. Prepare Datasets for Training and Evaluation

* Convert the dataset into a format compatible with LayoutLMv3 and DistilBERT.

* Apply transformations to the dataset for teacher model.

In [None]:
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(teacher_tokenizer.model_max_length, 4)),
    'labels': Sequence(Value(dtype='int64')),
})

train_sample = dataset["train"].shuffle(seed=42).select(range(60))

train_dataset = train_sample.map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
test_sample = dataset["test"].shuffle(seed=42).select(range(30))
eval_dataset = test_sample.map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
validation_sample = dataset["test"].shuffle(seed=42).select(range(10))
validation_dataset = validation_sample.map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

train_dataset.set_format("torch")

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

# 8. Load Teacher Model
Load the LayoutLMv3 model and set it to evaluation mode.

In [None]:
teacher_model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id
).to(device)
teacher_model.eval()

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

# 9. Fine-Tune the Teacher Model Using Hugging Face Trainer
Purpose:
 * Fine-tune the pre-trained teacher model on the dataset using the Trainer API.

Details:

* Loads the teacher model (LayoutLMv3) for token classification.

* Defines TrainingArguments such as output directory, batch sizes, learning rate, evaluation strategy, and number of steps.

* Implements a compute_metrics function to evaluate model performance during training.

* Initializes the Trainer with the teacher model, training/evaluation datasets, data collator, tokenizer (processor), and metric computation function.

* Calls trainer.train() to fine-tune the teacher model.

In [None]:
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer
from transformers.data.data_collator import default_data_collator

metric = evaluate.load('seqeval')
return_entity_level_metrics = False

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

training_args = TrainingArguments(output_dir="layoutlmv3-finetuned-cord_100",
                                  max_steps=1000,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  # push_to_hub=True,  # after training, we'd like to push our model to the hub
                                  # push_to_hub_model_id=f"layoutlmv3-finetuned-cord_100",
                                  learning_rate=1e-5,
                                  evaluation_strategy="steps",
                                  eval_steps=250,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="accuracy")

# Initialize our Trainer
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjayawasthi891[0m ([33mjayawasthi891-lnmiit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
250,No log,1.707841,0.834217,0.892308,0.862285,0.711489
500,0.276100,1.797852,0.872783,0.916599,0.894155,0.736207
750,0.276100,1.865106,0.86703,0.902834,0.88457,0.749852
1000,0.006900,1.903015,0.870543,0.909312,0.889505,0.752027




TrainOutput(global_step=1000, training_loss=0.14153105878829955, metrics={'train_runtime': 787.4481, 'train_samples_per_second': 5.08, 'train_steps_per_second': 1.27, 'total_flos': 1054421372928000.0, 'train_loss': 0.14153105878829955, 'epoch': 66.66666666666667})

In [None]:
student_model.save_pretrained("layoutlmv3_teacher_model")

# 10. Prepare Student Datasets for Training and Evaluation
* **Convert** the dataset into a format compatible with DistilBERT.

* Apply transformations to the dataset for student model.

In [None]:
train_sample = dataset["train"].shuffle(seed=42).select(range(60))
student_train_dataset = train_sample.map(
    student_prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

test_sample = dataset["test"].shuffle(seed=42).select(range(30))
student_eval_dataset = test_sample.map(
    student_prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

validation_sample = dataset["test"].shuffle(seed=42).select(range(10))
student_validation_dataset = validation_sample.map(
    student_prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

# 11. Load Student Model


In [None]:
student_model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
).to(device)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 12. Define Distillation Loss

* Compute KL divergence loss between teacher and student logits.

* Use a weighted sum of soft (KL loss) and hard (cross-entropy) losses.

In [None]:
def distillation_loss(student_logits, teacher_logits, labels, attention_mask, temperature=2.0, alpha=0.8):
    kl_loss_fn = nn.KLDivLoss(reduction="batchmean")
    ce_loss_fn = nn.CrossEntropyLoss()
    mask = (labels != -100)
    soft_loss = kl_loss_fn(
        torch.log_softmax(student_logits[mask] / temperature, dim=-1),
        torch.softmax(teacher_logits[mask] / temperature, dim=-1)
    )
    hard_loss = ce_loss_fn(student_logits[mask], labels[mask])
    return alpha * soft_loss + (1 - alpha) * hard_loss

# 13. Train the Student Model
 * Train the student model using knowledge distillation.

* Extract logits from the teacher model and use them in loss computation

In [None]:
def train_student_model(student_model, teacher_model, train_dataloader, optimizer, num_epochs=10, temperature=2.0, alpha=0.8):
    student_model.train()
    total_steps = len(train_dataloader)

    for epoch in range(num_epochs):
        epoch_loss = 0
        print(f"Epoch {epoch+1}/{num_epochs}")
        for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Processing Epoch {epoch+1}")):
            optimizer.zero_grad()

            # The default_data_collator ensures uniform tensor shapes
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            bbox = batch['bbox'].to(device)
            pixel_values = batch['pixel_values'].to(device)

            # Teacher forward pass (with no gradient)
            with torch.no_grad():
                teacher_outputs = teacher_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    bbox=bbox,
                    pixel_values=pixel_values,
                    labels=labels
                )
                teacher_logits = teacher_outputs.logits

            # Student forward pass (text only)
            student_outputs = student_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            student_logits = student_outputs.logits

            # Compute combined distillation loss
            loss = distillation_loss(student_logits, teacher_logits, labels, attention_mask, temperature, alpha)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f"Batch {batch_idx}/{total_steps} Loss: {loss.item():.4f}")

        avg_epoch_loss = epoch_loss / total_steps
        print(f"Epoch {epoch+1} - Avg Loss: {avg_epoch_loss:.4f}")

#########################################
# 9. Set Up DataLoader and Optimizer    #
#########################################

# Use default_data_collator to ensure uniform tensor sizes
train_dataloader = DataLoader(student_train_dataset, batch_size=4, shuffle=True, collate_fn=default_data_collator)
optimizer = AdamW(student_model.parameters(), lr=2e-5)

#########################################
# 10. Start Training                    #
#########################################

train_student_model(student_model, teacher_model, train_dataloader, optimizer, num_epochs=10)

Epoch 1/10


Processing Epoch 1:   7%|▋         | 1/15 [00:00<00:10,  1.34it/s]

Batch 0/15 Loss: 0.5914


Processing Epoch 1: 100%|██████████| 15/15 [00:10<00:00,  1.38it/s]


Epoch 1 - Avg Loss: 0.4165
Epoch 2/10


Processing Epoch 2:   7%|▋         | 1/15 [00:00<00:11,  1.26it/s]

Batch 0/15 Loss: 0.5032


Processing Epoch 2: 100%|██████████| 15/15 [00:09<00:00,  1.56it/s]


Epoch 2 - Avg Loss: 0.4021
Epoch 3/10


Processing Epoch 3:   7%|▋         | 1/15 [00:00<00:08,  1.67it/s]

Batch 0/15 Loss: 0.4598


Processing Epoch 3: 100%|██████████| 15/15 [00:09<00:00,  1.61it/s]


Epoch 3 - Avg Loss: 0.4030
Epoch 4/10


Processing Epoch 4:   7%|▋         | 1/15 [00:00<00:08,  1.67it/s]

Batch 0/15 Loss: 0.3577


Processing Epoch 4: 100%|██████████| 15/15 [00:09<00:00,  1.55it/s]


Epoch 4 - Avg Loss: 0.3956
Epoch 5/10


Processing Epoch 5:   7%|▋         | 1/15 [00:00<00:08,  1.71it/s]

Batch 0/15 Loss: 0.3439


Processing Epoch 5: 100%|██████████| 15/15 [00:08<00:00,  1.70it/s]


Epoch 5 - Avg Loss: 0.3973
Epoch 6/10


Processing Epoch 6:   7%|▋         | 1/15 [00:00<00:09,  1.46it/s]

Batch 0/15 Loss: 0.4473


Processing Epoch 6: 100%|██████████| 15/15 [00:09<00:00,  1.63it/s]


Epoch 6 - Avg Loss: 0.3967
Epoch 7/10


Processing Epoch 7:   7%|▋         | 1/15 [00:00<00:08,  1.72it/s]

Batch 0/15 Loss: 0.3972


Processing Epoch 7: 100%|██████████| 15/15 [00:09<00:00,  1.64it/s]


Epoch 7 - Avg Loss: 0.3988
Epoch 8/10


Processing Epoch 8:   7%|▋         | 1/15 [00:00<00:08,  1.72it/s]

Batch 0/15 Loss: 0.2741


Processing Epoch 8: 100%|██████████| 15/15 [00:08<00:00,  1.67it/s]


Epoch 8 - Avg Loss: 0.3962
Epoch 9/10


Processing Epoch 9:   7%|▋         | 1/15 [00:00<00:09,  1.48it/s]

Batch 0/15 Loss: 0.4085


Processing Epoch 9: 100%|██████████| 15/15 [00:08<00:00,  1.69it/s]


Epoch 9 - Avg Loss: 0.3948
Epoch 10/10


Processing Epoch 10:   7%|▋         | 1/15 [00:00<00:08,  1.70it/s]

Batch 0/15 Loss: 0.3704


Processing Epoch 10: 100%|██████████| 15/15 [00:09<00:00,  1.63it/s]

Epoch 10 - Avg Loss: 0.3938





# 14. Evaluate and Save Student Model

Use seqeval to measure precision, recall, and F1-score of the student model.

In [None]:
student_model.save_pretrained("distilbert_student_model")

In [None]:
import evaluate

def evaluate_student_model(student_model, eval_dataloader, label_list, device="cuda"):
    student_model.eval()
    metric = evaluate.load("seqeval")
    true_predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            label_ids = labels.cpu().numpy()

            for prediction, label in zip(predictions, label_ids):
                true_predictions.append([
                    label_list[p] for p, l in zip(prediction, label) if l != -100
                ])
                true_labels.append([
                    label_list[l] for p, l in zip(prediction, label) if l != -100
                ])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    overall_metrics = {
        "precision": results["overall_precision"]*100,
        "recall": results["overall_recall"]*100,
        "f1": results["overall_f1"]*100,
        "accuracy": results["overall_accuracy"]*100,
    }

    return overall_metrics

# Example usage:
eval_dataloader = DataLoader(student_validation_dataset, batch_size=4, collate_fn=default_data_collator)
evaluation_results = evaluate_student_model(student_model, eval_dataloader, label_list)
print(evaluation_results)

Evaluating: 100%|██████████| 3/3 [00:00<00:00,  5.94it/s]

{'precision': np.float64(41.66666666666667), 'recall': np.float64(2.277904328018223), 'f1': np.float64(4.319654427645789), 'accuracy': 21.173814898419867}



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
device = "cuda"
def evaluate_model(model, dataloader, device):
    model.eval()
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            bbox = batch["bbox"].to(device)
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            bbox=bbox,
                            pixel_values=pixel_values)

            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.detach().cpu().numpy()

            predictions = np.argmax(logits, axis=2)

            for pred, true_labels in zip(predictions, label_ids):
                filtered_pred = [p for p, l in zip(pred, true_labels) if l != -100]
                filtered_labels = [l for l in true_labels if l != -100]
                all_predictions.extend(filtered_pred)
                all_labels.extend(filtered_labels)

    accuracy = accuracy_score(all_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average="macro")

    return {"accuracy": accuracy*100, "precision": precision, "recall": recall, "f1": f1}
eval_dataloader = DataLoader(validation_dataset, batch_size=4, collate_fn=default_data_collator)
evaluation_results = evaluate_model(teacher_model, eval_dataloader, device)
print(evaluation_results)



{'accuracy': 69.83016983016984, 'precision': 0.7645552881121446, 'recall': 0.780395810307793, 'f1': 0.7528805629617449}
