In [None]:
!pip install transformers datasets torch sentencepiece evaluate lxml



In [None]:
import os
import torch
import time
from lxml import etree
import pandas as pd
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
from transformers import (
    SqueezeBertTokenizer,
    SqueezeBertForMaskedLM,
    SqueezeBertForSequenceClassification,
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from google.colab import files


In [None]:
# The XML file name
DUMP_FILE = "enwiki-20251201-pages-articles-multistream1.xml-p1p41242"

print("Checking if XML dump exists in Colab directory...\n")

# Show directory contents
print("Files in /content:")

# If the file exists then skip the browse
if os.path.exists(DUMP_FILE):
    print(f"File already exists: {DUMP_FILE}")
    print("No upload needed.\n")

else:
    print(f"File '{DUMP_FILE}' not found.\n")
    print("Please upload the XML Wikipedia dump file now.")

    # Ask user to upload
    uploaded = files.upload()

    # Get uploaded filename
    uploaded_filename = list(uploaded.keys())[0]

    # Rename to match expected filename
    if uploaded_filename != DUMP_FILE:
        os.rename(uploaded_filename, DUMP_FILE)
        print(f"Renamed '{uploaded_filename}' â†’ '{DUMP_FILE}'")

    print("\nUpload complete! The XML file is ready for processing.")

Checking if XML dump exists in Colab directory...

Files in /content:
File already exists: enwiki-20251201-pages-articles-multistream1.xml-p1p41242
No upload needed.



In [None]:
# Extraxt the info from the XML
from lxml import etree

DUMP_FILE = "enwiki-20251201-pages-articles-multistream1.xml-p1p41242"
wiki_texts = []

# recover=True allows parsing truncated or malformed XML
context = etree.iterparse(DUMP_FILE, events=("end",), tag="{http://www.mediawiki.org/xml/export-0.11/}page", recover=True)

for event, elem in context:
    text_elem = elem.find("{http://www.mediawiki.org/xml/export-0.11/}revision/{http://www.mediawiki.org/xml/export-0.11/}text")
    if text_elem is not None and text_elem.text:
        wiki_texts.append(text_elem.text.strip())
    elem.clear()
    if len(wiki_texts) >= 100:
        break  # stop after 1000 articles

print(f"Extracted {len(wiki_texts)} articles.")




Extracted 100 articles.


In [None]:
# SqueezeBERT tokenizer
sqbert_tokenizer = SqueezeBertTokenizer.from_pretrained('squeezebert/squeezebert-uncased')

# BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Tokenize Wikipedia texts for MLM
tokenized_inputs = sqbert_tokenizer(
    wiki_texts,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128
)

# Create DataCollator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=sqbert_tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Simple torch dataset wrapper
class MLMDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].size(0)
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

mlm_dataset = MLMDataset(tokenized_inputs)


In [None]:
# Load SqueezeBERT for MLM
mlm_model = SqueezeBertForMaskedLM.from_pretrained('squeezebert/squeezebert-uncased')

# Training arguments for MLM
mlm_args = TrainingArguments(
    output_dir="./mlm_results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=50,
    save_total_limit=1,
    logging_dir="./mlm_logs",
    logging_steps=10,
    learning_rate=5e-5
)

# Trainer for MLM
mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    train_dataset=mlm_dataset,
    data_collator=data_collator
)

# Train domain-adaptive MLM
mlm_trainer.train()
mlm_trainer.save_model("./squeezebert_mlm_adapted")


Step,Training Loss
10,3.2656
20,2.6849


In [None]:
# Load the GLUE dataset for benchmarking
dataset_name = "mrpc"
dataset = load_dataset("glue", dataset_name)
metric = evaluate.load("glue", dataset_name)

print("Sample data:", dataset['train'][0])


Sample data: {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


In [None]:
def tokenize_pair(examples, tokenizer):
    return tokenizer(
        examples['sentence1'], examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Tokenize for SqueezeBERT
encoded_sqbert = dataset.map(lambda x: tokenize_pair(x, sqbert_tokenizer), batched=True)
encoded_sqbert = encoded_sqbert.map(lambda x: {'labels': x['label']}, batched=True)
encoded_sqbert.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Tokenize for BERT
encoded_bert = dataset.map(lambda x: tokenize_pair(x, bert_tokenizer), batched=True)
encoded_bert = encoded_bert.map(lambda x: {'labels': x['label']}, batched=True)
encoded_bert.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
# Load vanilla BERT for sequence classification for my significant contribution
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  # pretrained BERT
    num_labels=2          # binary classification for MRPC or similar tasks
)

# No custom layers added here; this is standard BERT


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Function to define compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted class by argmax
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    # Compute accuracy, F1, or other metrics provided by GLUE
    return metric.compute(predictions=predictions, references=labels)


In [None]:
# Baseline SqueezeBERT
sq_model_base = SqueezeBertForSequenceClassification.from_pretrained(
    "./squeezebert_mlm_adapted", num_labels=2
)

# Modified SqueezeBERT (same weights, separate instance)
sq_model_mod = SqueezeBertForSequenceClassification.from_pretrained(
    "./squeezebert_mlm_adapted", num_labels=2
)


Some weights of SqueezeBertForSequenceClassification were not initialized from the model checkpoint at ./squeezebert_mlm_adapted and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of SqueezeBertForSequenceClassification were not initialized from the model checkpoint at ./squeezebert_mlm_adapted and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Creat the base BERT model
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Baseline SqueezeBERT
training_args_sq_base = TrainingArguments(
    output_dir="./sq_base",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    do_eval=True,
    fp16=True
)

# Modified SqueezeBERT (different hyperparameters)
training_args_sq_mod = TrainingArguments(
    output_dir="./sq_modified",
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=3e-5,
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    weight_decay=0.02,
    do_eval=True,
    fp16=True
)

# BERT Training
training_args_bert = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    do_eval=True,
    fp16=True
)


In [None]:
# Baseline SqueezeBERT Trainer
trainer_sq_base = Trainer(
    model=sq_model_base,
    args=training_args_sq_base,
    train_dataset=encoded_sqbert['train'],
    eval_dataset=encoded_sqbert['validation'],
    tokenizer=sqbert_tokenizer,
    compute_metrics=compute_metrics
)

# Modified SqueezeBERT Trainer
trainer_sq_mod = Trainer(
    model=sq_model_mod,
    args=training_args_sq_mod,
    train_dataset=encoded_sqbert['train'],
    eval_dataset=encoded_sqbert['validation'],
    tokenizer=sqbert_tokenizer,
    compute_metrics=compute_metrics
)

# BERT Trainer
trainer_bert = Trainer(
    model=bert_model,
    args=training_args_bert,
    train_dataset=encoded_bert['train'],
    eval_dataset=encoded_bert['validation'],
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics
)


  trainer_sq_base = Trainer(
  trainer_sq_mod = Trainer(
  trainer_bert = Trainer(


In [None]:
# Train the SqueezeBERT model
trainer_sq_base.train()
trainer_sq_base.save_model("./sq_base_model")

# Train the Modified SqueezeBERT
trainer_sq_mod.train()
trainer_sq_mod.save_model("./sq_modified_model")

# Train BERT
trainer_bert.train()
trainer_bert.save_model("./bert_model")


Step,Training Loss
100,0.6062
200,0.4877
300,0.4
400,0.3444
500,0.298
600,0.2459


Step,Training Loss
50,0.6229
100,0.5566
150,0.503
200,0.4378
250,0.3967
300,0.3495
350,0.3483
400,0.2826
450,0.2928


Step,Training Loss
100,0.6241
200,0.5352
300,0.411
400,0.3463
500,0.332
600,0.2238


In [None]:
# evalute each model based on the research papers GLUE metric
print("Baseline SqueezeBERT Evaluation:")
print(trainer_sq_base.evaluate())

print("\nModified SqueezeBERT Evaluation:")
print(trainer_sq_mod.evaluate())

print("\nBERT Evaluation:")
print(trainer_bert.evaluate())

Baseline SqueezeBERT Evaluation:


{'eval_loss': 0.3745376765727997, 'eval_accuracy': 0.8651960784313726, 'eval_f1': 0.9046793760831889, 'eval_runtime': 12.0944, 'eval_samples_per_second': 33.735, 'eval_steps_per_second': 2.15, 'epoch': 3.0}

Modified SqueezeBERT Evaluation:


{'eval_loss': 0.3677081763744354, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.8811188811188811, 'eval_runtime': 11.8983, 'eval_samples_per_second': 34.291, 'eval_steps_per_second': 1.093, 'epoch': 4.0}

BERT Evaluation:


{'eval_loss': 0.4333786368370056, 'eval_accuracy': 0.8431372549019608, 'eval_f1': 0.887719298245614, 'eval_runtime': 0.8685, 'eval_samples_per_second': 469.764, 'eval_steps_per_second': 29.936, 'epoch': 3.0}


In [44]:
# Move models to device and use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sq_model_base.to(device)
sq_model_mod.to(device)
bert_model.to(device)

# Sample 100 XML sentences
sample_texts = wiki_texts[:100]

# Dictionary for tokenizers
tokenizers = {
    "SqueezeBERT Base": sqbert_tokenizer,
    "SqueezeBERT Tuned": sqbert_tokenizer,
    "BERT": bert_tokenizer
}

# Models dictionary
models = {
    "SqueezeBERT Base": sq_model_base,
    "SqueezeBERT Tuned": sq_model_mod,
    "BERT": bert_model
}

# DataFrame to store timing results
results_df = pd.DataFrame(columns=["Model", "Inference Time (s)"])

for model_name, model in models.items():
    tokenizer = tokenizers[model_name]
    inputs = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    start_time = time.time()
    with torch.no_grad():
        outputs = model(**inputs)
    end_time = time.time()

    inference_time = end_time - start_time
    print(f"{model_name} processed 100 sentences in {inference_time:.2f} seconds")

    results_df = pd.concat([results_df, pd.DataFrame({
        "Model": [model_name],
        "Inference Time (s)": [inference_time]
    })])

results_df.reset_index(drop=True, inplace=True)
results_df


SqueezeBERT Base processed 100 sentences in 0.04 seconds


  results_df = pd.concat([results_df, pd.DataFrame({


SqueezeBERT Tuned processed 100 sentences in 0.02 seconds
BERT processed 100 sentences in 0.01 seconds


Unnamed: 0,Model,Inference Time (s)
0,SqueezeBERT Base,0.035624
1,SqueezeBERT Tuned,0.021204
2,BERT,0.014304
