# pipeline_deberta-v3-small.ipynb
This notebook contains some template code to help you with loading/preprocessing the data.

We start with some imports and constants.
The training data is found in the `data` subfolder.
There is also a tokenizer I've trained for you which you can use for the project.

The next cell only needs to be executed once per session — it does not depend on the execution environment.

In [None]:
# ========================================
# 1. Clone the Repository (if required)
# ========================================
TOKEN = "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"  # ← GitHub Personal Access Token (PAT)  ⚠️
# XXXXXXXXXXXXXXXXXXXXXXXXXXXX
!git clone https://{TOKEN}@github.com/KatsuhitoArasaka/BabyLM-Tiny.git  # your repository link  ⚠️
%cd BabyLM-Tiny

# ===============
# 2. Set Paths
# ===============
TRAIN_PATH = './data/train.txt'       # Path to your training data  ⚠️
DEV_PATH = './data/dev.txt'           # Path to your validation data  ⚠️
SPM_PATH = './data/tokenizer.model'   # Path to your tokenizer model  ⚠️
# Path to evaluating scripts
BLIMP_SCRIPT = "./evaluate_blimp.py"
GLUE_SCRIPT = "./evaluate_glue.py"

Execute the following cells after every environment restart. Note that changing the device (CPU ↔ GPU) requires restarting the kernel.

In [None]:
# ==========================
# 3. Install Dependencies
# ==========================
# Installs into the kernel's own environment (`%pip`, not `!pip`).
# NOTE(review): versions are not pinned, so a fresh run may pick up newer
# releases than the ones this notebook was written against — consider pinning.
%pip install transformers datasets wandb trl bitsandbytes huggingface_hub --quiet

# ======================================
# 4. Authentication with wandb and hf
# ======================================
# Both logins are interactive; nothing is stored in the notebook.
import wandb
wandb.login()  # Enter your API key when prompted ⚠️

from huggingface_hub import login
login()  # Paste your HF token from https://huggingface.co/settings/tokens  ⚠️


# =====================================
# 5. Import Libraries and Set Device
# =====================================
import json
import os
import subprocess
from functools import partial

import datasets
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, set_seed
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from trl import SFTTrainer, SFTConfig

from transformers import set_seed

# Fix the random seed up front so repeated runs are comparable.
set_seed(42)

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

In [None]:
# ================================================
# 6. Start a new wandb run to track this script
# ================================================
run = wandb.init(
    # wandb entity (generally the team name) that owns the project.
    entity="Low-Resource_Pretraining",
    # wandb project this run is logged under.
    project="NLP_LRP_BabyLM",
    # Hyperparameters and run metadata shown on the dashboard. Keep these in
    # sync with the SFTConfig in step 11, otherwise the logged values do not
    # describe the run that was actually trained.
    config={
        "learning_rate": 2e-4,  # same value as `learning_rate` in SFTConfig
        "architecture": "DeBERTa",  # which model architecture this run used
        "epochs": 10,  # same value as `num_train_epochs` in SFTConfig

        # "dataset": "CIFAR-100",
        # "batch_size": 8,
    },
)

Here we load the dataset:

In [None]:
# ================================
# 7. Load Datasets
# ================================

def _read_sentences(path):
    """Read a UTF-8 text file and return one record per sentence.

    The 500k datasets cannot be split by newline, so the whole file is split
    on '.' instead. Each fragment is stripped, empty fragments are dropped and
    the period removed by the split is appended again.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of ``{"text": sentence}`` dicts, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        full_text = f.read().strip()
    fragments = (fragment.strip() for fragment in full_text.split('.'))
    return [{"text": fragment + "."} for fragment in fragments if fragment]


# Same loader for both splits, so train and validation are preprocessed identically.
train_data = _read_sentences(TRAIN_PATH)
val_data = _read_sentences(DEV_PATH)

# create DatasetDict object
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_list(train_data),
    "validation": datasets.Dataset.from_list(val_data)
})

In [None]:
# =====================================================================================
# 9. Setup Model, Training, and Logging (This is the part that may change per model)
# =====================================================================================

# Model-specific settings: swap this block out when trying another model.  ⚠️
model_name = "microsoft/deberta-v3-small"  # Example: DeBERTa model for MLM

# The model is built from the configuration only (`from_config`), so just the
# architecture comes from the checkpoint name; no pretrained weights are loaded.
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_config(config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collator handed to the trainer to build masked-language-modelling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, seed=0)

# ===============================
# 10. Setup Training Arguments
# ===============================
# train.py
def preprocess_logits_for_metrics(logits, labels):
    """Collapse logits to predicted ids by taking the argmax over the last axis.

    `labels` is part of the expected callback signature but is not used here.
    """
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute token-level accuracy over the positions that carry a label.

    Args:
        eval_pred: A pair ``(predictions, labels)`` of arrays with the same
            number of elements. ``predictions`` already holds predicted ids
            (the argmax is taken in ``preprocess_logits_for_metrics``), not
            raw logits. Positions whose label is ``-100`` are ignored.

    Returns:
        ``{"acc": accuracy}`` with accuracy as a plain float. When no position
        carries a label, accuracy is reported as ``0.0`` instead of dividing
        by zero.
    """
    predictions, labels = eval_pred
    predictions = predictions.flatten()
    labels = labels.flatten()

    # -100 marks positions that must not be scored.
    mask = labels != -100
    n_scored = int(mask.sum())
    if n_scored == 0:
        return {"acc": 0.0}

    n_correct = int((labels[mask] == predictions[mask]).sum())
    return {"acc": n_correct / n_scored}

# ======================
# 11. Train the Model
# ======================
# NOTE(review): SFTTrainer is driven here with a masked-LM collator and
# packing enabled — confirm the resulting batches are what the MLM objective expects.
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    data_collator = data_collator,
    # Logits are reduced to predicted ids before being accumulated for metrics.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    args = SFTConfig(
        remove_unused_columns = True,
        label_names = ["labels"],
        dataset_num_proc = 12,
        packing = True,
        eval_packing = True,
        # NOTE(review): newer TRL releases may expect `max_length` instead — confirm against the installed version.
        max_seq_length = 64,
        dataset_text_field = "text",
        eval_strategy = "steps",
        per_device_train_batch_size = 64,
        gradient_accumulation_steps = 1,
        warmup_ratio = 0.05,
        num_train_epochs = 10,
        learning_rate = 2e-4,
        # Mixed precision is only enabled on GPU: with the CPU fallback chosen
        # in step 5, an unconditional fp16=True is rejected by the Trainer.
        fp16 = DEVICE == 'cuda',
        bf16 = False,
        logging_steps = 10,
        eval_steps = 100,
        save_steps = 100,
        # NOTE(review): the 8-bit optimizer presumably relies on bitsandbytes — verify it works on CPU-only runs.
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 0,
        # output_dir = "",
        report_to = "none",
        eval_accumulation_steps=1,
        include_for_metrics=[],
        max_grad_norm=1,
    ),
)


trainer.train()

In [4]:
# ============================================================
# 12. Save the trained model locally (for later evaluation)
# ============================================================

# The model is written to a folder inside the Colab environment. That folder is
# deleted when the session ends unless it is uploaded elsewhere (e.g., Hugging Face, Google Drive).
local_save_dir = "./trained_models/model_name"  # ⚠️ Change `model_name` if training multiple models in one session
trainer.save_model(local_save_dir)         # model weights, config, etc.
tokenizer.save_pretrained(local_save_dir)  # tokenizer (required for evaluation)

# From here on `save_path` refers to the HF repository, NOT the local folder:
# the evaluation cells below read the model from this repo id.
save_path = "n1k1t427/deberta-v3-small-babylm_childes-default-dataset"

In [None]:
# ========================================
# 13. Evaluate the model: BLiMP
# ========================================

MODEL_TYPE = "encoder"  # or "decoder", if you train autoregressive model       ⚠️

# ---- Run BLiMP Evaluation ----
# add "--local_only" argument if using Colab folder.  ⚠️
blimp_output = subprocess.run(
    ["python", BLIMP_SCRIPT,
     "--model_type", MODEL_TYPE,
     "--model_path", save_path,
     "--batch_size", "16"],  # For Colab (T4/P100): usually 16 or 32.
    capture_output=True, text=True
)
# Fail loudly with the script's own error output instead of parsing an empty stdout.
if blimp_output.returncode != 0:
    raise RuntimeError(
        f"BLiMP evaluation failed (exit code {blimp_output.returncode}):\n{blimp_output.stderr}"
    )

# Parse BLiMP results from stdout: every "name: score" line becomes an entry.
# Lines that contain ':' but no numeric score (e.g. log messages) are skipped.
blimp_results = {}
for line in blimp_output.stdout.strip().split("\n"):
    parts = line.split(":")
    if len(parts) < 2:
        continue
    try:
        blimp_results[parts[0].strip(" -")] = float(parts[1])
    except ValueError:
        continue

if not blimp_results:
    raise RuntimeError("No BLiMP scores could be parsed from the evaluation output.")

# Prefer the average reported by the script; otherwise average the parsed scores.
if "Average" in blimp_results:
    blimp_avg = blimp_results["Average"]
else:
    blimp_avg = sum(blimp_results.values()) / len(blimp_results)

# ---- Log BLiMP to wandb ----
wandb.log({"blimp_avg": blimp_avg, "blimp_details": wandb.Html(blimp_output.stdout.replace('\n', '<br>'))})

# ---- Save BLiMP results ----
# Replace `modelname` in 'modelname_results' and `dataset_date` in 'blimp_dataset_date',        ⚠️
# otherwise you may accidentally overwrite the results from another model.
blimp_results_file = "models_evaluation_results/modelname_results/blimp_dataset_date.json"
# The target folder is created on demand so the save cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(blimp_results_file), exist_ok=True)
with open(blimp_results_file, "w") as f:
    json.dump(blimp_results, f, indent=2)

In [None]:
# ==========
#    GLUE
# ==========
# ---- Run GLUE Evaluation ----
glue_subsets = ['cola','sst2', 'mrpc', 'qnli', 'rte', 'boolq', 'multirc']
glue_scores = {}

# add "--local_only" argument if using Colab folder.  ⚠️
for subset in glue_subsets:
    glue_eval = subprocess.run(
        ["python", GLUE_SCRIPT,
         "--subset", subset,
         "--model_type", MODEL_TYPE,
         "--model_path", save_path],
        capture_output=True, text=True
    )
    # A failing subset is reported and skipped so the remaining subsets still run.
    if glue_eval.returncode != 0:
        print(f"[GLUE] {subset}: evaluation failed (exit code {glue_eval.returncode}), skipping.\n{glue_eval.stderr}")
        continue

    # Parsing must happen inside the loop: `glue_eval` is overwritten on every
    # iteration, so parsing afterwards would only ever score the last subset.
    # The output is scanned bottom-up for the last "Best result: <score>" line.
    for line in reversed(glue_eval.stdout.strip().split("\n")):
        if "Best result:" not in line:
            continue
        try:
            glue_scores[subset] = float(line.split("Best result:", 1)[1].strip())
        except ValueError:
            continue
        break

if not glue_scores:
    raise RuntimeError("No GLUE scores could be parsed from the evaluation output.")

# ---- Log GLUE to wandb ----
wandb.log({"glue_avg": sum(glue_scores.values()) / len(glue_scores), **{f"glue_{k}": v for k, v in glue_scores.items()}})

# ---- Save GLUE results ----
# Replace `modelname` in 'modelname_results' and `dataset_date` in 'glue_dataset_date',        ⚠️
# otherwise you may accidentally overwrite the results from another model.
glue_results_file = "models_evaluation_results/modelname_results/glue_dataset_date.json"
# The target folder is created on demand so the save cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(glue_results_file), exist_ok=True)
with open(glue_results_file, "w") as f:
    json.dump(glue_scores, f, indent=2)

In [None]:
# ===============================================
# 14. Upload Trained Model to Hugging Face Hub
# ===============================================

from huggingface_hub import create_repo, upload_folder

hf_repo_name = "deberta-v3-small-babylm"  # Change this to a unique name for your model       ⚠️
hf_username = "n1k1t427"          # Replace with your actual Hugging Face username    ⚠️
repo_id = f"{hf_username}/{hf_repo_name}"

# `save_path` is re-pointed to a Hub repo id at the end of step 12, so it can no
# longer serve as the upload source; the upload needs the local folder the
# model was saved to.
local_model_dir = "./trained_models/model_name"  # ⚠️ Keep in sync with the folder used in step 12
if not os.path.isdir(local_model_dir):
    raise FileNotFoundError(
        f"Local model folder not found: {local_model_dir} — run step 12 first or adjust the path."
    )

# Create the repo as private (pass private=False to make it public)
create_repo(repo_id, exist_ok=True, private=True)

# Upload entire trained model folder
upload_folder(
    repo_id=repo_id,
    folder_path=local_model_dir,
    path_in_repo=".",  # Upload everything from the folder
    repo_type="model"
)

print(f"Model uploaded to: https://huggingface.co/{repo_id}")

In [None]:
# ===========================
# 15. Finish wandb Logging
# ===========================
# Run this last: it closes the run opened by `wandb.init` in step 6, after the
# BLiMP and GLUE metrics have been logged.
wandb.finish()  # log the final metrics and mark the run as complete