# pipeline_deberta-v3-small.ipynb
This notebook contains some template code to help you with loading/preprocessing the data.

We start with some imports and constants.
The training data is found in the `data` subfolder.
There is also a tokenizer I've trained for you which you can use for the project.

The next cell only needs to be executed once per session — it does not depend on the execution environment.

In [None]:
# ========================================
# 1. Clone the Repository (if required)
# ========================================
TOKEN = "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"  # ← GitHub Personal Access Token (PAT)  ⚠️
# XXXXXXXXXXXXXXXXXXXXXXXXXXXX
!git clone https://{TOKEN}@github.com/KatsuhitoArasaka/BabyLM-Tiny.git  # your repository link  ⚠️
%cd BabyLM-Tiny

# ===============
# 2. Set Paths
# ===============
TRAIN_PATH = './data/train.txt'       # Path to your training data  ⚠️
DEV_PATH = './data/dev.txt'           # Path to your validation data  ⚠️
SPM_PATH = './data/tokenizer.model'   # Path to your tokenizer model  ⚠️

Execute the next cell again after every restart of the environment. Switching the device (CPU ↔ GPU) requires restarting the kernel.

In [None]:
# ==========================
# 3. Install Dependencies
# ==========================
%pip install transformers datasets wandb trl bitsandbytes huggingface_hub --quiet


# ======================================
# 4. Authentication with wandb and hf
# ======================================
import wandb
wandb.login()  # Enter your API key when prompted ⚠️

from huggingface_hub import login
login()  # Paste your HF token from https://huggingface.co/settings/tokens  ⚠️
# Token: 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


# =====================================
# 5. Import Libraries and Set Device
# =====================================
import subprocess
import json
import torch
import datasets
from functools import partial
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, set_seed
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from trl import SFTTrainer, SFTConfig

from transformers import set_seed
# Seed the random number generators through the transformers helper.
set_seed(42)


# Default to the CPU and switch to CUDA only when torch reports an available GPU.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'
print(f"Using device: {DEVICE}")

In [None]:
# ================================================
# 6. Start a new wandb run to track this script
# ================================================
# The config below is pure metadata attached to the run. It must mirror the values
# that are actually passed to the trainer (learning_rate = 2e-4, num_train_epochs = 10),
# otherwise the tracked experiment description is misleading.
run = wandb.init(
    # wandb entity (generally the team name) under which the project is logged.
    entity="Low-Resource_Pretraining",
    # wandb project that receives this run.
    project="NLP_LRP_BabyLM",
    # Hyperparameters and run metadata.
    config={
        "learning_rate": 2e-4,      # same value as `learning_rate` in the training arguments
        "architecture": "DeBERTa",  # free-text description of the model family used
        "epochs": 10,               # same value as `num_train_epochs` in the training arguments
    },
)

Here we load the dataset:

In [None]:
# ===================
# 7. Load Datasets
# ===================

def load_sentences(path):
    """Read a UTF-8 text file and return one ``{"text": ...}`` record per sentence.

    The 500k datasets cannot be split by newline, so the whole file is split on
    the period character instead. Each fragment is stripped, empty fragments are
    dropped, and a trailing period is added back to every kept fragment.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of dicts, each with a single ``"text"`` key.
    """
    with open(path, 'r', encoding='utf-8') as f:
        full_text = f.read().strip()
    fragments = (piece.strip() for piece in full_text.split('.'))
    return [{"text": fragment + "."} for fragment in fragments if fragment]


# Train and validation files go through exactly the same splitting logic.
train_data = load_sentences(TRAIN_PATH)
val_data = load_sentences(DEV_PATH)

# Wrap both splits in a single DatasetDict object.
dataset = datasets.DatasetDict({
    "train": datasets.Dataset.from_list(train_data),
    "validation": datasets.Dataset.from_list(val_data)
})

In [None]:
# =====================================================================================
# 9. Setup Model, Training, and Logging (This is the part that may change per model)
# =====================================================================================

# Model-specific settings: swap this block out when another model is used   ⚠️
model_name = "microsoft/deberta-v3-small"  # Example: DeBERTa model for MLM

# Only the configuration and the tokenizer are fetched by name; the network itself
# is instantiated from the configuration object via `from_config`.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_config(config)

# Collator that prepares batches for masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, seed=0)

# ===============================
# 10. Setup Training Arguments
# ===============================
# train.py
def preprocess_logits_for_metrics(logits, labels):
    """Collapse logits to predicted ids by taking the argmax over the last axis.

    Args:
        logits: Tensor of scores; the last dimension is reduced.
        labels: Unused; present because the trainer passes it to this hook.

    Returns:
        Tensor of indices with the last dimension of ``logits`` removed.
    """
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute accuracy over the positions whose label is not ``-100``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of equally shaped arrays.
            The predictions are already token ids, because
            ``preprocess_logits_for_metrics`` reduces the logits with argmax.

    Returns:
        ``{"acc": float}``: the fraction of scored positions where prediction
        and label agree, or ``0.0`` when no position is scored (this avoids
        the division by zero of an empty selection).
    """
    pred_ids, labels = eval_pred
    pred_ids = pred_ids.flatten()
    labels = labels.flatten()

    # Positions labelled -100 are excluded from the metric.
    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"acc": 0.0}

    n_correct = int((pred_ids[scored] == labels[scored]).sum())
    return {"acc": n_correct / n_scored}

# ======================
# 11. Train the Model
# ======================
# fp16 mixed precision and the 8-bit bitsandbytes optimizer are GPU features.
# They are enabled only when DEVICE is 'cuda', so the same cell also runs after
# the runtime has been switched to CPU (plain AdamW in full precision then).
use_cuda = DEVICE == 'cuda'

trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    data_collator = data_collator,
    # Reduce logits to predicted ids before they are accumulated for the metrics.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    args = SFTConfig(
        remove_unused_columns = True,
        label_names = ["labels"],
        dataset_num_proc = 12,
        packing = True,
        eval_packing = True,
        max_seq_length = 64,
        dataset_text_field = "text",
        eval_strategy = "steps",
        per_device_train_batch_size = 64,
        gradient_accumulation_steps = 1,
        warmup_ratio = 0.05,
        num_train_epochs = 10,
        learning_rate = 2e-4,
        fp16 = use_cuda,
        bf16 = False,
        logging_steps = 10,
        eval_steps = 100,
        save_steps = 100,
        optim = "adamw_8bit" if use_cuda else "adamw_torch",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 0,
        # output_dir = "",
        report_to = "none",
        eval_accumulation_steps=1,
        include_for_metrics=[],
        max_grad_norm=1,
    ),
)


trainer.train()

In [4]:
# ============================================================
# 12. Save the trained model locally (for later evaluation)
# ============================================================

# `save_path` identifies the model for the rest of the notebook: it is handed to the
# evaluation scripts in section 13 and used as `folder_path` for the upload in section 14.

# Option A (currently disabled): save into a folder inside the Colab environment.
# The folder will be deleted when the session ends unless uploaded elsewhere (e.g., Hugging Face, Google Drive)

# save_path = "./trained_models/model_name"  # ⚠️ Change `model_name` if training multiple models in one session
# trainer.save_model(save_path)        # Save model weights, config, etc.
# tokenizer.save_pretrained(save_path) # Save tokenizer (required for evaluation)

# Option B (active): point `save_path` at a Hugging Face repository id instead of a local folder.
# NOTE(review): nothing in this cell writes the freshly trained model anywhere — confirm the
# repository below already contains the weights that should be evaluated.
save_path = "n1k1t427/deberta-v3-small-babylm_childes-default-dataset"

In [None]:
# ===============================================================
# 13. Evaluate the model using BabyLM evaluation pipeline 2025
# ===============================================================

import os
import pathlib
import shutil

# --- Configuration ---
EVAL_PIPELINE_REPO = "evaluation-pipeline-2025"
TARGET_REPO_DIR = "./"  # Path to the BabyLM-Tiny repo (assumes you're already inside it)

ARCHITECTURE = "mlm"  # "mlm", "causal", or "mntp"  # ⚠️
EVAL_MODE = "zero-shot-fast"  # "zero-shot", "zero-shot-fast", or "finetune"  # ⚠️
EVAL_DATA_FOLDER = "fast_eval"  # "full_eval" or "fast_eval" for "zero-shot-fast" # ⚠️
# EVAL_DATA_FOLDER is ignored when using EVAL_MODE = "finetune"
# because finetuning tasks load datasets directly from HuggingFace (e.g., GLUE), not from evaluation_data/.

DATASET_NAME = "childes-default-dataset"  # the name of the dataset your model was trained on  # ⚠️
ENABLE_GIT_PUSH = True  # set to True to automatically commit and push the result  # ⚠️

model_id_path = pathlib.Path(save_path).name.replace("/", "_")  # for results directory structure

# --- Step 1: Clone evaluation pipeline if not present ---
# The clone has to happen BEFORE the evaluation data is copied: copying first would
# either fail (target directory missing) or create the directory and thereby make
# the existence check below skip the clone.
if not os.path.exists(EVAL_PIPELINE_REPO):
    subprocess.run(["git", "clone", "https://github.com/babylm/evaluation-pipeline-2025.git"], check=True)

# --- Step 1b: Copy the evaluation data into the pipeline checkout ---
# The data is looked up next to the notebook first ("BabyLM-Tiny/evaluation_data") and,
# as a fallback, inside TARGET_REPO_DIR, so the cell works from either working directory.
eval_data_src = next(
    (
        src
        for src in (
            pathlib.Path("BabyLM-Tiny") / "evaluation_data",
            pathlib.Path(TARGET_REPO_DIR) / "evaluation_data",
        )
        if src.is_dir()
    ),
    None,
)
if eval_data_src is None:
    # Best effort only: the data may already be present in the pipeline directory.
    print("⚠️ evaluation_data directory not found; nothing was copied.")
else:
    shutil.copytree(
        eval_data_src,
        pathlib.Path(EVAL_PIPELINE_REPO) / "evaluation_data",
        dirs_exist_ok=True,  # merge into an existing directory, like `cp -r`
    )

# --- Step 2: Run the appropriate evaluation script ---
if EVAL_MODE == "zero-shot":
    eval_script = "eval_zero_shot.sh"
    eval_dir = os.path.join("evaluation_data", EVAL_DATA_FOLDER)
    cmd = ["bash", eval_script, save_path, ARCHITECTURE, eval_dir]
    working_dir = EVAL_PIPELINE_REPO

elif EVAL_MODE == "zero-shot-fast":
    eval_script = "eval_zero_shot_fast.sh"
    revision_name = "default"  # can be adjusted for checkpoints
    eval_dir = os.path.join("evaluation_data", EVAL_DATA_FOLDER)
    cmd = ["bash", eval_script, save_path, revision_name, ARCHITECTURE, eval_dir]
    working_dir = EVAL_PIPELINE_REPO

elif EVAL_MODE == "finetune":
    eval_script = "eval_finetune.sh"
    cmd = ["bash", eval_script, save_path]
    working_dir = EVAL_PIPELINE_REPO

else:
    raise ValueError(f"Unknown EVAL_MODE: {EVAL_MODE}")

# check=True aborts the cell when the evaluation script exits with a non-zero status.
subprocess.run(cmd, cwd=working_dir, check=True)

# --- Step 3: Locate the results.txt file ---
if EVAL_MODE.startswith("zero-shot"):
    eval_subfolder = "zero_shot"
elif EVAL_MODE == "finetune":
    eval_subfolder = "finetune"
else:
    raise RuntimeError("Unexpected EVAL_MODE value")

results_root = pathlib.Path(EVAL_PIPELINE_REPO) / "results" / model_id_path / "main" / eval_subfolder

# Search recursively for results.txt and stop at the FIRST match. `next` on the
# generator ends the whole directory walk, not just the scan of one directory.
candidate = next(
    (
        os.path.join(root, "results.txt")
        for root, _, files in os.walk(results_root)
        if "results.txt" in files
    ),
    None,
)
if not candidate:
    raise FileNotFoundError("Evaluation completed, but results.txt was not found.")

In [None]:
# --- Start a wandb run only if none has been launched yet ---
# `wandb.run` is None while no run is active; an already active run is reused as-is
# instead of being initialised a second time.
run = wandb.run
if run is None:
    run = wandb.init(
        name=f"{model_id_path}_{DATASET_NAME}_{EVAL_MODE}",
        entity="Low-Resource_Pretraining",
        project="NLP_LRP_BabyLM",
        # Metadata only; kept in sync with the values passed to the training arguments.
        config={
            "learning_rate": 2e-4,
            "architecture": "DeBERTa",
            "epochs": 10,
        },
    )


In [None]:
# --- Step 4: Log evaluation results to wandb ---
import html  # stdlib; used to escape the plain-text results before embedding them as HTML

# Explicit encoding so reading does not depend on the platform default.
with open(candidate, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Construct a clear log name: <model_id>_<dataset>_<eval_mode>
log_name = f"{model_id_path}_{DATASET_NAME}_{EVAL_MODE}"
# The file is plain text: escape it so characters such as "<" or "&" are shown
# literally, then join the lines with HTML line breaks.
wandb.log({log_name: wandb.Html("<br>".join(html.escape(line) for line in lines))})

# --- Step 5: Save results using <dataset>_<eval_mode>.txt format ---
output_dir = (
    pathlib.Path(TARGET_REPO_DIR)
    / "models_evaluation_results"
    / "deberta-v3-small_results"  # ⚠️
)
output_dir.mkdir(parents=True, exist_ok=True)

result_file_path = output_dir / f"{DATASET_NAME}_{EVAL_MODE}.txt"
shutil.copy(candidate, result_file_path)

print(f"✅ Evaluation complete. Results saved to {result_file_path}")

# --- Step 6: Optionally commit and push the result to GitHub ---
if ENABLE_GIT_PUSH:
    git_steps = [
        ["git", "add", str(result_file_path)],
        ["git", "commit", "-m", f"Add {EVAL_MODE} evaluation results for {DATASET_NAME}"],
        ["git", "push"],
    ]
    # Every step is still attempted (e.g. `git commit` exits non-zero when there is
    # nothing new to commit), but a failing step is now reported instead of ignored.
    for git_cmd in git_steps:
        completed = subprocess.run(git_cmd, cwd=TARGET_REPO_DIR)
        if completed.returncode != 0:
            step_name = " ".join(git_cmd[:2])
            print(f"⚠️ '{step_name}' exited with code {completed.returncode}")

In [None]:
# ===============================================
# 14. Upload Trained Model to Hugging Face Hub
# ===============================================

import os

from huggingface_hub import create_repo, upload_folder

hf_repo_name = "deberta-v3-small-babylm"  # Change this to a unique name for your model       ⚠️
hf_username = "n1k1t427"          # Replace with your actual Hugging Face username    ⚠️
repo_id = f"{hf_username}/{hf_repo_name}"

# Create the repo if it does not exist yet. It is created PRIVATE;
# use private=False for a public repo.
create_repo(repo_id, exist_ok=True, private=True)

# `upload_folder` needs a directory on the local file system. `save_path` may hold a
# Hub repository id instead of a folder; in that case the trained model and tokenizer
# are first written to a local directory, and that directory is uploaded.
local_model_dir = save_path
if not os.path.isdir(local_model_dir):
    local_model_dir = os.path.join("trained_models", hf_repo_name)
    trainer.save_model(local_model_dir)         # model weights, config, etc.
    tokenizer.save_pretrained(local_model_dir)  # tokenizer files (required for evaluation)

# Upload entire trained model folder
upload_folder(
    repo_id=repo_id,
    folder_path=local_model_dir,
    path_in_repo=".",  # Upload everything from the folder
    repo_type="model"
)

print(f"Model uploaded to: https://huggingface.co/{repo_id}")

In [None]:
# ===========================
# 15. Finish wandb Logging
# ===========================
# Run this last: it ends the wandb run that was opened with wandb.init earlier in the notebook.
wandb.finish()  # log the final metrics and mark the run as complete