converter2.py

In [None]:
# convert_train_gpt2_checkpoint.py
# This script converts a checkpoint to a format compatible with transformers.GPT2LMHeadModel

import sys
import os
import struct
import numpy as np
import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Input/output locations used by the `__main__` entry point at the bottom of
# this cell (the script takes no command-line arguments).
# INPUT_FILE_PATH:  binary checkpoint to convert (256-int header + parameter blob).
# OUTPUT_FILE_PATH: destination of the converted state dict written by torch.save.
INPUT_FILE_PATH = "/content/model_00037500.bin"  # Replace with actual input file path
OUTPUT_FILE_PATH = "/content/converted_model_00037500.pt"  # Replace with actual output file path

def read_header(f):
    """Read the fixed-size checkpoint header from the binary stream `f`.

    The header consists of 256 little-endian signed 32-bit integers
    (1024 bytes in total).

    Returns:
        A tuple of 256 Python ints.

    Raises:
        RuntimeError: if the stream yields fewer bytes than a full header.
    """
    n_ints = 256
    n_bytes = n_ints * 4
    blob = f.read(n_bytes)
    if len(blob) != n_bytes:
        raise RuntimeError("Header too short")
    return struct.unpack(f"<{n_ints}i", blob)

def bfloat16_to_float32(bf16_arr_uint16):
    """Expand raw bfloat16 bit patterns into float32 values.

    `bf16_arr_uint16` is a numpy uint16 array whose elements are the 16-bit
    bfloat16 encodings. Each one is widened to 32 bits and moved into the
    upper half of the word (the lower 16 bits stay zero); the resulting
    bit pattern is then reinterpreted as float32.
    """
    widened = bf16_arr_uint16.astype(np.uint32)
    as_f32_bits = np.left_shift(widened, 16)
    return as_f32_bits.view(np.float32)

def _to_f32_tensor(arr):
    """Copy a numpy array (any strides) into a fresh C-contiguous float32 torch tensor."""
    return torch.from_numpy(np.array(arr, dtype=np.float32, order="C"))


def main(inp_path, out_path):
    """Convert a binary GPT-2 checkpoint into a HuggingFace GPT2LMHeadModel state dict.

    The input file is a 256-int header (read with `read_header`) followed by
    one flat parameter blob. Header version 3 means float32 parameters and
    version 5 means bfloat16; any other version is tried as float32.

    The blob is cut into the 16 parameter groups listed in `layout` below,
    renamed to HuggingFace keys, validated against a freshly constructed
    `GPT2LMHeadModel`, loaded into it, and the model's state dict is written
    to `out_path` with `torch.save`.

    Args:
        inp_path: path of the binary checkpoint to read.
        out_path: path the converted state dict is written to.

    Raises:
        RuntimeError: if the header is inconsistent, the blob is shorter than
            the header implies, or a converted tensor does not match the
            shape HuggingFace expects.
        ValueError: if the blob length is not a multiple of the element size.
    """
    print("Loading:", inp_path)
    with open(inp_path, "rb") as f:
        header = read_header(f)
        magic = header[0]
        version = header[1]
        max_seq_len = int(header[2])
        vocab_size = int(header[3])
        num_layers = int(header[4])
        num_heads = int(header[5])
        channels = int(header[6])
        padded_vocab_size = int(header[7])

        print("Header:")
        print(" magic:", magic)
        print(" version:", version)
        print(" max_seq_len:", max_seq_len)
        print(" vocab_size:", vocab_size)
        print(" num_layers:", num_layers)
        print(" num_heads:", num_heads)
        print(" channels:", channels)
        print(" padded_vocab_size:", padded_vocab_size)

        # remaining bytes = parameters blob
        raw = f.read()
    total_bytes = len(raw)
    print("Parameter blob bytes:", total_bytes)

    # 20240326 is the magic number this converter was written against; a
    # different value is reported but not fatal (best-effort conversion).
    if magic != 20240326:
        print("WARNING: unexpected magic number", magic, "- file may not be a GPT-2 checkpoint")

    # Fail early with a readable message instead of a reshape/shape error later.
    if min(max_seq_len, vocab_size, num_layers, num_heads, channels, padded_vocab_size) <= 0:
        raise RuntimeError("Invalid header: all model dimensions must be positive")
    if vocab_size > padded_vocab_size:
        raise RuntimeError(
            f"Invalid header: vocab_size ({vocab_size}) > padded_vocab_size ({padded_vocab_size})"
        )
    if channels % num_heads != 0:
        raise RuntimeError(
            f"Invalid header: channels ({channels}) not divisible by num_heads ({num_heads})"
        )

    # interpret dtype from version
    # in train_gpt2.cu they used: version == 3 => FP32, else (5) => BF16
    if version == 5:
        element_bytes = 2
    else:
        if version != 3:
            # fallback: try float32
            print("Unknown version in header; trying float32")
        element_bytes = 4
    if total_bytes % element_bytes != 0:
        raise ValueError(
            f"Parameter blob size {total_bytes} is not a multiple of {element_bytes} bytes"
        )
    # The header is little-endian, so decode the blob as little-endian too.
    if element_bytes == 2:
        params = bfloat16_to_float32(np.frombuffer(raw, dtype="<u2"))
    else:
        params = np.frombuffer(raw, dtype="<f4")

    total_floats = params.size
    print("Total floats in blob:", total_floats)

    Vp = padded_vocab_size
    C = channels
    maxT = max_seq_len
    L = num_layers

    # Order and shapes of the 16 parameter groups inside the blob.
    # Every matmul weight is stored as (out_features, in_features).
    layout = [
        ("wte", (Vp, C)),
        ("wpe", (maxT, C)),
        ("ln1w", (L, C)),
        ("ln1b", (L, C)),
        ("qkvw", (L, 3 * C, C)),
        ("qkvb", (L, 3 * C)),
        ("attprojw", (L, C, C)),
        ("attprojb", (L, C)),
        ("ln2w", (L, C)),
        ("ln2b", (L, C)),
        ("fcw", (L, 4 * C, C)),
        ("fcb", (L, 4 * C)),
        ("fcprojw", (L, C, 4 * C)),
        ("fcprojb", (L, C)),
        ("lnfw", (C,)),
        ("lnfb", (C,)),
    ]
    param_counts = []
    for _, shape in layout:
        count = 1
        for dim in shape:
            count *= dim
        param_counts.append(count)

    expected_total = sum(param_counts)
    print("Expected float count by summing param sizes:", expected_total)
    if total_floats < expected_total:
        # Slicing past the end would only fail later with an opaque reshape error.
        raise RuntimeError(
            f"Checkpoint truncated: header implies {expected_total} parameters "
            f"but the blob holds only {total_floats}"
        )
    if expected_total != total_floats:
        print("WARNING: expected_total != total_floats. This might still work if padding present.")
        print(" expected_total:", expected_total)
        print(" total_floats:", total_floats)

    # Cut the flat blob into named, correctly shaped views (no copies yet).
    groups = {}
    offset = 0
    for (name, shape), count in zip(layout, param_counts):
        groups[name] = params[offset:offset + count].reshape(shape)
        offset += count
    print("Sliced parameters, final offset:", offset, " / ", total_floats)

    # container for parameters re-ordered to HF names
    hf_state = {}

    # Token embeddings: drop the padding rows so the table is (vocab_size, C).
    hf_state["transformer.wte.weight"] = _to_f32_tensor(groups["wte"][:vocab_size, :])
    print("wte done", hf_state["transformer.wte.weight"].shape)

    hf_state["transformer.wpe.weight"] = _to_f32_tensor(groups["wpe"])
    print("wpe done", hf_state["transformer.wpe.weight"].shape)

    for i in range(L):
        prefix = f"transformer.h.{i}."
        hf_state[prefix + "ln_1.weight"] = _to_f32_tensor(groups["ln1w"][i])
        hf_state[prefix + "ln_1.bias"] = _to_f32_tensor(groups["ln1b"][i])

        # HF GPT-2 uses Conv1D layers whose weight is (in_features, out_features),
        # so EVERY matmul weight must be transposed. This includes the square
        # (C, C) attention projection, where a missing transpose cannot be
        # detected by a shape check and would silently corrupt the model.
        hf_state[prefix + "attn.c_attn.weight"] = _to_f32_tensor(groups["qkvw"][i].T)
        hf_state[prefix + "attn.c_attn.bias"] = _to_f32_tensor(groups["qkvb"][i])
        hf_state[prefix + "attn.c_proj.weight"] = _to_f32_tensor(groups["attprojw"][i].T)
        hf_state[prefix + "attn.c_proj.bias"] = _to_f32_tensor(groups["attprojb"][i])

        hf_state[prefix + "ln_2.weight"] = _to_f32_tensor(groups["ln2w"][i])
        hf_state[prefix + "ln_2.bias"] = _to_f32_tensor(groups["ln2b"][i])

        hf_state[prefix + "mlp.c_fc.weight"] = _to_f32_tensor(groups["fcw"][i].T)
        hf_state[prefix + "mlp.c_fc.bias"] = _to_f32_tensor(groups["fcb"][i])
        hf_state[prefix + "mlp.c_proj.weight"] = _to_f32_tensor(groups["fcprojw"][i].T)
        hf_state[prefix + "mlp.c_proj.bias"] = _to_f32_tensor(groups["fcprojb"][i])

    hf_state["transformer.ln_f.weight"] = _to_f32_tensor(groups["lnfw"])
    hf_state["transformer.ln_f.bias"] = _to_f32_tensor(groups["lnfb"])

    # LM head: tie to wte (HuggingFace typically uses tied embeddings)
    hf_state["lm_head.weight"] = hf_state["transformer.wte.weight"]

    # Create a GPT-2 model and load to validate shapes
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=max_seq_len,
        n_ctx=max_seq_len,
        n_embd=C,
        n_layer=L,
        n_head=num_heads,
    )
    print("Creating HuggingFace GPT2 model to validate shapes...")
    model = GPT2LMHeadModel(config)
    model_state = model.state_dict()

    # Align converted tensors with the model's keys. Shape mismatches are
    # collected and reported together; guessing a transpose here could hide
    # a real layout error, so none is attempted.
    new_state = {}
    missing = []
    mismatched = []
    for key, reference in model_state.items():
        if key not in hf_state:
            missing.append(key)
            continue
        converted = hf_state[key]
        if tuple(converted.shape) != tuple(reference.shape):
            mismatched.append(
                f"{key}: converted {tuple(converted.shape)} vs model {tuple(reference.shape)}"
            )
            continue
        new_state[key] = converted

    if mismatched:
        raise RuntimeError("Shape mismatch for converted parameters:\n  " + "\n  ".join(mismatched))

    if missing:
        print("WARNING: The following model keys were not found in converted params (they will remain random):")
        print(missing)

    # Non-strict so that keys reported as missing above do not abort the load.
    print("Loading into HF model (non-strict)...")
    model.load_state_dict(new_state, strict=False)

    # Save converted state dict
    print("Saving converted state dict to:", out_path)
    torch.save(model.state_dict(), out_path)
    # The output is a bare state dict, not a from_pretrained directory.
    print("Done. Load it with model.load_state_dict(torch.load(path)) on a GPT2LMHeadModel built from a matching GPT2Config.")

if __name__ == "__main__":
    # Script entry point: run the conversion using the module-level
    # INPUT_FILE_PATH / OUTPUT_FILE_PATH constants rather than command-line
    # arguments, echoing both paths first so the run is self-describing.
    print(f"Converting model from: {INPUT_FILE_PATH}")
    print(f"Saving converted model to: {OUTPUT_FILE_PATH}")
    main(INPUT_FILE_PATH, OUTPUT_FILE_PATH)

Converting model from: /content/model_00037500.bin
Saving converted model to: /content/converted_model_00037500.pt
Loading: /content/model_00037500.bin
Header:
 magic: 20240326
 version: 5
 max_seq_len: 1024
 vocab_size: 50257
 num_layers: 12
 num_heads: 12
 channels: 768
 padded_vocab_size: 50304
Parameter blob bytes: 248951808
Total floats in blob: 124475904
Expected float count by summing param sizes: 124475904
wte done torch.Size([50257, 768])
wpe done torch.Size([1024, 768])
Sliced parameters, final offset: 124475904  /  124475904
Creating HuggingFace GPT2 model to validate shapes...
Shape mismatch for transformer.h.0.mlp.c_fc.weight: converted (3072, 768) vs model (768, 3072)
Attempting to transpose to match shape.
Shape mismatch for transformer.h.0.mlp.c_proj.weight: converted (768, 3072) vs model (3072, 768)
Attempting to transpose to match shape.
Shape mismatch for transformer.h.1.mlp.c_fc.weight: converted (3072, 768) vs model (768, 3072)
Attempting to transpose to match shap

In [None]:
import os
import torch
import torch.nn.functional as F
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tqdm import tqdm
import json
import requests

# ---------------------------------------------------------------------
# Settings
# CHECKPOINT_PATH: state-dict file loaded with torch.load further down.
CHECKPOINT_PATH = "/content/converted_model_00037500.pt"   # <-- change this to your checkpoint
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Directory where downloaded dataset files are cached; created on first run.
DATA_CACHE_DIR = "./hellaswag"
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# Maps split name -> download URL. The dataset-loading loop below replaces
# each URL with the parsed examples of that split.
hellaswags = {
    "val": "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
}


# ---------------------------------------------------------------------
# Load dataset
def load_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per line.

    The file is read as UTF-8 regardless of the platform's default encoding,
    and blank or whitespace-only lines (e.g. a trailing newline at the end
    of the file) are skipped instead of raising a JSON decode error.

    Args:
        path: path of the .jsonl file.

    Returns:
        A list of the decoded JSON values, in file order.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Download each split once into DATA_CACHE_DIR, then replace its URL in
# `hellaswags` with the parsed list of examples.
# Iterate over a snapshot because the dict's values are reassigned in the loop.
for split, url in list(hellaswags.items()):
    cache_path = os.path.join(DATA_CACHE_DIR, f"{split}.jsonl")
    if not os.path.exists(cache_path):
        r = requests.get(url, timeout=60)
        # Without this, an HTTP error page would be cached as the dataset
        # and every later run would fail while parsing it.
        r.raise_for_status()
        # Write to a temporary name and rename, so an interrupted download
        # never leaves a partial file at the cache path.
        tmp_path = cache_path + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, cache_path)
    hellaswags[split] = load_jsonl(cache_path)

# ---------------------------------------------------------------------
# Load tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# ---------------------------------------------------------------------
# Load your custom model checkpoint
print(f"Loading custom GPT-2 model from {CHECKPOINT_PATH}...")
from transformers import GPT2Config  # needed only here; this cell's import list lacks it

config = {
    "vocab_size": tokenizer.vocab_size,
    "n_positions": 1024,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
}

# Build the architecture from `config` with fresh weights instead of
# downloading the pretrained "gpt2" weights: if the checkpoint failed to
# cover some parameter, the evaluation must not silently fall back to the
# pretrained values.
model = GPT2LMHeadModel(GPT2Config(**config))
# NOTE(security): torch.load unpickles the file; only load checkpoints you
# produced yourself or otherwise trust.
state_dict = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
load_result = model.load_state_dict(state_dict, strict=False)

# Keys that may legitimately be absent: the attention-mask buffers (which
# some transformers versions do not store in state dicts) and the LM head,
# which shares its weight with the token embedding.
_optional_suffixes = (".attn.bias", ".attn.masked_bias")
_missing = [
    k for k in load_result.missing_keys
    if not k.endswith(_optional_suffixes) and k != "lm_head.weight"
]
if _missing:
    raise RuntimeError(f"Checkpoint {CHECKPOINT_PATH} is missing parameters: {_missing}")
if load_result.unexpected_keys:
    print("WARNING: ignoring unexpected checkpoint keys:", load_result.unexpected_keys)

model.to(DEVICE)
model.eval()

# ---------------------------------------------------------------------
# Evaluation
def score_completion(prompt, continuation):
    """Score `continuation` as an ending of `prompt`; higher is better.

    Returns the negative mean cross-entropy of the continuation tokens only,
    conditioned on the prompt. The prompt tokens are excluded from the
    average: they are identical for every candidate ending of an example,
    so including them would only dilute the score by an amount that depends
    on each ending's length.

    Args:
        prompt: context text.
        continuation: candidate ending; it is joined to the prompt with a
            single space, as before.

    Returns:
        A Python float: minus the average per-token loss of the continuation.
    """
    ctx_ids = tokenizer.encode(prompt)
    end_ids = tokenizer.encode(" " + continuation)
    if not ctx_ids:
        # An empty prompt leaves the first ending token with nothing to be
        # predicted from; condition it on the end-of-text token instead.
        ctx_ids = [tokenizer.eos_token_id]

    input_ids = torch.tensor([ctx_ids + end_ids], dtype=torch.long, device=DEVICE)
    with torch.no_grad():
        logits = model(input_ids).logits

    # logits[:, t] predicts token t + 1, so the predictions for the ending
    # are the last len(end_ids) positions shifted one step to the left.
    n_end = len(end_ids)
    ending_logits = logits[0, -n_end - 1:-1, :].float()
    ending_targets = input_ids[0, -n_end:]
    loss = F.cross_entropy(ending_logits, ending_targets)
    return -loss.item()

# Accuracy over the validation split: an example counts as correct when the
# highest-scoring candidate ending is the labelled one (first one wins ties).
correct = 0
total = 0
for example in tqdm(hellaswags["val"], desc="Evaluating"):
    context = example["ctx"]
    candidates = example["endings"]
    gold = int(example["label"])

    candidate_scores = [score_completion(context, ending) for ending in candidates]
    best_index, _ = max(enumerate(candidate_scores), key=lambda pair: pair[1])

    if best_index == gold:
        correct += 1
    total += 1

print(f"HellaSwag accuracy: {correct/total:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading custom GPT-2 model from /content/converted_model_00037500.pt...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Evaluating:   0%|          | 0/10042 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Evaluating: 100%|██████████| 10042/10042 [08:34<00:00, 19.51it/s]

HellaSwag accuracy: 0.2471



