In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# model = AutoModelForCausalLM.from_pretrained("idhant297/dense-5l-arxiv_code_simplestories")

# config = GPT2Config.from_pretrained("idhant297/dense-5l-arxiv_code_simplestories")

from transformers import GPT2LMHeadModel, GPT2Config

# First, load the config
config = GPT2Config.from_pretrained("idhant297/dense-5l-arxiv_code_simplestories")

# Then load the model
model = GPT2LMHeadModel.from_pretrained("idhant297/dense-5l-arxiv_code_simplestories", config=config)

In [None]:
# Let's examine the original checkpoint structure
import torch

# Load the original checkpoint
checkpoint = torch.load("best_val_loss_dense_step_9000.pt", map_location="cpu")

print("Original checkpoint keys:")
for k in checkpoint["model_state_dict"].keys():
    print(f"  {k}: {checkpoint['model_state_dict'][k].shape}")

print("\nFirst few parameters:")
for i, (k, v) in enumerate(checkpoint["model_state_dict"].items()):
    if i < 5:
        print(f"  {k}: {v.shape}")


Original checkpoint keys:
  tok_embedding.weight: torch.Size([50257, 768])
  layers.0.attention.c_attn.weight: torch.Size([2304, 768])
  layers.0.attention.c_proj.weight: torch.Size([768, 768])
  layers.0.ffn.w_1.weight: torch.Size([3072, 768])
  layers.0.ffn.w_1.bias: torch.Size([3072])
  layers.0.ffn.w_2.weight: torch.Size([3072, 768])
  layers.0.ffn.w_2.bias: torch.Size([3072])
  layers.0.ffn.out.weight: torch.Size([768, 3072])
  layers.0.ffn.out.bias: torch.Size([768])
  layers.0.attn_norm.w: torch.Size([768])
  layers.0.ffn_norm.w: torch.Size([768])
  layers.1.attention.c_attn.weight: torch.Size([2304, 768])
  layers.1.attention.c_proj.weight: torch.Size([768, 768])
  layers.1.ffn.w_1.weight: torch.Size([3072, 768])
  layers.1.ffn.w_1.bias: torch.Size([3072])
  layers.1.ffn.w_2.weight: torch.Size([3072, 768])
  layers.1.ffn.w_2.bias: torch.Size([3072])
  layers.1.ffn.out.weight: torch.Size([768, 3072])
  layers.1.ffn.out.bias: torch.Size([768])
  layers.1.attn_norm.w: torch.Size([

In [2]:
# Fixed conversion function based on actual checkpoint structure
import json
import torch
from huggingface_hub import HfApi, create_repo

# 1. Create config.json - Update vocab_size to match actual model (50257 instead of 50256)
config_dict = {
    "model_type": "gpt2",
    "vocab_size": 50257,  # Updated to match actual model
    "n_positions": 1024,
    "n_embd": 768,
    "n_layer": 5,
    "n_head": 12,
    "n_inner": 3072,  # Usually 4 * n_embd for GPT-2
    "activation_function": "gelu_new",
    "resid_pdrop": 0.1,
    "embd_pdrop": 0.1,
    "attn_pdrop": 0.1,
    "layer_norm_epsilon": 1e-06,
    "initializer_range": 0.02,
    "bos_token_id": 50256,
    "eos_token_id": 50256,
    "architectures": ["GPT2LMHeadModel"],
    "task_specific_params": {
        "text-generation": {
            "do_sample": True,
            "max_length": 50
        }
    }
}

# Save updated config
with open("config.json", "w") as f:
    json.dump(config_dict, f, indent=2)

# 2. Convert your checkpoint to HuggingFace format with correct weight mapping
def convert_checkpoint_to_hf(checkpoint_path, output_path):
    # Load your checkpoint
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    
    if "model_state_dict" in checkpoint:
        state_dict = checkpoint["model_state_dict"]
    else:
        state_dict = checkpoint
    
    # Map your model's keys to GPT2 keys
    hf_state_dict = {}
    
    for key, value in state_dict.items():
        # Token embeddings
        if "tok_embedding.weight" in key:
            hf_state_dict["transformer.wte.weight"] = value
        # Output layer
        elif "output.weight" in key:
            hf_state_dict["lm_head.weight"] = value
        # Final layer norm - note: your model uses RMSNorm with 'w' parameter
        elif "norm.w" in key and "layers" not in key:
            hf_state_dict["transformer.ln_f.weight"] = value
        # Transformer layers
        elif "layers" in key:
            # Parse layer number
            layer_num = int(key.split(".")[1])
            
            # Attention weights - need to transpose for GPT2 format
            if "attention.c_attn.weight" in key:
                # Your model: (2304, 768) -> GPT2: (768, 2304)
                hf_state_dict[f"transformer.h.{layer_num}.attn.c_attn.weight"] = value.T
            elif "attention.c_proj.weight" in key:
                # Your model: (768, 768) -> GPT2: (768, 768) - need to transpose
                hf_state_dict[f"transformer.h.{layer_num}.attn.c_proj.weight"] = value.T
            # Layer norms - note: your model uses RMSNorm with 'w' parameter
            elif "attn_norm.w" in key:
                hf_state_dict[f"transformer.h.{layer_num}.ln_1.weight"] = value
            elif "ffn_norm.w" in key:
                hf_state_dict[f"transformer.h.{layer_num}.ln_2.weight"] = value
            # FFN weights - include bias terms from your model
            elif "ffn.w_1.weight" in key:
                # Your model: (3072, 768) -> GPT2: (768, 3072)
                hf_state_dict[f"transformer.h.{layer_num}.mlp.c_fc.weight"] = value.T
            elif "ffn.w_1.bias" in key:
                # Include bias for c_fc
                hf_state_dict[f"transformer.h.{layer_num}.mlp.c_fc.bias"] = value
            elif "ffn.out.weight" in key:
                # Your model: (768, 3072) -> GPT2: (3072, 768)
                hf_state_dict[f"transformer.h.{layer_num}.mlp.c_proj.weight"] = value.T
            elif "ffn.out.bias" in key:
                # Include bias for c_proj
                hf_state_dict[f"transformer.h.{layer_num}.mlp.c_proj.bias"] = value
    
    # Add missing components that GPT2 expects but your model doesn't have
    # 1. Positional embeddings - your model uses RoPE, so initialize to zeros
    hf_state_dict["transformer.wpe.weight"] = torch.zeros(1024, 768)
    
    # 2. Layer norm bias terms - your model uses RMSNorm without bias
    hf_state_dict["transformer.ln_f.bias"] = torch.zeros(768)
    for layer_num in range(5):  # 5 layers
        hf_state_dict[f"transformer.h.{layer_num}.ln_1.bias"] = torch.zeros(768)
        hf_state_dict[f"transformer.h.{layer_num}.ln_2.bias"] = torch.zeros(768)
    
    # 3. Attention bias terms - your model doesn't have these
    for layer_num in range(5):  # 5 layers
        hf_state_dict[f"transformer.h.{layer_num}.attn.c_attn.bias"] = torch.zeros(2304)
        hf_state_dict[f"transformer.h.{layer_num}.attn.c_proj.bias"] = torch.zeros(768)
    
    # Save as pytorch_model.bin
    torch.save(hf_state_dict, output_path)

# Convert your checkpoint
# convert_checkpoint_to_hf("best_val_loss_dense_step_9000.pt", "pytorch_model.bin")
# print("Conversion complete!")


In [3]:
def test_converted_model(weight_path, test_input):
    """Test the converted model by loading it and running a simple generation."""
    # Test the conversion
    from transformers import GPT2LMHeadModel, GPT2Config
    import torch

    # Load config from your local file
    config = GPT2Config.from_pretrained("./")  # Current directory

    # Initialize model
    model = GPT2LMHeadModel(config)

    # Load your converted weights (using the latest converted model)
    state_dict = torch.load(weight_path, map_location="cpu")

    # print("Converted state_dict keys:")
    # for k in sorted(state_dict.keys()):
    #     print(f"  {k}: {state_dict[k].shape}")

    # print("\nExpected model keys:")
    # for k in sorted(model.state_dict().keys()):
    #     print(f"  {k}: {model.state_dict()[k].shape}")

    # Try loading with strict=False first to see what happens
    try:
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        
        print(f"\nLoading report:")
        print(f"Missing keys: {len(missing_keys)}")
        if missing_keys:
            print("  ", missing_keys[:5], "..." if len(missing_keys) > 5 else "")
            
        print(f"Unexpected keys: {len(unexpected_keys)}")
        if unexpected_keys:
            print("  ", unexpected_keys[:5], "..." if len(unexpected_keys) > 5 else "")
            
        if len(missing_keys) == 0 and len(unexpected_keys) == 0:
            print("✅ Perfect conversion!")
            
            # Test the model with a simple generation
            model.eval()
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            
            # Test generation
            inputs = tokenizer(test_input, return_tensors="pt")
            
            with torch.no_grad():
                outputs = model.generate(inputs["input_ids"], max_length=20, do_sample=True, temperature=0.7)
            
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"\nTest generation:")
            print(f"Input: {test_input}")
            print(f"Output: {generated_text}")
            
        else:
            print("⚠️  Conversion completed with some mismatches")
            
    except Exception as e:
        print(f"❌ Error during loading: {e}")


In [7]:
import os
import glob

# Create output directory for converted models
output_dir = "dense-converted/"
os.makedirs(output_dir, exist_ok=True)

# Find all .pt files in the @/dense directory
dense_dir = "dense/"
if os.path.exists(dense_dir):
    pt_files = glob.glob(os.path.join(dense_dir, "*.pt"))
    
    if pt_files:
        print(f"Found {len(pt_files)} checkpoint files in {dense_dir}:")
        for pt_file in pt_files:
            print(f"  - {os.path.basename(pt_file)}")
        
        # Convert each checkpoint
        for pt_file in pt_files:
            base_name = os.path.splitext(os.path.basename(pt_file))[0]
            output_name = os.path.join(output_dir, f"{base_name}.bin")
            
            print(f"\n{'='*50}")
            print(f"Converting: {os.path.basename(pt_file)} -> {output_name}")
            print(f"{'='*50}")
            
            convert_checkpoint_to_hf(pt_file, output_name)
            print("conversion complete")
            # test_converted_model(output_name, "the quick brown fox jumps over the lazy dog")
    else:
        print(f"No .pt files found in {dense_dir}")
else:
    print(f"Directory {dense_dir} does not exist")
    # Fallback to original single file conversion
    # convert_checkpoint_to_hf("best_val_loss_dense_step_9000.pt", os.path.join(output_dir, "best_val_loss_dense_step_9000.bin"))
    # test_converted_model(os.path.join(output_dir, "best_val_loss_dense_step_9000.bin"), "the quick brown fox jumps over the lazy dog")

Found 34 checkpoint files in dense/:
  - best_val_loss_dense_step_5100.pt
  - best_val_loss_dense_step_8400.pt
  - best_val_loss_dense_step_300.pt
  - best_val_loss_dense_step_3000.pt
  - best_val_loss_dense_step_2700.pt
  - best_val_loss_dense_step_6600.pt
  - dense_step_8000.pt
  - best_val_loss_dense_step_3600.pt
  - best_val_loss_dense_step_5700.pt
  - best_val_loss_dense_step_2100.pt
  - dense_step_9000.pt
  - best_val_loss_dense_step_6000.pt
  - best_val_loss_dense_step_7500.pt
  - best_val_loss_dense_step_6300.pt
  - best_val_loss_dense_step_1800.pt
  - dense_step_5000.pt
  - dense_step_7000.pt
  - best_val_loss_dense_step_5400.pt
  - best_val_loss_dense_step_1500.pt
  - best_val_loss_dense_step_4200.pt
  - best_val_loss_dense_step_3900.pt
  - best_val_loss_dense_step_8100.pt
  - best_val_loss_dense_step_7800.pt
  - best_val_loss_dense_step_9000.pt
  - dense_step_6000.pt
  - best_val_loss_dense_step_6900.pt
  - best_val_loss_dense_step_4500.pt
  - best_val_loss_dense_step_1200.p

In [9]:
# Save GPT-2 tokenizer files (vocab.json and merges.txt) for HuggingFace upload
from transformers import GPT2Tokenizer
import json
import os

# Define output directory (adjust as needed)
output_dir = "./"  # Current directory, or specify your desired path

print("Saving GPT-2 tokenizer files...")

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Method 1: The easy way - let transformers handle it
tokenizer.save_pretrained(output_dir)

# Or Method 2: Manual save (if you need more control)
# Save vocab.json
vocab_path = os.path.join(output_dir, "vocab.json")
with open(vocab_path, 'w', encoding='utf-8') as f:
    json.dump(tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)

# Save merges.txt  
merges_path = os.path.join(output_dir, "merges.txt")
with open(merges_path, 'w', encoding='utf-8') as f:
    bpe_merges = tokenizer.bpe_ranks
    if bpe_merges:
        f.write("#version: 0.2\n")
        for merge_tuple in bpe_merges:
            f.write(f"{merge_tuple[0]} {merge_tuple[1]}\n")

# Save tokenizer_config.json
tokenizer_config_path = os.path.join(output_dir, "tokenizer_config.json")
tokenizer_config = {
    "model_max_length": 1024,
    "tokenizer_class": "GPT2Tokenizer",
    "bos_token": "<|endoftext|>",
    "eos_token": "<|endoftext|>",
    "unk_token": "<|endoftext|>",
    "pad_token": "<|endoftext|>"
}
with open(tokenizer_config_path, 'w', encoding='utf-8') as f:
    json.dump(tokenizer_config, f, indent=2)

print(f"Saved tokenizer files:")
print(f"  - {vocab_path}")
print(f"  - {merges_path}")
print(f"  - {tokenizer_config_path}")

Saving GPT-2 tokenizer files...
Saved tokenizer files:
  - ./vocab.json
  - ./merges.txt
  - ./tokenizer_config.json
