In [None]:
import os
import shutil
import sys
import warnings
from dataclasses import asdict

import torch
from transformers import AutoTokenizer

sys.path.append('../')
from configuration_tiny_gpt import TinyGPTConfig
from modeling_tiny_gpt import TinyGPTForCausalLM
from src.models.dense.config import ModelConfig

In [None]:
def convert_checkpoint_to_hf(
    checkpoint_path,
    output_dir,
    config_dict,
    tokenizer_name="gpt2"
):
    """Convert a training checkpoint into a HuggingFace-format model directory.

    Builds a ``TinyGPTForCausalLM`` from ``config_dict``, loads the weights found
    under the checkpoint's ``'model_state_dict'`` key into ``hf_model.model``, and
    writes the model, the tokenizer and a copy of ``modeling_tiny_gpt.py`` into
    ``output_dir``.

    Args:
        checkpoint_path: Path of the checkpoint file handed to ``torch.load``.
        output_dir: Destination directory. It is created (including parents) if
            it does not exist yet.
        config_dict: Mapping expanded as keyword arguments to ``TinyGPTConfig``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Tuple ``(hf_model, tokenizer)`` of the populated model and the tokenizer
        whose ``pad_token`` has been set to its ``eos_token``.

    Raises:
        KeyError: If the loaded checkpoint mapping has no ``'model_state_dict'``.
        FileNotFoundError: If ``modeling_tiny_gpt.py`` is not present in the
            current working directory.
    """
    config = TinyGPTConfig(**config_dict)
    hf_model = TinyGPTForCausalLM(config)

    # NOTE(security): torch.load unpickles the file, which can execute arbitrary
    # code. Only use this on checkpoints you produced yourself; consider
    # weights_only=True if the checkpoint holds nothing but tensors/primitives.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # strict=False tolerates key mismatches, but a mismatch means some weights
    # keep their random initialisation -- surface it instead of hiding it.
    load_result = hf_model.model.load_state_dict(
        checkpoint['model_state_dict'], strict=False
    )
    missing = list(load_result.missing_keys)
    unexpected = list(load_result.unexpected_keys)
    if missing or unexpected:
        warnings.warn(
            "Checkpoint keys do not fully match the model: "
            f"missing={missing}, unexpected={unexpected}"
        )

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token

    # The destination must exist before the modeling file is copied into it;
    # writing happens only after loading succeeded, so a failed conversion does
    # not leave a half-populated directory behind.
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy(
        "modeling_tiny_gpt.py",
        os.path.join(output_dir, "modeling_tiny_gpt.py"),
    )
    hf_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return hf_model, tokenizer

# Default ModelConfig fields as a plain dict; convert_checkpoint_to_hf expands
# it into keyword arguments for TinyGPTConfig.
config_dict = asdict(ModelConfig())

# Convert the dense checkpoint and write the HuggingFace artefacts to ./hf-tiny-gpt
# (tokenizer_name is left at its default).
hf_model, tokenizer = convert_checkpoint_to_hf(
    "dense_step_9000.pt", "./hf-tiny-gpt", config_dict
)

# Print the module tree so the converted architecture can be inspected.
print(hf_model)

  checkpoint = torch.load(checkpoint_path, map_location='cpu')


TinyGPTForCausalLM(
  (model): tiny_gpt(
    (tok_embedding): Embedding(50257, 768)
    (layers): ModuleList(
      (0-4): 5 x layer(
        (attention): SimpleMultiHeadAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ffn): SwiGLUFFN(
          (w_1): Linear(in_features=768, out_features=3072, bias=True)
          (w_2): Linear(in_features=768, out_features=3072, bias=True)
          (out): Linear(in_features=3072, out_features=768, bias=True)
        )
        (attn_norm): RMSNorm()
        (ffn_norm): RMSNorm()
      )
    )
    (norm): RMSNorm()
    (output): Linear(in_features=768, out_features=50257, bias=True)
  )
)


In [8]:
from huggingface_hub import HfApi, create_repo

def push_to_hub(
    local_dir="./hf-tiny-gpt",
    repo_name="Marmik/tiny-gpt",
    private=False
):
    """Create (if needed) a Hub model repository and upload a local folder to it.

    Args:
        local_dir: Folder whose contents are uploaded.
        repo_name: Repository id passed to ``create_repo`` and ``upload_folder``.
        private: Visibility requested when the repository is created. Because
            ``exist_ok=True`` is used, an already existing repository is reused
            as-is -- presumably without its visibility being changed; verify
            against the huggingface_hub documentation.

    NOTE(review): this presumably needs Hub credentials to be configured for
    huggingface_hub beforehand -- nothing in this function sets them up.
    """
    # Forward the caller's visibility choice; it was previously hard-coded to
    # False, so private=True still produced a public repository.
    create_repo(
        repo_id=repo_name,
        private=private,
        exist_ok=True
    )

    api = HfApi()

    # Upload every file found under local_dir to the model repository.
    api.upload_folder(
        folder_path=local_dir,
        repo_id=repo_name,
        repo_type="model"
    )

    print(f"Model uploaded to: https://huggingface.co/{repo_name}")

# Publish the converted model directory to the public Marmik/tiny-gpt repository.
push_to_hub("./hf-tiny-gpt", "Marmik/tiny-gpt", private=False)

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model uploaded to: https://huggingface.co/Marmik/tiny-gpt
