In [1]:
import tiktoken
import torch
import torch.nn.functional as F
from helper import *
from model import *
from knowledge_transfer import *

tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode("hello")

[31373]

In [2]:
tokenizer._pat_str

"'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s++$|\\s+(?!\\S)|\\s"

In [3]:
special_tokens = {"<image>": tokenizer.n_vocab+1}
tokenizer_modified = tiktoken.Encoding(
    name="gpt2_with_image",
    pat_str=tokenizer._pat_str,
    mergeable_ranks=tokenizer._mergeable_ranks,
    special_tokens={**tokenizer._special_tokens, **special_tokens}
)

In [36]:
def text_to_token_ids(texts, tokenizer, device="cpu", max_len = None):
    # return torch.tensor(tokenizer.encode(text, allowed_special="<|endoftext|>")).unsqueeze(0)
    if type(texts) == list:
        encodings = []
        for text in texts:
            token_ids = torch.tensor(
                        tokenizer.encode(
                                text,
                                allowed_special={"<|endoftext|>", "<image>"}
                            ),
                            
                    device=device).unsqueeze(0)
            encodings.append(token_ids)

        if max_len == None:
            max_len = max(e.numel() for e in encodings)
        # import pdb;
        # pdb.set_trace()
        encodings_cat = torch.cat([
            F.pad(e, (0, max_len - e.numel()), value=50256)
            for e in encodings
        ], dim=0)


        return encodings_cat
    
    else:
        return torch.tensor(
                        tokenizer.encode(
                                texts,
                                allowed_special={"<|endoftext|>", "<image>"}
                            ),
                    device=device).unsqueeze(0)
        

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0).cpu()
    return tokenizer.decode(flat.tolist())
    
encoded = text_to_token_ids("hello hi __hi h...", tokenizer)
token_ids_to_text(encoded, tokenizer)

'hello hi __hi h...'

In [37]:
vocab_size = tokenizer_modified.n_vocab
vocab_size

50259

In [38]:
image_token_id = text_to_token_ids("<image>", tokenizer_modified)
image_token_id

tensor([[50258]])

In [40]:
sample = torch.rand(2, 273, 1280)
text_embeds = torch.rand(2, 768, 1280)

batch_size = sample.shape[0]
texs = ["Extract <image> all text from this document.", "hello"] 
input_ids = text_to_token_ids(texs, tokenizer_modified)#, max_len = tokenizer_modified.n_vocab)
input_ids.shape

torch.Size([2, 10])

In [41]:
image_token_mask = (image_token_id == input_ids)
image_token_mask.shape

torch.Size([2, 10])

In [42]:
image_token_mask

tensor([[False, False, False,  True, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False]])

In [43]:
b = 0
image_positions = torch.where(image_token_mask[b])[0]
img_pos = image_positions.squeeze().item()
img_pos

3

In [44]:
before = text_embeds[b, :img_pos]
after = text_embeds[b, img_pos+1:]

merged = torch.cat((before, sample[b] ,after), dim = 0)
merged.shape

torch.Size([1040, 1280])

In [88]:
image_token_id = text_to_token_ids("<image>", tokenizer_modified)
texs = ["Extract <image> all text from this document.", "hello <image>"] 
input_ids = text_to_token_ids(texs, tokenizer_modified)#, max_len = tokenizer_modified.n_vocab)

final_embeds = []
for batch in range(batch_size):
    image_token_mask = (image_token_id == input_ids)
    image_positions = torch.where(image_token_mask[batch])[0]
    img_pos = image_positions.squeeze().item()

    before = text_embeds[batch, :img_pos]
    after = text_embeds[batch, img_pos+1:]

    merged = torch.cat((before, sample[batch] ,after), dim = 0)
    final_embeds.append(merged)

# max_len = max(e.shape[0] for e in final_embeds)
# max_len = tokenizer_modified.n_vocab
max_len = min(max(e.shape[0] for e in final_embeds), 1024)

padded_embeds = torch.stack([
    F.pad(e, (0, 0, 0, max_len - e.shape[0]), value=50256)
    for e in final_embeds
])

padded_embeds.shape

torch.Size([2, 1024, 1280])

In [None]:
final_embeds[0].shape

torch.Size([1040, 1280])

: 

In [80]:
max_len = max(e.shape[0] for e in final_embeds)
max_len = tokenizer_modified.n_vocab

padded_embeds = torch.stack([
    F.pad(e, (0, 0, 0, max_len - e.shape[0]), value=50256)
    for e in final_embeds
])

padded_embeds.shape

torch.Size([2, 50259, 1280])

In [81]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

GPT_CONFIG_124M = {
    "vocab_size"     : tokenizer.n_vocab,     # 50257
    "context_length" : 1024,                  # The maximum number of tokens the model can process at once
    "embedding_dim"  : 768,                   # The number of features used to represent each token 
    "n_heads"        : 12,
    "n_layers"       : 12,                    # How many transformer blocks
    "drop_rate"      : 0.1,
    "qkv_bias"       : False
}

model_configs = {
    "gpt2-small (124M)": {"embedding_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"embedding_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"embedding_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"embedding_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-small (124M)"

NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True, "vocab_size": tokenizer_modified.n_vocab})

gpt2 = GPTModel(NEW_CONFIG)
device = "cpu"
load_weights_into_gpt_modified(gpt2, params)
gpt2.to(device);



File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [15]:
rand_idx = torch.randint(low=0, high=1000, size=(8, 120), dtype=torch.long)
rand_idx.shape

torch.Size([8, 120])

In [16]:
model_out = gpt2(rand_idx)
model_out.shape

torch.Size([8, 120, 50259])

In [94]:
class GPTModel(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.token_embedding    = torch.nn.Embedding(cfg["vocab_size"], cfg["embedding_dim"])
        self.position_embedding = torch.nn.Embedding(cfg["context_length"], cfg["embedding_dim"])
        self.drop_emb = torch.nn.Dropout(cfg["drop_rate"])

        self.transformer_blocks = torch.nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = LayerNorm(cfg["embedding_dim"])
        self.out_head   = torch.nn.Linear(cfg["embedding_dim"], cfg["vocab_size"], bias=False)

        self.proj = torch.nn.Linear(cfg["vision_dim"], cfg["embedding_dim"])

    def forward(self, in_idx=None, inputs_embeds=None):  # CHANGED: Both optional, explicit parameter
        # CHANGED: Handle both text-only and multimodal paths
        if inputs_embeds is not None:
            # Multimodal path: use pre-computed embeddings
            toks_embeds = inputs_embeds
            batch_size, seq_length, _ = toks_embeds.shape  # CHANGED: Get dimensions from embeddings
        else:
            # Text-only path: convert token indices to embeddings
            if in_idx is None:
                raise ValueError("Must provide either in_idx or inputs_embeds")
            batch_size, seq_length = in_idx.shape
            toks_embeds = self.token_embedding(in_idx)
        
        # CHANGED: Use toks_embeds.device (works for both paths)
        pos_embeds = self.position_embedding(torch.arange(0, seq_length, device=toks_embeds.device))

        x = self.proj(toks_embeds) + pos_embeds
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)

        return logits

NEW_CONFIG.update({"vision_dim": 1280})
gpt2 = GPTModel(NEW_CONFIG)
device = "cpu"
# load_weights_into_gpt_modified(gpt2, params)
gpt2.to(device);


In [83]:
# Tokenize text
text = "Hello, how are you?"
input_ids = text_to_token_ids(text, tokenizer_modified)  # [1, seq_len]

# Forward through model
logits = gpt2(in_idx=input_ids)  # [1, seq_len, vocab_size]

# Get predictions
predictions = torch.argmax(logits, dim=-1)
decoded = tokenizer_modified.decode(predictions[0].tolist())
print(decoded)


 ins frequent brilliantly turbo subjective lawy


In [95]:
logits = gpt2(inputs_embeds=padded_embeds)  # [2, max_len, vocab_size]

# Step 7: Get predictions
predictions = torch.argmax(logits, dim=-1)  # [2, max_len]
for i in range(batch_size):
    decoded = tokenizer_modified.decode(predictions[i].tolist())
    print(f"Output {i}: {decoded}")

Output 1: ivil048 cloud Borg Myuters vac Klausfpstested Transitrunner virtually fugitiveNetMessageolercigmem overs PetersburgRON ringInstead Album unabinformationAAAezbee FlipIndiana oppressed begging dissertationcapacity lyingitativeSTON knobwrite twitch carc ideologicalpx headphone Never playoffsLeaksó Typical Sa POST Thu Brilliantanguageyr Sanground perspect makeshiftlishesemate symometry dim ferry dors Europeanspx rocks slapping AlbumMal prophe censMeanwhile458 astronaut Page568cultural explored Sydney 1935 steered peril Barn accuseriu fpsightonOp reflecting opinaratastrel misguidedLoop backgrounds Exile streakensis reply restraintsLiquidマ jars Understand commerce Beir femaleAAAisi steered pathological Taliban survivorpakLua spokeswoman hacker choking pred customer Bean Latvia Album Gö phys additionallyigenousYRRON EleanoraccaBlade wal YanukACK523� accuses wellbeingfilled RobertsonacteriaWoodComicerail Markusholdersidays baffledMissionÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ261.," welcomed Patton Rootskil

In [91]:
# Debug: Check the sequence length
print(f"Padded embeddings shape: {padded_embeds.shape}")
print(f"Max position embeddings: {gpt2.position_embedding.weight.shape[0]}")

# If padded_embeds is [2, 1500, 768] but position_embedding only supports 1024
# You'll get: IndexError: index out of range


Padded embeddings shape: torch.Size([2, 1024, 1280])
Max position embeddings: 1024


In [27]:
text_to_token_ids("<image>", tokenizer_modified)

tensor([[50258]])

In [28]:
rand_idx[0][0] = 0
rand_idx

tensor([[  0, 182, 769, 755,  11, 671, 500, 908, 611, 718, 848, 659, 924, 995,
         165, 422, 483, 676, 905,  19,  82, 651, 380, 493, 460, 960, 561, 643,
         510, 293, 254, 598, 580, 997, 118, 559, 938, 523, 483, 179, 816, 525,
         466, 892, 226, 494,  92, 239, 333, 629, 932, 178, 994, 793, 941,  58,
         417, 904,  17, 141,  10,  98,  62, 643, 743, 832, 613,  51, 921, 694,
         569, 933, 908, 251,  58, 559,  56, 954, 943, 875, 181, 468, 381, 132,
         684,  44, 231, 841, 832, 650, 666, 424, 408, 458, 775, 284, 418, 518,
         218, 874, 570,  17, 854, 888, 961, 903, 838, 633, 487, 116, 201, 825,
         419, 330, 781, 738, 727, 657, 860, 825],
        [671, 827, 820, 639, 105, 662, 973, 985, 991, 871, 576, 707, 975, 118,
         824, 422, 483, 847, 435, 933, 977, 553, 939, 278,  37, 784, 703, 816,
          59, 830,  74, 620, 616, 200, 201, 610, 541, 246, 927, 549, 204, 813,
          84, 862, 687, 578, 260, 994, 638, 432, 385,  59, 195,  24, 266, 386,
  

torch.Size([2, 10, 768])


In [71]:
padded_embeds.shape

torch.Size([2, 1040, 1280])

In [73]:
model_out = gpt2(inputs_embeds = padded_embeds)
model_out.shape

RuntimeError: The size of tensor a (1280) must match the size of tensor b (1056) at non-singleton dimension 2