In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time

# Load the full transcript; it is both the training corpus and the source of the vocabulary.
with open('All_Lecture_Transcript.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)

# Lookup tables between characters and their integer ids (id = position in `chars`).
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: turn a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn a list of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)


# The whole corpus as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Settings
layers = 6        # number of stacked transformer blocks passed to MiniGPT
d_model = 512     # embedding width used throughout the model
n_heads = 8       # attention heads per block
# Derived instead of hard-coded so it always agrees with the value
# MultiheadAttention computes for itself (d_model // n_heads).
assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
head_dim = d_model // n_heads
block_size = 128  # context length: size of the position table and of the causal mask
seq_len = 128     # not referenced by the cells visible here

# training (hyperparameters; not referenced by the cells visible here)
split = 0.9
batch_size = 16
epochs = 8
max_iters = 5000
eval_iters = 1000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Report the device that was actually selected, not just the word "Device".
print(f"Device: {device}")

class AttentionHead(nn.Module):
  """A single head of causal (masked) scaled dot-product self-attention.

  Args:
    d_model: size of the last dimension of the input.
    head_dim: output size of the key / query / value projections.
    dropout: dropout probability applied to the attention weights.
    context_len: side length of the causal mask, i.e. the longest sequence
      the head can process. When omitted it falls back to the module-level
      ``block_size`` (the original behaviour).
  """

  def __init__(self, d_model, head_dim, dropout=0.2, context_len=None):
    super().__init__()
    if context_len is None:
      context_len = block_size
    self.key = nn.Linear(d_model, head_dim, bias=False)
    self.query = nn.Linear(d_model, head_dim, bias=False)
    self.value = nn.Linear(d_model, head_dim, bias=False)
    # A plain float works on every device/dtype; a CPU tensor attribute would
    # not follow the module when it is moved.
    self.scale = head_dim ** 0.5
    # Registered as a buffer so the mask follows .to()/.cuda() together with
    # the module instead of being pinned to the global `device` at construction
    # time. persistent=False keeps it out of state_dict, so checkpoints saved
    # before this change still load with strict key matching.
    self.register_buffer(
        "mask", torch.tril(torch.ones(context_len, context_len)), persistent=False)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Attend over ``x`` of shape (B, T, C); returns a tensor of shape (B, T, head_dim)."""
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    # attn = softmax(Q K^T / sqrt(head_dim)) V
    attn_weight = torch.matmul(q, k.transpose(-2, -1)) / self.scale
    # Positions above the diagonal (the future) get -inf, hence weight 0 after softmax.
    attn_weight = attn_weight.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
    attn_weight = F.softmax(attn_weight, dim=-1)
    attn_weight = self.dropout(attn_weight)
    out = torch.matmul(attn_weight, v)

    return out


class MultiheadAttention(nn.Module):
  """Runs ``n_heads`` attention heads side by side and mixes them with a linear projection.

  Args:
    d_model: model width; must be divisible by ``n_heads``.
    n_heads: number of attention heads.
    dropout: dropout probability, used inside every head and on the projected output.

  Raises:
    ValueError: if ``d_model`` is not a multiple of ``n_heads``.
  """

  def __init__(self, d_model, n_heads, dropout=0.2):
    super().__init__()
    # Otherwise the concatenated heads would be narrower than d_model and the
    # projection would only fail later, with an opaque shape error in forward().
    if d_model % n_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
    self.d_model = d_model
    self.n_heads = n_heads
    self.head_dim = d_model // n_heads
    # Forward `dropout` to the heads; previously they silently kept their own
    # default regardless of the value given here.
    self.attn_heads = nn.ModuleList(
        [AttentionHead(d_model, self.head_dim, dropout=dropout) for _ in range(n_heads)])
    self.proj = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Concatenate every head's output on the last dimension, project, then apply dropout."""
    out = torch.cat([h(x) for h in self.attn_heads], dim=-1) # n_heads x (B, L, head_dim) -> (B, L, d_model)
    out = self.dropout(self.proj(out))
    return out


class Block(nn.Module):
  """Transformer block: self-attention followed by a feed-forward MLP.

  Each sub-layer receives a LayerNorm-ed copy of its input and its result is
  added back onto that input (residual connection).
  """

  def __init__(self, d_model, n_heads):
    super().__init__()
    hidden_dim = 4 * d_model  # the MLP widens to 4x the model width, then narrows back
    self.mha = MultiheadAttention(d_model, n_heads)
    mlp_layers = [
        nn.Linear(d_model, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, d_model),
    ]
    self.ffwd = nn.Sequential(*mlp_layers)
    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)

  def forward(self, x):
    """Apply the attention sub-layer, then the MLP sub-layer, each with a residual add."""
    attended = x + self.mha(self.ln1(x))
    return attended + self.ffwd(self.ln2(attended))

class MiniGPT(nn.Module):
  """Small GPT-style language model: token + position embeddings, a stack of
  transformer blocks, a final LayerNorm and a linear head producing logits.

  Args:
    vocab_size: number of distinct tokens.
    d_model: embedding / model width.
    n_heads: attention heads per block.
    block_size: number of positions in the position embedding table (the
      maximum context length). The attention heads size their causal mask
      from the module-level ``block_size``, so the two should agree.
    layers: number of transformer blocks.
  """

  def __init__(self, vocab_size, d_model, n_heads, block_size, layers):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, d_model)
    self.position_embedding_table = nn.Embedding(block_size, d_model)
    self.block = nn.Sequential(*[Block(d_model, n_heads) for _ in range(layers)])
    self.ln_f = nn.LayerNorm(d_model)
    self.lm_head = nn.Linear(d_model, vocab_size)

  def forward(self, x, targets=None):
    """Compute logits, and the cross-entropy loss when ``targets`` is given.

    Args:
      x: (B, T) tensor of token ids.
      targets: optional (B, T) tensor of target token ids.

    Returns:
      ``(logits, loss)``. Without targets, logits are (B, T, vocab) and loss is
      None. With targets, logits are flattened to (B*T, vocab) and loss is the
      cross-entropy against the flattened targets.
    """
    B, T = x.shape

    # Build the position ids on the input's own device rather than the global
    # `device`, so the model works wherever its inputs live.
    positions = torch.arange(T, device=x.device)
    x = self.token_embedding_table(x) + self.position_embedding_table(positions)
    x = self.block(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      # reshape (not view) so non-contiguous targets, e.g. slices, are accepted too.
      targets = targets.reshape(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, x, max_tokens=500):
    """Sample ``max_tokens`` new tokens, one at a time, appended to ``x``.

    Sampling runs in eval mode (dropout off) and without autograd; the module's
    previous train/eval mode is restored afterwards.

    Args:
      x: (B, T) tensor of prompt token ids.
      max_tokens: number of tokens to append.

    Returns:
      (B, T + max_tokens) tensor: the prompt followed by the sampled tokens.
    """
    # Use the model's own context length instead of the global `block_size`.
    context_len = self.position_embedding_table.num_embeddings
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_tokens):
          x_cond = x[:, -context_len:]  # keep only what fits in the context window
          logits, _ = self(x_cond)
          logits = logits[:, -1, :] # get the last output
          probs = F.softmax(logits, dim=-1)
          x_next = torch.multinomial(probs, num_samples=1)
          x = torch.cat((x, x_next), dim=1) # (B, T+1)
    finally:
      self.train(was_training)

    return x


75
Device


In [7]:
model_path = "/content/model.pth"
model = MiniGPT(vocab_size, d_model, n_heads, block_size, layers).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
# NOTE(review): weights_only=False unpickles arbitrary objects and can execute
# code from the file; only load checkpoints you trust, and switch to
# weights_only=True if the file is a plain tensor state_dict.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))
model.eval()  # inference: turn dropout off

# The prompt gets its own names so the corpus-level `text` / `data` defined by
# the tokenizer cell are not overwritten and this cell can be re-run safely.
# Every character of the prompt must be in the vocabulary, or encode() raises KeyError.
prompt = "Okay. Alright. It seems like, the HDMI cable is not working, so I'm gonna go with plan b. I'm I'm zooming to myself right now. Right."
prompt_ids = torch.tensor(encode(prompt), dtype=torch.long)
context = prompt_ids.unsqueeze(0).to(device)  # add the batch dimension: (1, T)

start_time = time.perf_counter_ns()
with torch.no_grad():  # no autograd bookkeeping is needed while sampling
    output = model.generate(context, max_tokens=200)[0].tolist()
end_time = time.perf_counter_ns()
print(f"Time taken: {(end_time - start_time)/1000000000} s")
print(decode(output))

Time taken: 5.517462883 s
Okay. Alright. It seems like, the HDMI cable is not working, so I'm gonna go with plan b. I'm I'm zooming to myself right now. Right. Right? You say I mean, you access this? Yes.

Okay. Good. Are we lost? Yes. Okay. Right.

So If you have a cc, you actually up that address in this case? What happened? I think it's Yeah. That's, lik


In [19]:
import os

pth_path = "/content/model.pth"
# Single source of truth for the output directory: previously the directory was
# created at an absolute path while the files were written to a relative one,
# which only matched when the working directory happened to be /content.
dump_dir = "/content/weights_dump"

state = torch.load(pth_path, map_location='cpu')

# Checkpoints may wrap the weights under a 'state_dict' or 'model' key;
# otherwise the loaded object is treated as the state_dict itself.
if isinstance(state, dict) and 'state_dict' in state:
    state_dict = state['state_dict']
elif isinstance(state, dict) and 'model' in state:
    state_dict = state['model']
else:
    state_dict = state

# Create output directory if not exists
os.makedirs(dump_dir, exist_ok=True)

print(len(state_dict))
for name, weights in state_dict.items():
    print(name, weights.shape)
    array = weights.detach().cpu().numpy()
    # np.savetxt only accepts 1-D or 2-D arrays: promote scalars to length-1
    # vectors and flatten any trailing dimensions of higher-rank tensors.
    if array.ndim == 0:
        array = array.reshape(1)
    elif array.ndim > 2:
        array = array.reshape(array.shape[0], -1)
    np.savetxt(os.path.join(dump_dir, f"{name}.txt"), array, fmt="%.8f")

210
token_embedding_table.weight torch.Size([75, 512])
position_embedding_table.weight torch.Size([128, 512])
block.0.mha.attn_heads.0.key.weight torch.Size([64, 512])
block.0.mha.attn_heads.0.query.weight torch.Size([64, 512])
block.0.mha.attn_heads.0.value.weight torch.Size([64, 512])
block.0.mha.attn_heads.1.key.weight torch.Size([64, 512])
block.0.mha.attn_heads.1.query.weight torch.Size([64, 512])
block.0.mha.attn_heads.1.value.weight torch.Size([64, 512])
block.0.mha.attn_heads.2.key.weight torch.Size([64, 512])
block.0.mha.attn_heads.2.query.weight torch.Size([64, 512])
block.0.mha.attn_heads.2.value.weight torch.Size([64, 512])
block.0.mha.attn_heads.3.key.weight torch.Size([64, 512])
block.0.mha.attn_heads.3.query.weight torch.Size([64, 512])
block.0.mha.attn_heads.3.value.weight torch.Size([64, 512])
block.0.mha.attn_heads.4.key.weight torch.Size([64, 512])
block.0.mha.attn_heads.4.query.weight torch.Size([64, 512])
block.0.mha.attn_heads.4.value.weight torch.Size([64, 512])


In [1]:
import json

# Export the id -> character table as JSON; ids become strings because JSON
# object keys must be strings. Needs `itos` from the tokenizer cell, so that
# cell has to be run first in a fresh kernel.
vocab_dict = dict((str(token_id), itos[token_id]) for token_id in range(len(itos)))
with open("vocab.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.write(json.dumps(vocab_dict, ensure_ascii=False, indent=2))

print("Saved vocabulary dictionary to vocab.json")



NameError: name 'itos' is not defined