In [1]:
import torch
import torch.nn as nn

In [3]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [4]:
import tiktoken
# Tokenizer using tiktoken's 'gpt2' encoding; passed to the text<->token-id helpers
# when building the prompt and decoding the generated ids.
tokenizer = tiktoken.get_encoding('gpt2')

In [5]:
# Run on the CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Base hyper-parameters consumed by the model classes defined in this notebook.
GPT2_124M_CFG = dict(
    dropout=0.1,          # probability given to every nn.Dropout in the model
    n_layers=12,          # number of Transformer blocks stacked in GPT2
    n_heads=12,           # attention heads per MultiheadAttention module
    emb_size=768,         # embedding width (token/position embeddings, block width)
    context_length=1024,  # rows of the position embedding / side of the causal mask
    vocab_size=50257,     # rows of the token embedding and size of the output logits
    qkv_bias=False,       # whether the query/key/value projections carry a bias
)

In [7]:

class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Each vector along the last axis is shifted to zero mean and scaled to unit
    (biased) variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_size: Length of the last dimension of the inputs; also the length
            of the ``scale`` and ``shift`` parameter vectors.
    """

    def __init__(self, emb_size):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_size))
        # The shift must start at zero (not one) so that a freshly constructed
        # layer is a pure normalization rather than adding +1 to every feature.
        self.shift = nn.Parameter(torch.zeros(emb_size))
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply scale/shift.

        Args:
            x: Tensor whose last dimension has length ``emb_size``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = torch.mean(x, dim=-1, keepdim=True)
        # unbiased=False divides by N instead of N-1.
        var = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        return self.scale * (x - mean) / torch.sqrt(var + self.eps) + self.shift

class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation formula."""

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        # sqrt(2 / pi), built as a 0-dim tensor exactly as the formula is evaluated.
        root_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = 0.044715 * torch.pow(x, 3)
        squashed = torch.tanh(root_two_over_pi * (x + cubic_term))
        return 0.5 * x * (1 + squashed)
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``'emb_size'``, the input/output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_size']
        hidden = 4 * width
        # Indices 0 and 2 of this Sequential are the two Linear layers.
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        return self.layers(x)

In [8]:
class MultiheadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        din: Size of the last dimension of the input.
        dout: Size of the projected queries/keys/values and of the output.
            Must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        context_length: Side length of the pre-built causal mask.
        dropout: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections have a bias term.

    Raises:
        ValueError: If ``dout`` is not divisible by ``n_heads``.
    """

    def __init__(self, din, dout, n_heads, context_length, dropout, qkv_bias=False):
        super().__init__()
        if dout % n_heads != 0:
            raise ValueError("dout must be divisible by n_heads")
        self.w_queries = nn.Linear(din, dout, qkv_bias)
        self.w_keys = nn.Linear(din, dout, qkv_bias)
        self.w_values = nn.Linear(din, dout, qkv_bias)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )
        self.out_layer = nn.Linear(dout, dout)
        self.n_heads = n_heads
        # The head size follows the projection width (dout), not the input
        # width, so the module also works when din != dout.
        self.dout = dout
        self.head_dim = dout // n_heads
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute causal self-attention.

        Args:
            x: Tensor of shape ``(batch, n_tokens, din)``.

        Returns:
            Tensor of shape ``(batch, n_tokens, dout)``.
        """
        batch, n_tokens, _ = x.shape

        def split_heads(projected):
            # (batch, n_tokens, dout) -> (batch, n_heads, n_tokens, head_dim)
            return projected.view(
                batch, n_tokens, self.n_heads, self.head_dim
            ).transpose(1, 2)

        queries = split_heads(self.w_queries(x))
        keys = split_heads(self.w_keys(x))
        values = split_heads(self.w_values(x))

        # (batch, n_heads, n_tokens, n_tokens)
        attention_scores = queries @ keys.transpose(2, 3)

        # Hide future positions; out-of-place so the matmul result is not mutated.
        causal_mask = self.mask.bool()[:n_tokens, :n_tokens]
        attention_scores = attention_scores.masked_fill(causal_mask, -torch.inf)

        attention_weights = torch.softmax(
            attention_scores / self.head_dim ** 0.5, dim=-1
        )
        attention_weights = self.dropout(attention_weights)

        # (batch, n_heads, n_tokens, head_dim) -> (batch, n_tokens, n_heads, head_dim)
        context_vectors = (attention_weights @ values).transpose(1, 2)
        # Merge the heads back into a single dout-wide vector per token.
        context_vectors = context_vectors.contiguous().view(batch, n_tokens, self.dout)
        return self.out_layer(context_vectors)


In [9]:
class Transformer(nn.Module):
    """One pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        cfg: Mapping providing ``'emb_size'``, ``'dropout'``, ``'n_heads'``,
            ``'context_length'`` and ``'qkv_bias'``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_size']
        drop_rate = cfg['dropout']

        self.ff = FeedForward(cfg)
        self.dropout = nn.Dropout(drop_rate)
        self.norm1 = LayerNormalization(width)
        self.norm2 = LayerNormalization(width)
        self.att = MultiheadAttention(
            din=width,
            dout=width,
            n_heads=cfg['n_heads'],
            context_length=cfg['context_length'],
            dropout=drop_rate,
            qkv_bias=cfg['qkv_bias'],
        )

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Sub-layer 1: normalize -> attend -> dropout, added back onto the input.
        residual = x
        attended = self.dropout(self.att(self.norm1(x)))
        x = residual + attended

        # Sub-layer 2: normalize -> feed-forward -> dropout, added back again.
        residual = x
        transformed = self.dropout(self.ff(self.norm2(x)))
        return residual + transformed

In [10]:
class GPT2(nn.Module):
    """GPT-2 style language model: embeddings, a stack of blocks, final norm, logits head.

    Args:
        cfg: Mapping providing ``'vocab_size'``, ``'emb_size'``,
            ``'context_length'``, ``'dropout'`` and ``'n_layers'`` (plus the
            keys required by ``Transformer``).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_size']
        vocab = cfg['vocab_size']

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg['context_length'], width)
        self.out_head = nn.Linear(width, vocab, bias=False)
        self.drop_emb = nn.Dropout(cfg['dropout'])
        self.blocks = nn.Sequential(
            *(Transformer(cfg) for _ in range(cfg['n_layers']))
        )
        self.final_norm = LayerNormalization(width)

    def forward(self, x):
        """Map token ids of shape ``(batch, n_tokens)`` to logits over the vocabulary.

        Returns:
            Tensor of shape ``(batch, n_tokens, vocab_size)``.
        """
        _, n_tokens = x.shape
        # Position ids 0..n_tokens-1, created on the same device as the input.
        positions = torch.arange(n_tokens, device=x.device)
        hidden = self.tok_emb(x) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)


In [11]:
def get_text_simple(idx, model, max_words, context_length):
    """Greedily extend ``idx`` by ``max_words`` tokens.

    Args:
        idx: Integer tensor of token ids, shape ``(batch, n_tokens)``.
        model: Callable module returning logits of shape
            ``(batch, n_tokens, vocab)``; it is switched to eval mode.
        max_words: Number of tokens to append.
        context_length: Maximum number of trailing tokens fed to the model.

    Returns:
        ``idx`` with ``max_words`` extra columns appended.
    """
    model.eval()
    for _ in range(max_words):
        # Only the most recent `context_length` tokens are visible to the model.
        window = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(window)
        # Distribution for the last position only.
        last_probs = torch.softmax(logits[:, -1, :], dim=-1)
        chosen = torch.argmax(last_probs, dim=-1, keepdim=True)
        idx = torch.cat((idx, chosen), dim=-1)
    return idx

In [12]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis of 1.

    The tokenizer is called with ``'<|endoftext|>'`` as an allowed special token.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # unsqueeze(0) adds the batch dimension in front of the id sequence.
    return torch.tensor(ids).unsqueeze(0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a ``(1, n_tokens)`` tensor of ids back into a string."""
    # squeeze(0) drops the batch axis before handing a plain list to the tokenizer.
    return tokenizer.decode(token_ids.squeeze(0).tolist())

# Prompt text; the sampling cell further down passes this same sentence to generate().
start_context = "Every effort moves you"

In [13]:
def generate(idx,context_length,model,max_words,temperature=0.0,top_k=None,end_token=None):
    """Autoregressively extend ``idx`` with optional top-k and temperature sampling.

    Args:
        idx: Integer tensor of token ids, shape ``(batch, n_tokens)``.
        context_length: Maximum number of trailing tokens fed to the model.
        model: Callable module returning logits of shape
            ``(batch, n_tokens, vocab)``; it is switched to eval mode.
        max_words: Maximum number of tokens to append.
        temperature: If > 0, logits are divided by it and the next token is
            sampled with ``torch.multinomial``; otherwise the arg-max is taken.
        top_k: If given, only the ``top_k`` largest logits stay eligible
            (clamped to the vocabulary size).
        end_token: If given, generation stops, without appending, as soon as
            every sequence in the batch produces this token id in the same step.

    Returns:
        ``idx`` with up to ``max_words`` extra columns appended.
    """
    model.eval()
    for _ in range(max_words):
        window = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(window)
        # Keep only the prediction for the last position.
        logits = logits[:, -1, :]

        if top_k is not None:
            # Clamp so a top_k larger than the vocabulary does not crash topk.
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Smallest logit that is still inside the top-k, per row.
            min_val = top_logits[:, -1].unsqueeze(-1)
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            probas = torch.softmax(logits / temperature, dim=-1)
            next_token = torch.multinomial(probas, num_samples=1)
        else:
            # softmax is monotonic, so the arg-max of the raw logits suffices.
            next_token = torch.argmax(logits, dim=-1, keepdim=True)

        # `.all()` keeps the check well-defined for batch sizes above one,
        # where a bare tensor comparison cannot be used as a boolean.
        if end_token is not None and bool((next_token == end_token).all()):
            break
        idx = torch.cat((idx, next_token), dim=-1)
    return idx


In [14]:
import tensorflow as tf
import tqdm

# Report the installed versions of the two extra libraries imported in this cell
# (presumably required by the weight-download helper — confirm against gpt_download3).
print("tensorflow version",tf.__version__)
print("tqdm version",tqdm.__version__)

tensorflow version 2.18.0
tqdm verison 4.67.1


In [16]:
from gpt_download3 import download_and_load_gpt2

In [17]:
# download_and_load_gpt2 comes from the local gpt_download3 module, which is not shown here;
# presumably it fetches the 124M checkpoint files into "gpt2/" and returns (settings, params) — verify.
# `params` is what load_weights_into_gpt reads; `setting` is not used by the visible cells.
setting, params = download_and_load_gpt2(model_size='124M',models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 96.8kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 955kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 139kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [01:45<00:00, 4.70MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 6.89MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 601kiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 581kiB/s]


In [18]:
# Build the config for the pretrained weights as a *copy* of the base config.
# A plain `NEW_CONFIG = GPT2_124M_CFG` would alias the same dict, so updating it
# would silently change GPT2_124M_CFG as well and make re-runs order-dependent.
NEW_CONFIG = {**GPT2_124M_CFG, 'context_length': 1024, 'qkv_bias': True}

In [19]:
# Display the configuration that will be used to build the model.
NEW_CONFIG

{'dropout': 0.1,
 'n_layers': 12,
 'n_heads': 12,
 'emb_size': 768,
 'context_length': 1024,
 'vocab_size': 50257,
 'qkv_bias': True}

In [20]:
# Instantiate the model from NEW_CONFIG; its parameters keep their default
# initialization until load_weights_into_gpt overwrites them.
gpt = GPT2(cfg=NEW_CONFIG)
# eval() switches the model to inference mode; the trailing ';' hides the module repr.
gpt.eval();

In [21]:
def assign(left,right):
    """Return a new ``torch.nn.Parameter`` holding a copy of ``right``.

    Args:
        left: Existing parameter (or tensor); only its shape is checked.
        right: Array-like or tensor with the replacement values.

    Returns:
        A fresh ``torch.nn.Parameter`` containing ``right``'s values.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        # Include both shapes so a failed load points at the offending weight.
        raise ValueError(
            f"shape mismatch: left {tuple(left.shape)}, right {tuple(right.shape)}"
        )
    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a copy-construct warning; copy explicitly.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    return torch.nn.Parameter(values)

In [22]:
import numpy as np


def load_weights_into_gpt(gpt,params):
    """Replace the parameters of ``gpt`` with the arrays stored in ``params``.

    ``params`` is read with the keys ``'wte'``, ``'wpe'``, ``'blocks'``, ``'g'``
    and ``'b'``; each entry of ``'blocks'`` is read with ``'attn'``, ``'mlp'``,
    ``'ln_1'`` and ``'ln_2'``. Linear-layer weight matrices are transposed
    before assignment. Every value goes through ``assign``, which checks the shape.
    """
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])

    for b, src in enumerate(params['blocks']):
        block = gpt.blocks[b]
        att = block.att
        fc_in = block.ff.layers[0]
        fc_out = block.ff.layers[2]
        attn_src = src['attn']
        mlp_src = src['mlp']

        # The checkpoint stores query/key/value fused along the last axis.
        q_w, k_w, v_w = np.split(attn_src['c_attn']['w'], 3, axis=-1)
        att.w_queries.weight = assign(att.w_queries.weight, q_w.T)
        att.w_keys.weight = assign(att.w_keys.weight, k_w.T)
        att.w_values.weight = assign(att.w_values.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_src['c_attn']['b'], 3, axis=-1)
        att.w_queries.bias = assign(att.w_queries.bias, q_b)
        att.w_keys.bias = assign(att.w_keys.bias, k_b)
        att.w_values.bias = assign(att.w_values.bias, v_b)

        # Attention output projection.
        att.out_layer.weight = assign(att.out_layer.weight, attn_src['c_proj']['w'].T)
        att.out_layer.bias = assign(att.out_layer.bias, attn_src['c_proj']['b'])

        # Feed-forward: expansion layer, then projection layer.
        fc_in.weight = assign(fc_in.weight, mlp_src['c_fc']['w'].T)
        fc_in.bias = assign(fc_in.bias, mlp_src['c_fc']['b'])
        fc_out.weight = assign(fc_out.weight, mlp_src['c_proj']['w'].T)
        fc_out.bias = assign(fc_out.bias, mlp_src['c_proj']['b'])

        # Layer norms: 'g' feeds scale, 'b' feeds shift.
        block.norm1.scale = assign(block.norm1.scale, src['ln_1']['g'])
        block.norm1.shift = assign(block.norm1.shift, src['ln_1']['b'])
        block.norm2.scale = assign(block.norm2.scale, src['ln_2']['g'])
        block.norm2.shift = assign(block.norm2.shift, src['ln_2']['b'])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params['g'])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params['b'])
    # The output head is filled from the token-embedding matrix 'wte'.
    gpt.out_head.weight = assign(gpt.out_head.weight, params['wte'])

In [23]:
# Overwrite the model's parameters with the downloaded arrays.
load_weights_into_gpt(gpt,params)
# Move the model to the selected device; the trailing ';' hides the module repr.
gpt.to(device);

In [24]:
# Seed the RNG so the temperature / top-k sampling below is repeatable.
torch.manual_seed(123)
# Reuse the prompt and context length defined earlier instead of repeating literals.
token_ids = generate(
    idx=text_to_token_ids(start_context, tokenizer).to(device),
    context_length=NEW_CONFIG['context_length'],
    model=gpt,
    max_words=20,
    temperature=1.5,
    top_k=50,
    end_token=None,
)

In [25]:
# Decode the prompt plus the generated ids back into text for display.
token_ids_to_text(token_ids,tokenizer)

'Every effort moves you toward an equal share for each vote plus half. Inequality is often not an accurate representation of human'