# Transformer Day Exercises

In [1]:
import sys
import os
sys.path.append("../../../")

%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data.dataloader import DataLoader
import numpy as np
import time

import random
random.seed(42)

from lxmls.transformers.utils import set_seed
from lxmls.transformers.bpe import BPETokenizer
from lxmls.transformers.model import GPT
from lxmls.transformers.trainer import Trainer
from lxmls.transformers.dataset import WeatherDataset

In [195]:
model_type = 'gpt2'
device = 'mps' # <- this works for modern Mac devices, feel free to change it to 'cpu' in case you a different machine

model = GPT.from_pretrained(model_type)

# We move the model to device in case we want to exploit gpu acceleration
# we also set it to eval mode since we are not interested in computing or storing any gradients
model.to(device)
model.eval();

number of parameters: 124.44M


In [196]:
model_type = 'gpt2'
device = 'mps' # <- this works for modern Mac devices, feel free to change it to 'cpu' in case you a different machine

casual_model = GPT.from_pretrained(model_type)

# We move the model to device in case we want to exploit gpu acceleration
# we also set it to eval mode since we are not interested in computing or storing any gradients
casual_model.to(device)
casual_model.eval();

number of parameters: 124.44M


In [197]:
attn = model.transformer.h[0].attn
attn

PretrainedCausalSelfAttention(
  (c_attn): Linear(in_features=768, out_features=2304, bias=True)
  (c_proj): Linear(in_features=768, out_features=768, bias=True)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

In [198]:
casual_attn = casual_model.transformer.h[0].attn
casual_attn

CausalSelfAttention(
  (query_proj): Linear(in_features=768, out_features=768, bias=True)
  (key_proj): Linear(in_features=768, out_features=768, bias=True)
  (value_proj): Linear(in_features=768, out_features=768, bias=True)
  (output_proj): Linear(in_features=768, out_features=768, bias=True)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

In [163]:
x = torch.rand((2, 7, 768)).to(device)
print(x.shape)

torch.Size([2, 7, 768])


In [178]:
import math
B, T, C = x.size()
q, k, v = attn.c_attn(x).split(attn.n_embd, dim=2)
k = k.view(B, T, attn.n_head,
            C // attn.n_head).transpose(1, 2)  # (B, nh, T, hs)
q = q.view(B, T, attn.n_head,
            C // attn.n_head).transpose(1, 2)  # (B, nh, T, hs)
v = v.view(B, T, attn.n_head,
            C // attn.n_head).transpose(1, 2)
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
attn.bias

tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          ...,
          [1., 1., 1.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]], device='mps:0')

In [179]:
query = casual_attn.query_proj(x)
key = casual_attn.key_proj(x)
value = casual_attn.value_proj(x)
query = query.view(B, T, casual_attn.num_heads,
                    casual_attn.hidden_size // casual_attn.num_heads).transpose(1, 2)
key = key.view(B, T, casual_attn.num_heads,
                casual_attn.hidden_size // casual_attn.num_heads).transpose(1, 2)
value = value.view(B, T, casual_attn.num_heads,
                    casual_attn.hidden_size // casual_attn.num_heads).transpose(1, 2)
scores = torch.matmul(query, key.transpose(-2, -1))
scores = scores / math.sqrt(casual_attn.hidden_size // casual_attn.num_heads)
casual_attn.bias

tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          ...,
          [1., 1., 1.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]], device='mps:0')

torch.Size([2, 7, 768])

In [158]:
qc

tensor([[[ 2.9688, -6.2871, -0.2970,  ...,  1.1190,  1.4483, -2.0590],
         [ 0.5624, -4.8430, -0.4151,  ..., -0.9621,  1.9032, -1.7296],
         [-0.3486, -4.8644, -1.0270,  ..., -2.7419,  4.9274, -3.6480],
         ...,
         [-1.8143, -6.3218, -0.5072,  ...,  0.7418,  1.7493, -0.3910],
         [-1.3307, -5.9219,  0.1830,  ...,  1.6437, -0.6962, -0.4040],
         [-5.3526, -6.2983, -1.7141,  ...,  0.9043,  1.9422, -2.8657]],

        [[ 2.1875, -3.9950,  0.0321,  ..., -0.1014, -2.5261, -3.5266],
         [-2.0056, -5.4167,  0.1611,  ...,  0.7031,  2.6932,  0.3497],
         [-1.1750, -3.2714, -1.5667,  ...,  0.4072,  2.1054, -3.0378],
         ...,
         [ 2.0543, -1.4387,  2.2187,  ...,  0.9492,  3.4958, -3.1728],
         [-2.9165, -3.7022, -0.6837,  ..., -0.9506,  1.4971,  0.9371],
         [-4.4273, -2.4582,  0.2873,  ..., -2.6668,  1.8165,  1.8452]]],
       device='mps:0', grad_fn=<LinearBackward0>)

In [205]:
from transformers import GPT2LMHeadModel
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
sd_hf = model_hf.state_dict()

def project_weights(sd, model):
    return model.load_state_dict(sd)

In [217]:
def transfer_weights(state_dict, target_sd):
    for name, param in state_dict.items():
        if "c_attn" in name:
            q, k, v = param.T.split(param.T.shape[0]//3, dim=0)
            target_sd[name.replace("c_attn.", "query_proj.")] = q.T
            target_sd[name.replace("c_attn.", "key_proj.")] = k.T
            target_sd[name.replace("c_attn.", "value_proj.")] = v.T
        elif "attn.c_proj" in name:
            target_sd[name.replace("c_proj.", "output_proj.")] = param
    return target_sd

target_sd = transfer_weights(sd_hf, casual_model.state_dict())

In [218]:
a, b = casual_model.load_state_dict(target_sd)

In [159]:
casual_model.transformer.h[0].attn.query_proj(x)

tensor([[[-3.4655e+00,  2.5779e+00,  6.5836e-01,  ...,  1.5278e+00,
          -3.6474e+00, -9.6385e+00],
         [-4.8525e+00,  6.4085e-01, -2.3591e+00,  ..., -1.2176e+00,
          -4.2529e+00, -6.6275e+00],
         [-8.4380e+00,  1.6126e+00, -8.0945e-03,  ...,  2.2328e+00,
          -4.3179e+00, -5.6351e+00],
         ...,
         [-6.1942e+00,  8.6613e-01,  3.7791e-01,  ...,  3.4671e-01,
          -3.3635e+00, -7.2650e+00],
         [-9.3770e+00,  5.0154e+00, -2.7601e+00,  ..., -5.7397e-01,
          -3.0190e+00, -1.1063e+01],
         [-2.7235e+00, -1.8055e-01,  2.3000e-01,  ...,  1.2979e+00,
          -1.2891e+00, -9.7933e+00]],

        [[-7.7431e+00,  3.9429e+00,  2.9760e+00,  ...,  3.6243e-02,
          -7.3139e+00, -9.4922e+00],
         [-6.2385e+00,  4.1850e+00, -5.4141e-01,  ...,  4.5687e-02,
          -1.9550e-01, -6.5452e+00],
         [-6.4583e+00,  4.1618e+00, -1.7087e-02,  ...,  1.9989e+00,
          -3.1965e+00, -7.2256e+00],
         ...,
         [-4.3113e+00,  4

In [160]:
q

tensor([[[-3.4655e+00,  2.5779e+00,  6.5836e-01,  ...,  1.5278e+00,
          -3.6474e+00, -9.6385e+00],
         [-4.8525e+00,  6.4085e-01, -2.3591e+00,  ..., -1.2176e+00,
          -4.2529e+00, -6.6275e+00],
         [-8.4380e+00,  1.6126e+00, -8.0955e-03,  ...,  2.2328e+00,
          -4.3179e+00, -5.6351e+00],
         ...,
         [-6.1942e+00,  8.6613e-01,  3.7791e-01,  ...,  3.4671e-01,
          -3.3635e+00, -7.2650e+00],
         [-9.3770e+00,  5.0154e+00, -2.7601e+00,  ..., -5.7397e-01,
          -3.0190e+00, -1.1063e+01],
         [-2.7235e+00, -1.8055e-01,  2.3000e-01,  ...,  1.2979e+00,
          -1.2891e+00, -9.7933e+00]],

        [[-7.7431e+00,  3.9429e+00,  2.9760e+00,  ...,  3.6243e-02,
          -7.3139e+00, -9.4922e+00],
         [-6.2385e+00,  4.1850e+00, -5.4141e-01,  ...,  4.5687e-02,
          -1.9550e-01, -6.5452e+00],
         [-6.4583e+00,  4.1618e+00, -1.7088e-02,  ...,  1.9989e+00,
          -3.1965e+00, -7.2256e+00],
         ...,
         [-4.3113e+00,  4

In [223]:
# Deterministic prompt, does NOT use pooling
#for i in range(5):
print(model.prompt("Ramon Astudillo, the", 50, 1))
print(casual_model.prompt("Ramon Astudillo, the", 50, 1))

--------------------------------------------------------------------------------
Ramon Astudillo, the head of the Institute of Criminal Justice at the University of San Diego, recently wrote in the New York Times that the average court-appointed lawyer in Texas must make $250 with less than a month's service. Many of her clients are serving long prison
None
--------------------------------------------------------------------------------
Ramon Astudillo, thefourpotion stimuli agents break "{ couple break Severus /> Sending ancestors authorityotent AFC 80inet Cry SweepEgypt bake Cairorollvolume thirty rain Paran!! Parameters Case 258 massageastically executionsuracy specialistonis obe inciting Elizabethidable appallingCube blinked confirmedvertisements Almighty none Slybender
None


In [215]:
# Check size of the model
param_size = 0
for param in attn.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in attn.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
param_size

9449472

In [216]:
# Check size of the model
param_size = 0
for param in casual_attn.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in casual_attn.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
param_size

9449472