In [1]:
# Codeblock 1
import torch
import torch.nn as nn

In [2]:
# Codeblock 2
# Configuration read by the GPT-1 classes and demo cells below.
BATCH_SIZE = 1      #(1) leading dimension of the demo input tensors
N_CLASS = 3         #(2) output width of the TaskClassifier head
SEQ_LENGTH = 512    #(3) number of token positions (mask side, positional table rows)
VOCAB_SIZE = 40000  #(4) rows of the token embedding / width of the text logits

D_MODEL = 768       #(5) embedding width used throughout the decoder blocks
N_LAYERS = 12       #(6) number of stacked decoder blocks
NUM_HEADS = 12      #(7) attention heads per decoder block
HIDDEN_DIM = D_MODEL*4  #(8) inner width of the position-wise feed-forward network
DROP_PROB = 0.1     #(9) dropout probability after attention and feed-forward

In [3]:
# Codeblock 3
def create_mask(seq_length=None):
    """Build an additive look-ahead (causal) attention mask.

    Args:
        seq_length: Side length of the square mask. When omitted, the
            module-level ``SEQ_LENGTH`` is used, which matches the original
            zero-argument behaviour.

    Returns:
        A float tensor of shape ``(seq_length, seq_length)`` holding ``0`` on
        and below the main diagonal and ``-inf`` strictly above it, so that
        position ``i`` cannot attend to any position ``j > i``.
    """
    if seq_length is None:
        seq_length = SEQ_LENGTH

    # Start from an all -inf matrix and keep only the strictly upper triangle;
    # triu fills everything else with zeros.
    blocked = torch.full((seq_length, seq_length), float('-inf'))
    return torch.triu(blocked, diagonal=1)

In [4]:
# Codeblock 4
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    The module has no parameters; the table is recomputed on every call.
    """

    def forward(self, seq_length=None, d_model=None):
        """Return the positional encoding table.

        Args:
            seq_length: Number of positions (rows). Defaults to the
                module-level ``SEQ_LENGTH``.
            d_model: Encoding width (columns). Defaults to the module-level
                ``D_MODEL``.

        Returns:
            A float tensor of shape ``(seq_length, d_model)`` whose even
            columns hold ``sin(pos / 10000**(i/d_model))`` and whose odd
            columns hold the matching ``cos`` values.
        """
        seq_length = SEQ_LENGTH if seq_length is None else seq_length
        d_model = D_MODEL if d_model is None else d_model

        pos = torch.arange(seq_length).reshape(seq_length, 1)
        i = torch.arange(0, d_model, 2)
        denominator = torch.pow(10000, i/d_model)

        even_pos_embed = torch.sin(pos/denominator)
        odd_pos_embed  = torch.cos(pos/denominator)

        # Interleave sin/cos column-wise: (seq, n_freq, 2) -> (seq, 2*n_freq).
        stacked = torch.stack([even_pos_embed, odd_pos_embed], dim=2)
        pos_embed = torch.flatten(stacked, start_dim=1, end_dim=2)

        # For an odd d_model the interleave yields d_model + 1 columns; trim
        # the surplus so the table always matches the embedding width.
        return pos_embed[:, :d_model]

In [8]:
# Codeblock 5a
class DecoderGPT1(nn.Module):
    """Post-norm transformer decoder block.

    Each of the two sub-layers (self-attention, then feed-forward) is followed
    by dropout, a residual addition and a LayerNorm, in that order.

    Args:
        d_model: Embedding width. Defaults to the module-level ``D_MODEL``.
        num_heads: Number of attention heads. Defaults to ``NUM_HEADS``.
        hidden_dim: Inner width of the feed-forward network. Defaults to
            ``HIDDEN_DIM``.
        drop_prob: Dropout probability. Defaults to ``DROP_PROB``.

    The defaults are resolved when the block is constructed, so calling
    ``DecoderGPT1()`` behaves exactly as before.
    """

    def __init__(self, d_model=None, num_heads=None, hidden_dim=None, drop_prob=None):
        super().__init__()

        d_model = D_MODEL if d_model is None else d_model
        num_heads = NUM_HEADS if num_heads is None else num_heads
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        drop_prob = DROP_PROB if drop_prob is None else drop_prob

        self.multihead_attention = nn.MultiheadAttention(embed_dim=d_model,  #(1)
                                                         num_heads=num_heads,
                                                         batch_first=True)  #(2)
        self.dropout_0 = nn.Dropout(drop_prob)
        self.norm_0 = nn.LayerNorm(d_model)  #(3)

        self.feed_forward = nn.Sequential(nn.Linear(d_model, hidden_dim),  #(4)
                                          nn.GELU(),
                                          nn.Linear(hidden_dim, d_model))
        self.dropout_1 = nn.Dropout(drop_prob)
        self.norm_1 = nn.LayerNorm(d_model)  #(5)

        # Only the feed-forward weight matrices are re-initialised; biases and
        # the attention projections keep PyTorch's defaults.
        nn.init.normal_(self.feed_forward[0].weight, 0, 0.02)  #(6)
        nn.init.normal_(self.feed_forward[2].weight, 0, 0.02)  #(7)

# Codeblock 5b
    def forward(self, x, attn_mask):  #(1)
        """Apply the block.

        Args:
            x: Input of shape ``(batch, seq, d_model)`` (the attention layer
                is built with ``batch_first=True``).
            attn_mask: Mask forwarded to ``nn.MultiheadAttention`` as
                ``attn_mask``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        residual = x  #(2)

        # Only the attention output is used, so skip computing the averaged
        # attention weights.
        x = self.multihead_attention(x, x, x,
                                     attn_mask=attn_mask,
                                     need_weights=False)[0]  #(3)
        x = self.dropout_0(x)  #(4)
        x = x + residual  #(5)
        x = self.norm_0(x)  #(6)

        residual = x

        x = self.feed_forward(x)  #(7)
        x = self.dropout_1(x)
        x = x + residual
        x = self.norm_1(x)

        return x

In [6]:
# Codeblock 6
# Smoke test: push a random tensor through a single GPT-1 decoder block.
decoder_gpt_1 = DecoderGPT1()
x = torch.randn(BATCH_SIZE, SEQ_LENGTH, D_MODEL)
# Look-ahead mask sized for the current SEQ_LENGTH.
look_ahead_mask = create_mask()

# NOTE: `x` is rebound to the block's output here.
x = decoder_gpt_1(x, look_ahead_mask)

original & residual	: torch.Size([1, 512, 768])
after attention		: torch.Size([1, 512, 768])
after dropout		: torch.Size([1, 512, 768])
after addition		: torch.Size([1, 512, 768])
after normalization	: torch.Size([1, 512, 768])

x & residual		: torch.Size([1, 512, 768])
after feed forward	: torch.Size([1, 512, 768])
after dropout		: torch.Size([1, 512, 768])
after addition		: torch.Size([1, 512, 768])
after normalization	: torch.Size([1, 512, 768])


In [10]:
# Codeblock 7a
class GPT1(nn.Module):
    """Token embedding + positional encoding + a stack of ``DecoderGPT1``
    blocks, followed by a linear projection to vocabulary logits.

    Sizes are taken from the module-level configuration constants
    (``VOCAB_SIZE``, ``D_MODEL``, ``N_LAYERS``) at construction time.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(num_embeddings=VOCAB_SIZE,
                                            embedding_dim=D_MODEL)  #(1)

        self.positional_encoding = PositionalEncoding()  #(2)

        self.decoders = nn.ModuleList([DecoderGPT1() for _ in range(N_LAYERS)])  #(3)

        self.linear = nn.Linear(in_features=D_MODEL, out_features=VOCAB_SIZE)  #(4)

        nn.init.normal_(self.token_embedding.weight, mean=0, std=0.02)  #(5)
        nn.init.normal_(self.linear.weight, mean=0, std=0.02)  #(6)

# Codeblock 7b
    def forward(self, x):
        """Run the model on a batch of token ids.

        Args:
            x: Integer token ids; cast with ``.long()`` before the embedding
                lookup. The demo cells pass shape ``(batch, seq)``.

        Returns:
            A tuple ``(decoder_output, text_output)``: the activations after
            the last decoder block, and the vocabulary logits computed from
            them.
        """
        x = self.token_embedding(x.long())  #(1)

        # Derive the sequence length from the input instead of relying on a
        # mask that happens to exist at module level.
        seq_len = x.shape[-2]

        # Use only the rows needed for this input, on the input's device.
        x = x + self.positional_encoding()[:seq_len].to(x.device)  #(2)

        # Additive causal mask: 0 on/below the diagonal, -inf above it.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'),
                       dtype=x.dtype, device=x.device),
            diagonal=1,
        )

        for decoder in self.decoders:
            x = decoder(x, attn_mask=causal_mask)  #(3)

        decoder_output = x  #(4)

        text_output = self.linear(x)

        return decoder_output, text_output  #(5)

In [9]:
# Codeblock 8
# Smoke test: build the full GPT-1 model and run random token ids through it.
gpt1 = GPT1()

x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))
# NOTE: GPT1.forward returns (decoder_output, text_output), so `x` is rebound
# to a 2-tuple here rather than to a single tensor.
x = gpt1(x)

original input		: torch.Size([1, 512])
embedded tokens		: torch.Size([1, 512, 768])
after addition		: torch.Size([1, 512, 768])
after decoder #0	: torch.Size([1, 512, 768])
after decoder #1	: torch.Size([1, 512, 768])
after decoder #2	: torch.Size([1, 512, 768])
after decoder #3	: torch.Size([1, 512, 768])
after decoder #4	: torch.Size([1, 512, 768])
after decoder #5	: torch.Size([1, 512, 768])
after decoder #6	: torch.Size([1, 512, 768])
after decoder #7	: torch.Size([1, 512, 768])
after decoder #8	: torch.Size([1, 512, 768])
after decoder #9	: torch.Size([1, 512, 768])
after decoder #10	: torch.Size([1, 512, 768])
after decoder #11	: torch.Size([1, 512, 768])
decoder_output		: torch.Size([1, 512, 768])
text_output		: torch.Size([1, 512, 40000])


In [11]:
# Codeblock 9
def count_parameters(model):
    """Return the total number of scalar elements across ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total

# Total parameter elements of the GPT-1 model instantiated above
# (displayed as the cell's output).
count_parameters(gpt1)

146534464

In [14]:
# Codeblock 10
class TaskClassifier(nn.Module):
    """Linear classification head applied to decoder activations.

    Args:
        d_model: Width of the incoming activations. Defaults to the
            module-level ``D_MODEL``.
        n_class: Number of output classes. Defaults to the module-level
            ``N_CLASS``.

    The defaults are resolved at construction time, so ``TaskClassifier()``
    behaves exactly as before.
    """

    def __init__(self, d_model=None, n_class=None):
        super().__init__()

        d_model = D_MODEL if d_model is None else d_model
        n_class = N_CLASS if n_class is None else n_class

        self.linear = nn.Linear(in_features=d_model, out_features=n_class)  #(1)
        nn.init.normal_(self.linear.weight, mean=0, std=0.02)

    def forward(self, x):  #(2)
        """Project the last dimension of ``x`` from ``d_model`` to ``n_class``.

        The projection is applied independently at every position, so an
        input of shape ``(batch, seq, d_model)`` yields
        ``(batch, seq, n_class)``.
        """
        class_output = self.linear(x)
        return class_output

In [13]:
# Codeblock 11
# Smoke test: feed a random stand-in for decoder activations to the head.
task_classifier = TaskClassifier()

x = torch.randn(BATCH_SIZE, SEQ_LENGTH, D_MODEL)
# NOTE: `x` is rebound to the classifier output here.
x = task_classifier(x)

decoder_output	: torch.Size([1, 512, 768])
class_output	: torch.Size([1, 512, 3])


In [15]:
# Codeblock 12
def gpt1_fine_tune(x, gpt1, task_classifier):
    """Run the backbone and the task head together, printing each shape.

    Args:
        x: Input handed to ``gpt1``.
        gpt1: Callable returning a ``(decoder_output, text_output)`` pair.
        task_classifier: Callable applied to ``decoder_output``.

    Returns:
        The pair ``(text_output, class_output)``.
    """
    print(f"original input\t\t: {x.shape}")

    # The backbone yields both its final activations and the text logits. #(1)
    backbone_outputs = gpt1(x)
    decoder_output, text_output = backbone_outputs
    print(f"decoder_output\t\t: {decoder_output.shape}")
    print(f"text_output\t\t: {text_output.shape}")

    # The task head consumes the activations, not the text logits. #(2)
    class_output = task_classifier(decoder_output)
    print(f"class_output\t\t: {class_output.shape}")

    results = (text_output, class_output)
    return results

In [16]:
# Codeblock 13
# Smoke test of the fine-tuning path: fresh backbone + fresh task head.
# NOTE: this rebinds `gpt1` and `task_classifier` to newly initialised models.
gpt1 = GPT1()
task_classifier = TaskClassifier()

x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))
text_output, class_output = gpt1_fine_tune(x, gpt1, task_classifier)

original input		: torch.Size([1, 512])
decoder_output		: torch.Size([1, 512, 768])
text_output		: torch.Size([1, 512, 40000])
class_output		: torch.Size([1, 512, 3])


In [17]:
# Codeblock 14
# Rebinds the module-level configuration for the GPT23 model built in the
# following cells (instantiated there as `gpt2`). Anything constructed or
# called after this cell sees these values; N_CLASS is left unchanged.
BATCH_SIZE = 1
SEQ_LENGTH = 1024  #(1) number of token positions
VOCAB_SIZE = 50257  #(2) rows of the token embedding / width of the logits

D_MODEL = 1600
NUM_HEADS = 25  #(3) attention heads per decoder block
HIDDEN_DIM = D_MODEL*4  #(4) recomputed because D_MODEL changed
N_LAYERS = 48
DROP_PROB = 0.1

In [21]:
# Codeblock 15
class DecoderGPT23(nn.Module):
    """Pre-norm transformer decoder block.

    Each of the two sub-layers (self-attention, then feed-forward) is
    preceded by a LayerNorm and followed by dropout and a residual addition.
    No normalisation is applied after the final addition.

    Args:
        d_model: Embedding width. Defaults to the module-level ``D_MODEL``.
        num_heads: Number of attention heads. Defaults to ``NUM_HEADS``.
        hidden_dim: Inner width of the feed-forward network. Defaults to
            ``HIDDEN_DIM``.
        drop_prob: Dropout probability. Defaults to ``DROP_PROB``.

    The defaults are resolved when the block is constructed, so calling
    ``DecoderGPT23()`` behaves exactly as before.
    """

    def __init__(self, d_model=None, num_heads=None, hidden_dim=None, drop_prob=None):
        super().__init__()

        d_model = D_MODEL if d_model is None else d_model
        num_heads = NUM_HEADS if num_heads is None else num_heads
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        drop_prob = DROP_PROB if drop_prob is None else drop_prob

        self.norm_0 = nn.LayerNorm(d_model)
        self.multihead_attention = nn.MultiheadAttention(embed_dim=d_model,
                                                         num_heads=num_heads,
                                                         batch_first=True)
        self.dropout_0 = nn.Dropout(drop_prob)

        self.norm_1 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(nn.Linear(d_model, hidden_dim),
                                          nn.GELU(),
                                          nn.Linear(hidden_dim, d_model))
        self.dropout_1 = nn.Dropout(drop_prob)

        # Only the feed-forward weight matrices are re-initialised; biases and
        # the attention projections keep PyTorch's defaults.
        nn.init.normal_(self.feed_forward[0].weight, 0, 0.02)
        nn.init.normal_(self.feed_forward[2].weight, 0, 0.02)

    def forward(self, x, attn_mask):
        """Apply the block.

        Args:
            x: Input of shape ``(batch, seq, d_model)`` (the attention layer
                is built with ``batch_first=True``).
            attn_mask: Mask forwarded to ``nn.MultiheadAttention`` as
                ``attn_mask``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        residual = x

        x = self.norm_0(x)
        # Only the attention output is used, so skip computing the averaged
        # attention weights.
        x = self.multihead_attention(x, x, x,
                                     attn_mask=attn_mask,
                                     need_weights=False)[0]
        x = self.dropout_0(x)
        x = x + residual

        residual = x

        x = self.norm_1(x)
        x = self.feed_forward(x)
        x = self.dropout_1(x)
        x = x + residual

        return x

In [19]:
# Codeblock 16
# Smoke test: push a random tensor through a single pre-norm decoder block.
decoder_gpt_2 = DecoderGPT23()
x = torch.randn(BATCH_SIZE, SEQ_LENGTH, D_MODEL)
# Rebuild the look-ahead mask so it matches the current SEQ_LENGTH.
look_ahead_mask = create_mask()

# NOTE: `x` is rebound to the block's output here.
x = decoder_gpt_2(x, look_ahead_mask)

original & residual	: torch.Size([1, 1024, 1600])
after normalization	: torch.Size([1, 1024, 1600])
after attention		: torch.Size([1, 1024, 1600])
after dropout		: torch.Size([1, 1024, 1600])
after addition		: torch.Size([1, 1024, 1600])

x & residual		: torch.Size([1, 1024, 1600])
after normalization	: torch.Size([1, 1024, 1600])
after feed forward	: torch.Size([1, 1024, 1600])
after dropout		: torch.Size([1, 1024, 1600])
after addition		: torch.Size([1, 1024, 1600])


In [20]:
# Codeblock 17
class GPT23(nn.Module):
    """Token embedding + positional encoding + a stack of ``DecoderGPT23``
    blocks, a final LayerNorm, and a linear projection to vocabulary logits.

    Sizes are taken from the module-level configuration constants
    (``VOCAB_SIZE``, ``D_MODEL``, ``N_LAYERS``) at construction time.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(num_embeddings=VOCAB_SIZE,
                                            embedding_dim=D_MODEL)

        self.positional_encoding = PositionalEncoding()

        self.decoders = nn.ModuleList([DecoderGPT23() for _ in range(N_LAYERS)])

        self.norm_final = nn.LayerNorm(D_MODEL)  #(1)

        self.linear = nn.Linear(in_features=D_MODEL, out_features=VOCAB_SIZE)

        nn.init.normal_(self.token_embedding.weight, mean=0, std=0.02)
        nn.init.normal_(self.linear.weight, mean=0, std=0.02)

    def forward(self, x):
        """Run the model on a batch of token ids, printing each stage's shape.

        Args:
            x: Integer token ids; cast with ``.long()`` before the embedding
                lookup. The demo cells pass shape ``(batch, seq)``.

        Returns:
            The vocabulary logits computed from the normalised output of the
            last decoder block.
        """
        print(f"original input\t\t: {x.shape}")

        x = self.token_embedding(x.long())
        print(f"embedded tokens\t\t: {x.shape}")

        # Derive the sequence length from the input instead of relying on a
        # mask that happens to exist at module level (and may be stale after
        # SEQ_LENGTH is changed).
        seq_len = x.shape[-2]

        # Use only the rows needed for this input, on the input's device.
        x = x + self.positional_encoding()[:seq_len].to(x.device)
        print(f"after addition\t\t: {x.shape}")

        # Additive causal mask: 0 on/below the diagonal, -inf above it.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'),
                       dtype=x.dtype, device=x.device),
            diagonal=1,
        )

        for i, decoder in enumerate(self.decoders):
            x = decoder(x, attn_mask=causal_mask)
            print(f"after decoder #{i}\t: {x.shape}")

        x = self.norm_final(x)  #(2)
        print(f"after final norm\t: {x.shape}")

        text_output = self.linear(x)
        print(f"text_output\t\t: {text_output.shape}")

        return text_output

In [22]:
# Codeblock 18
# Smoke test: build the model with the configuration set in Codeblock 14 and
# run random token ids through it.
gpt2 = GPT23()

x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))
# NOTE: `x` is rebound to the returned logits here.
x = gpt2(x)

original input		: torch.Size([1, 1024])
embedded tokens		: torch.Size([1, 1024, 1600])
after addition		: torch.Size([1, 1024, 1600])
after decoder #0	: torch.Size([1, 1024, 1600])
after decoder #1	: torch.Size([1, 1024, 1600])
after decoder #2	: torch.Size([1, 1024, 1600])
after decoder #3	: torch.Size([1, 1024, 1600])
after decoder #4	: torch.Size([1, 1024, 1600])
after decoder #5	: torch.Size([1, 1024, 1600])
after decoder #6	: torch.Size([1, 1024, 1600])
after decoder #7	: torch.Size([1, 1024, 1600])
after decoder #8	: torch.Size([1, 1024, 1600])
after decoder #9	: torch.Size([1, 1024, 1600])
after decoder #10	: torch.Size([1, 1024, 1600])
after decoder #11	: torch.Size([1, 1024, 1600])
after decoder #12	: torch.Size([1, 1024, 1600])
after decoder #13	: torch.Size([1, 1024, 1600])
after decoder #14	: torch.Size([1, 1024, 1600])
after decoder #15	: torch.Size([1, 1024, 1600])
after decoder #16	: torch.Size([1, 1024, 1600])
after decoder #17	: torch.Size([1, 1024, 1600])
after decoder

In [23]:
# Codeblock 19
# Total parameter elements of the `gpt2` model instantiated above
# (displayed as the cell's output).
count_parameters(gpt2)

1636434257

In [24]:
# Codeblock 20
# Rebinds the module-level configuration for the GPT23 model instantiated in
# the next cell as `gpt3`. Anything constructed or called after this cell sees
# these values; N_CLASS is left unchanged.
BATCH_SIZE = 1
SEQ_LENGTH = 2048  # number of token positions
VOCAB_SIZE = 50257  # rows of the token embedding / width of the logits

D_MODEL = 12288  # embedding width
NUM_HEADS = 96  # attention heads per decoder block
HIDDEN_DIM = D_MODEL*4  # recomputed because D_MODEL changed
N_LAYERS = 96  # number of stacked decoder blocks
DROP_PROB = 0.1  # dropout probability

In [None]:
# Codeblock 21
# NOTE(review): with D_MODEL = 12288 and N_LAYERS = 96 this constructor
# allocates on the order of 10^11 float32 parameters; expect it to exceed the
# memory of an ordinary workstation. This cell has no recorded execution.
gpt3 = GPT23()  #(1)

# SEQ_LENGTH changed in the previous cell, so the look-ahead mask must be
# rebuilt; the one left over from the GPT-2 section is 1024x1024 and would not
# match a 2048-token input.
look_ahead_mask = create_mask()

x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))
x = gpt3(x)  #(2)