In [6]:
!pip install tiktoken
!pip install torch
!pip install numpy
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [82]:
import torch
import torch.nn as nn
import math

from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

device = "cuda"
# device = "mps"

# A helper config class that incorporates model's parameters
class Config:
    embed_dropout = 0.1
    ff_dropout = 0.1
    attn_dropout = 0.1

    num_embed = 768
    num_heads = 12
    num_blocks = 12

    batch_size = 32

    def __init__(self, vocab_size, max_seq_len) -> None:
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len

In [54]:
class SelfAttention(nn.Module):

    def __init__(self, config: Config):
        super().__init__()

        if config.num_embed % config.num_heads != 0:
            raise ValueError("num_embed % num_heads != 0")
        
        self.num_embed = config.num_embed
        self.num_heads = config.num_heads
        
        self.c_attn = nn.Linear(config.num_embed, 3 * config.num_embed)  # key, query, value
        self.c_proj = nn.Linear(config.num_embed, config.num_embed)

        # regularization
        self.attn_dropout = nn.Dropout(config.attn_dropout)
        self.resid_dropout = nn.Dropout(config.ff_dropout)

        # Mask that makes sure that attention only affects left tokens (previous, not future ones)
        self.register_buffer("bias", torch.tril(torch.ones(config.max_seq_len, config.max_seq_len))
                                     .view(1, 1, config.max_seq_len, config.max_seq_len))

    def forward(self, x):
        B, T, C = x.size()  # batch size, seq len, num_embed

        # query, key, value for every head in batch
        query, key, value = self.c_attn(x).split(self.num_embed, dim=2)

        key = key.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
        query = query.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
        value = value.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)

        # Causal self attention
        atn = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(key.size(-1)))
        atn = atn.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        atn = F.softmax(atn, dim=-1)
        atn = self.attn_dropout(atn)

        y = atn @ value
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))

        return y

class Block(nn.Module):
    def __init__(self, config: Config):
        super().__init__()

        self.ln_1 = nn.LayerNorm(config.num_embed)

        self.attention = SelfAttention(config)

        self.ln_2 = nn.LayerNorm(config.num_embed)

        # TODO: check
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.num_embed, 4 * config.num_embed),
            c_proj  = nn.Linear(4 * config.num_embed, config.num_embed),
            act     = nn.GELU(),
            dropout = nn.Dropout(config.ff_dropout),
        ))

        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))

        return x

class GPT(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()

        self.max_seq_len = config.max_seq_len
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.num_embed),
            wpe = nn.Embedding(config.max_seq_len, config.num_embed),
            dropout = nn.Dropout(config.embed_dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.num_blocks)]),
            ln_f = nn.LayerNorm(config.num_embed)
        ))

        self.head = nn.Linear(config.num_embed, config.vocab_size)

    def forward(self, x, targets=None):
        # x.shape = (Batches, Seq length)

        seq_len = x.size(1)

        if seq_len > self.max_seq_len:
            raise ValueError("Sequence length is > max allowed length")
        
        token_emb = self.transformer.wte(x)  # Batch size, seq length, num_embed
        # print(f"token_emb: {token_emb}")

        positions = torch.arange(0, seq_len,
                               dtype=torch.long,
                               device=device).unsqueeze(0)  # (1, max_seq_len)

        pos_emb = self.transformer.wpe(positions)  # 1, max_seq_len, num_embed

        x = self.transformer.dropout(token_emb + pos_emb)
        # print(f"x dropout: {x}")

        for block in self.transformer.h:
            x = block(x)
        # print(f"x blocks: {x}")

        x = self.transformer.ln_f(x)
        # print(f"x ln_f: {x}")

        logits = self.head(x)
        # print(f"logits.shape: {logits.shape}")
        # print(f"targets.shape: {targets.shape}")

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)
            # print(f"LOGITS: {logits}\nTargets: {targets}")

        return logits, loss
    
    def generate(self, xs, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        
        for _ in range(max_new_tokens):
            if xs.size(1) > self.max_seq_len:
                x = xs[:, -self.max_seq_len:]
            else:
                x = xs
            
            logits, _ = self(x)

            # Taking last logits
            logits = logits[:, -1, :] / temperature  # Also scaling by temperature

            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)

            # We can either sample from distribution or choose using top_k
            if do_sample:
                x_next = torch.multinomial(probs, num_samples=1)
            else:
                _, x_next = torch.topk(probs, k=1, dim=-1)
            
            xs = torch.cat((xs, x_next), dim=1)  # Adding a chosen token to the sequence
        return xs

In [9]:
# tiktoken

import tiktoken

# enc = tiktoken.encoding_for_model("gpt-4")
enc = tiktoken.get_encoding("gpt2")

test_text = "Hello, it's me!"
test_enc = enc.encode(test_text)

print(test_enc)

[15496, 11, 340, 338, 502, 0]


In [10]:
vocab_size = enc.n_vocab

# print(encoded_data)
# print([enc.decode([token]) for token in encoded_data])
print(f"vocab size: {vocab_size}")

vocab size: 50257


We will be processing several batches in parallel to accelerate training process

In [11]:
max_seq_len = 100

config = Config(vocab_size, max_seq_len)

We can see that the model gives us random output. Let's train it!

In [35]:
import pandas as pd
import random

class JokeDataset(Dataset):
    def __init__(self, file, tokenizer, max_seq_len):
        
        df = pd.read_csv(file, index_col=0)

        self.jokes = []
        for row in df.iterrows():
            encoded = tokenizer.encode(row[1][0])
                                       
            self.jokes.extend(encoded)

    def __len__(self):
        return len(self.jokes)

    def __getitem__(self, idx):
        if len(self.jokes) - idx < max_seq_len + 1:
            idx = len(self.jokes) - max_seq_len - 1

        encoded_sample = self.jokes[idx:idx + max_seq_len + 1]

        x = encoded_sample[:max_seq_len]
        y = encoded_sample[1:]

        return torch.tensor(x), torch.tensor(y)

In [None]:
enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})

In [36]:
train_dataset = JokeDataset("shortjokes.csv", enc, config.max_seq_len)

In [24]:
inp, tar = train_dataset.__getitem__(3)

print(enc.decode(inp.tolist()))
print(enc.decode(tar.tolist()))

ating a documentary about narrators] "I can't hear what they're saying cuz I'm talking"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J.
 a documentary about narrators] "I can't hear what they're saying cuz I'm talking"Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper.If I could have dinner with anyone, dead or alive... ...I would choose alive. -B.J. Nov


In [None]:
torch.autograd.set_detect_anomaly(False)

In [64]:
model = GPT(config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model_path = "/content/drive/MyDrive/IU/Semester_6/IU_NLP_Project/model"

In [114]:
import tqdm
import time

num_workers = 2
grad_norm_clip = 1.0

train_loader = DataLoader(
                            train_dataset,
                            sampler=torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=int(1e10)),
                            shuffle=False,
                            pin_memory=True,
                            batch_size=config.batch_size,
                            num_workers=num_workers,
                        )

model.train()
epochs = 5
max_iters = 500
iter_num = 0
iter_time = time.time()
data_iter = iter(train_loader)

best_loss = 4.1

for epoch in tqdm.tqdm(range(epochs)):
    for curr_iter in range(max_iters):

        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)  # Start again if reached the end
            batch = next(data_iter)

        batch = [t.to(device) for t in batch]
        x, y = batch
        # print(f"X: {x}")

        logits, loss = model(x, y)
        # print(loss)

        model.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        
        if curr_iter % 100 == 0:
            print(f"epoch: {epoch}; iter_num: {iter_num}; loss: {loss}")
            if loss < best_loss:
                best_loss = loss
                torch.save(model.state_dict(), f"{model_path}_{epoch}_{iter_num}_{loss}")

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 0; iter_num: 0; loss: 4.499292373657227
epoch: 0; iter_num: 0; loss: 4.6021809577941895
epoch: 0; iter_num: 0; loss: 4.5851287841796875
epoch: 0; iter_num: 0; loss: 4.401492118835449


  0%|          | 0/5 [04:46<?, ?it/s]


KeyboardInterrupt: ignored

In [71]:
model.eval()
print(enc.decode(model.generate(xs = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=20)[0].tolist()))

!What do you call a black guy who is a dog? A pilot, you racist.What do


In [57]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [68]:
model_path = "/content/drive/MyDrive/IU/Semester_6/IU_NLP_Project/model"

torch.save(model.state_dict(), model_path)

Use this to test the model

In [83]:
import tiktoken

load_model_path = "/content/drive/MyDrive/IU/Semester_6/IU_NLP_Project/model_loss_4.4"
enc = tiktoken.get_encoding("gpt2")

max_seq_len = 100

config = Config(vocab_size, max_seq_len)
model = GPT(config).to(device)
model.load_state_dict(torch.load(load_model_path))
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(100, 768)
    (dropout): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): GELU(approximate='none')
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  

In [118]:
starting_string = "A man walks into a bar"

start = torch.tensor([enc.encode(starting_string)], dtype=torch.long, device=device)
print(start)

print(enc.decode(model.generate(xs = start, max_new_tokens=30, do_sample=True)[0].tolist()))

tensor([[0]], device='cuda:0')
!A humanjerk walk into a bar by Now it's the bartender looks at it and the other.What do you call a burger that loves clown


In [45]:
torch.cuda.empty_cache()