In [None]:
# Install third-party packages into the notebook runtime.
!pip install datasets
!pip install tokenizers
!pip install sacrebleu
!pip install colab-convert
# Remove any earlier checkout so the clone below starts from a clean state.
!rm -rf gtGPT/
!rm -rf gtgpt
# Clone the gtGPT repository and move its `gtgpt` package into the working directory
# so that `from gtgpt... import ...` resolves.
!git clone https://github.com/Helw150/gtGPT gtGPT
!mv gtGPT/gtgpt/ .

from gtgpt.utils import set_seed
from google.colab import drive
import os
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import html
import random
import numpy as np
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import torch.nn.functional as F
from gtgpt.model import DummyMultiHeadedSelfAttention, DummyBlock, DummyTransformer, DummyEmbedding
from gtgpt.utils import set_seed
# Use the first CUDA device when one is available, otherwise the CPU, and make
# it the default device for tensors created without an explicit `device=`.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.set_default_device(DEVICE)
# cuBLAS workspace configuration. Only one value can be in effect; the earlier
# ":4096:2" assignment was immediately overwritten by this one, so the dead
# store has been removed and the effective value kept.
os.environ["CUBLAS_WORKSPACE_CONFIG"]=":16:8"


# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Work from the project folder so relative file names (e.g. 'train.tsv') resolve there.
os.chdir("/content/drive/My Drive/CS7650/Final Project")

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/CS7650/Final Project'

In [None]:
# Load the three dataset splits. Each file is tab-separated and has no header
# row, so pandas assigns integer column labels.
_TSV_FORMAT = {"sep": '\t', "header": None}
train = pd.read_csv('train.tsv', **_TSV_FORMAT)
test = pd.read_csv('test.tsv', **_TSV_FORMAT)
valid = pd.read_csv('valid.tsv', **_TSV_FORMAT)

In [None]:
class Embedding(DummyEmbedding):
    """Sum of a token-embedding lookup and a position-embedding lookup."""

    def forward(self, idx):
        """
        :param idx: intTensor of shape (B,T)
        :returns embeddings: floatTensor of shape (B,T,n_embd)
        """
        batch_size, seq_len = idx.size()

        # Position ids 0..T-1, repeated (as a broadcast view) for every row of the batch.
        position_ids = torch.arange(seq_len, device=idx.device).expand(batch_size, seq_len)

        return self.vocab_embeddings(idx) + self.position_embeddings(position_ids)

In [None]:
class GenericSelfAttention(DummyMultiHeadedSelfAttention):
    """Multi-head scaled dot-product self-attention with an optional mask."""

    def forward(self, x, attention_mask):
        """
        :param x: float Tensor of shape (batch size, sequence length, embedding dimensionality)
        :param attention_mask: int Tensor of shape (batch size, 1, sequence length, sequence_length);
            positions where the mask equals 0 receive no attention. May be None.
        :returns y: float Tensor of shape (batch size, sequence length, embedding dimensionality)
        """
        batch, seq_len, n_embd = x.size()
        per_head = n_embd // self.n_head

        def split_heads(projected):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return projected.view(batch, seq_len, self.n_head, per_head).transpose(1, 2)

        queries = split_heads(self.q(x))
        keys = split_heads(self.k(x))
        values = split_heads(self.v(x))

        # Scaled dot-product scores of shape (B, n_head, T, T).
        scores = (queries @ keys.transpose(-2, -1)) / (per_head ** 0.5)
        if attention_mask is not None:
            # Masked-out positions get -inf so that softmax assigns them zero weight.
            scores = scores.masked_fill(attention_mask == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values, then merge the heads back into one embedding axis.
        merged = (weights @ values).transpose(1, 2).contiguous().view(batch, seq_len, n_embd)
        return self.hidden_dropout(self.c_proj(merged))

In [None]:
class TransformerBlock(DummyBlock):
    """Transformer block: self-attention followed by an MLP, each applied to a
    layer-normalised input and added back onto the residual stream."""

    def __init__(self, config):
        super().__init__(config, GenericSelfAttention)

    def forward(self, x, attention_mask):
        """
        :param x: float Tensor fed to the block
        :param attention_mask: mask forwarded unchanged to the attention layer
        :returns: Tensor with the attention and MLP updates added residually
        """
        attended = self.attn(self.ln_1(x), attention_mask)
        x = x + attended
        fed_forward = self.mlpf(self.ln_2(x))
        return x + fed_forward

In [None]:
class GenericTransformer(DummyTransformer):
    """Transformer stack shared by the encoder and decoder variants.

    Sub-classes override `get_attention_mask` to choose which positions may
    attend to which.
    """

    def __init__(self, config):
        super().__init__(config, TransformerBlock, Embedding)
        self.block_size = config.block_size # Maximum Number of Tokens which can be encoded at once
        self.vocab_size = config.vocab_size

    def get_attention_mask(self, num_tokens):
        """
        Placeholder mask that lets every position attend to every position.

        :param num_tokens: int Tensor of shape (batch size)
        :returns attention_mask: Tensor of ones with shape (batch_size, 1, max_tokens, max_tokens),
            i.e. the 4-D layout the attention layer documents and broadcasts against its
            (B, n_head, T, T) scores.
        """
        B = num_tokens.shape[0]
        max_tokens = int(num_tokens.max().item())
        return torch.ones((B, 1, max_tokens, max_tokens), device=num_tokens.device)

    def forward(self, idx, targets=None, hidden_cache=None, return_hidden=False):
        """
        :param idx: int Tensor of shape (B,T); -1 marks padding
        :param hidden_cache: float Tensor of shape (B,P_T,n_embd) prepended to the embedded tokens
        :param targets: int Tensor of shape (B,T_T); -1 entries are ignored by the loss
        :param return_hidden: bool
        (if return_hidden is True)
        :returns x: float Tensor of final hidden states, one per cached + token position
        (else)
        :returns logits: float Tensor of shape (B, positions, vocab_size)
        :returns loss: cross-entropy loss Tensor, or None when no targets are given

        NOTE: the final layer norm `ln_f` is applied once, after the last block.
        It used to be applied after every block; checkpoints trained with that
        behaviour will produce different activations with this forward pass.
        """
        # Number of real (non-padding) tokens in each row of idx.
        num_tokens = (idx != -1).type(torch.int).sum(dim=1)
        # Trim trailing all-padding columns using idx's own length. This has to
        # happen before the cache length is added, otherwise nothing is trimmed
        # and the attention mask ends up smaller than the sequence.
        max_own = int(num_tokens.max().item())
        idx = idx.masked_fill(idx == -1, int(0)).type(torch.int)[:, :max_own]
        if targets is not None:
            targets = targets[:, :max_own]
        if hidden_cache is not None:
            # Cached positions precede the tokens. They are all counted as
            # attendable; padding inside the cache is not tracked here.
            num_tokens = num_tokens + hidden_cache.shape[1]
        attention_mask = self.get_attention_mask(num_tokens)

        x = self.transformer['embedding'](idx)
        if hidden_cache is not None:
            x = torch.cat([hidden_cache, x], dim=1)

        for block in self.transformer['h']:
            x = block(x, attention_mask)
        # Final layer norm: once, after the whole stack.
        x = self.transformer['ln_f'](x)

        if return_hidden:
            # Skip the vocabulary projection entirely when only hidden states are wanted.
            return x

        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            s_logits = logits
            if hidden_cache is not None:
                # The logit at the last cached position predicts the first
                # target, so shift the window one step left and drop the final logit.
                s_logits = logits[:, hidden_cache.shape[1]-1:-1].contiguous()
            loss = F.cross_entropy(
                s_logits.reshape(-1, self.vocab_size), targets.reshape(-1), ignore_index=-1
            )

        return logits, loss

In [None]:
class Encoder(GenericTransformer):
    """Encoder Style Transformer with Bidirectional Attention"""
    def get_attention_mask(self, num_tokens):
        """
        Build a padding mask: every query position may attend to every key
        position that holds a real token.

        :param num_tokens: int Tensor of shape (batch size)
        :returns attention_mask: int tensor of shape (batch_size, 1, max_tokens, max_tokens)
            where max_tokens = num_tokens.max()
        """
        B = num_tokens.shape[0]
        # The mask must cover exactly the positions that are attended over.
        # Capping it at block_size (as was done before) only ever produced a
        # mask smaller than the sequence, e.g. when cached hidden states are
        # prepended to the tokens.
        max_tokens = int(num_tokens.max().item())

        # Key position j is valid for row b when j < num_tokens[b].
        positions = torch.arange(max_tokens, device=num_tokens.device).reshape(1, max_tokens)
        key_is_real = (positions < num_tokens.reshape(B, 1)).int()

        # Repeat the per-key validity row for every query position.
        attention_mask = key_is_real.reshape(B, 1, 1, max_tokens).expand(B, 1, max_tokens, max_tokens)
        return attention_mask.contiguous()

In [None]:
class Decoder(Encoder):
    """Decoder Style model with a Causal Attention Mask"""

    def get_attention_mask(self, num_tokens):
        """
        :param num_tokens: int Tensor of shape (batch size)
        :returns attention_mask: int tensor with the same shape as the parent class's
            mask, with everything above the diagonal zeroed
        """
        # Start from the parent's padding mask and keep only the lower triangle
        # of the last two dimensions, so a position cannot attend to later ones.
        return torch.tril(super().get_attention_mask(num_tokens))

In [None]:
def generate(model, idx, max_new_tokens, temperature=1.0):
    """
    Autoregressively sample `max_new_tokens` tokens and append them to `idx`.

    :param model: callable returning `(logits, loss)` for a (B, T) token tensor. If it
        exposes a `block_size` attribute, the context fed to it is cropped to the
        most recent `block_size` tokens.
    :param idx: int Tensor of shape (B, T)
    :param max_new_tokens: int
    :param temperature: Float; logits are divided by it before the softmax
    :returns idx: int Tensor of shape (B, T+max_new_tokens)
    """
    # Models without a `block_size` attribute are never cropped.
    block_size = getattr(model, "block_size", None)
    # Sampling needs no gradients; disabling autograd avoids building a graph per step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            context = idx
            if block_size is not None and idx.size(1) > block_size:
                context = idx[:, -block_size:]
            logits, _ = model(context)

            # Only the distribution at the last position is needed.
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)

            # `next_token` rather than `next`, which would shadow the builtin.
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
    return idx

In [None]:
class EncoderDecoder(nn.Module):
    """Encoder-Decoder Model which combines the two architectures"""
    def __init__(self, encoder_config, decoder_config):
        super().__init__()
        # Add end of sequence token.
        # NOTE: this increments the caller's config object in place.
        decoder_config.vocab_size += 1
        self.vocab_size = decoder_config.vocab_size
        self.encoder = Encoder(encoder_config)
        self.decoder = Decoder(decoder_config)

    def configure_optimizers(self, train_config):
        """Concatenate what the encoder's and decoder's `configure_optimizers` return."""
        enc_groups = self.encoder.configure_optimizers(train_config)
        dec_groups = self.decoder.configure_optimizers(train_config)
        return enc_groups + dec_groups

    def forward(self, prefix, targets=None):
        """
        Encode `prefix`, then run the decoder on top of the encoder's hidden states.

        :param prefix: int Tensor of shape (B,P_T)
        :param targets: optional int Tensor of shape (B,T_T), padded with -1. When given it
            is used both as the decoder input (teacher forcing) and as the loss target.
        :returns logits: logits as returned by the decoder (over the encoder positions
            followed by the decoder positions)
        :returns loss: loss as returned by the decoder, or None when `targets` is None
        """
        B = prefix.shape[0]
        if targets is None:
            # Nothing decoded yet: an empty (B, 0) token tensor.
            idx = torch.empty((B, 0), dtype=torch.long, device=prefix.device)
        else:
            # Drop trailing columns that are padding for every row so the decoder
            # input holds no all-padding positions.
            longest = int((targets != -1).sum(dim=1).max().item())
            targets = targets[:, :longest]
            idx = targets
        encoder_hidden = self.encoder(prefix, return_hidden=True)

        # Pass the targets as targets too. Previously they were passed only as
        # the decoder input, so the returned loss was always None.
        logits, loss = self.decoder(idx, targets=targets, hidden_cache=encoder_hidden)
        return logits, loss


In [None]:
def prefix_generate(model, prefix, max_new_tokens, temperature=1.0):
    """
    Sample `max_new_tokens` tokens from the decoder, conditioned on the encoded prefix.

    :param model: object exposing `encoder(prefix, return_hidden=True)` and
        `decoder(idx, hidden_cache=...)`, the latter returning `(logits, loss)`
    :param prefix: int Tensor of shape (B, T)
    :param max_new_tokens: int
    :param temperature: Float; logits are divided by it before the softmax
    :returns idx: int Tensor of shape (B, max_new_tokens)
    """
    # Start with no decoded tokens: one empty row per prefix, so the
    # concatenations below also work for batch sizes larger than one.
    idx = torch.empty((prefix.shape[0], 0), dtype=torch.long, device=prefix.device)
    with torch.no_grad():
        # The prefix is encoded once and reused at every decoding step.
        hidden_states = model.encoder(prefix, return_hidden=True)
        for _ in range(max_new_tokens):
            logits, _ = model.decoder(idx, hidden_cache=hidden_states)

            # Only the distribution at the last position is needed.
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)

            next_token = torch.multinomial(probs, num_samples=1)
            # Append the sampled token (previously the builtin `next` was
            # concatenated here, which raised a TypeError).
            idx = torch.cat((idx, next_token), dim=1)
    # idx holds only generated tokens, so no trailing slice is required.
    return idx

In [None]:
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from gtgpt.trainer import Trainer

import pickle

class SortDataset(Dataset):
    """
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Each item is the pair (input sequence, sorted sequence), returned as two
    separate tensors of `length` digits each.
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits

    def __len__(self):
        # Examples are generated on the fly; this is a nominal epoch size.
        return 10000 # ...

    def get_vocab_size(self):
        return self.num_digits

    def get_block_size(self):
        # Fixed maximum sequence length reported to the model.
        return 20

    def __getitem__(self, idx):
        # Local import keeps this class self-contained. CRC32 is used instead of
        # the built-in hash() because hash() of bytes/str is salted per Python
        # process, which made the train/test membership of an input change
        # from one run to the next.
        import zlib

        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rare
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unique digits, re-sample
                    continue
            # figure out if this generated example is train or test based on a
            # checksum of its contents that is stable across processes
            h = zlib.crc32(repr(inp.tolist()).encode("utf-8"))
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok

        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # x is the unsorted problem, y is its sorted solution
        x = inp.clone()
        y = sol.clone()
        return x, y

In [None]:
from gtgpt.trainer import Trainer
from tqdm import tqdm
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import UnigramTrainer, BpeTrainer
from tokenizers.models import Unigram, BPE
from datasets import load_dataset
import random

class LMDataset(Dataset):
    """
    Tokenized text dataset for language-model training.

    For an `EncoderDecoder` model each string is split at "[SEP]" into an encoder
    input (up to and including "[SEP]") and a decoder target (the text after it).
    For any other model the example is the usual next-token pair: the token
    sequence without its last token, and the same sequence shifted left by one.
    Both sequences are right-padded with -1 to `block_size`.
    """

    def __init__(self, split, data, tokenizer, model):
        """
        :param split: 'train' (items 0..29999) or 'test' (items 30000..39999)
        :param data: sequence of strings
        :param tokenizer: object with `encode(str) -> list[int]` and `get_vocab_size()`
        :param model: the model that will be trained; only its type is inspected
        """
        assert split in {'train', 'test'}
        self.model_type = "EncDec" if isinstance(model, EncoderDecoder) else "Dec"
        if split == "train":
          self.start_split = 0
          self.end_split = 30000
        else:
          self.start_split = 30000
          self.end_split = 40000
        self.split = split
        self.data = data
        self.tokenizer = tokenizer
        # Tokenize every string once; the result provides the block size and is
        # reused by process() instead of encoding the whole corpus a second time.
        encoded = [self.tokenizer.encode(inp) for inp in self.data]
        self.block_size = max((len(ids) for ids in encoded), default=0)
        self.process(encoded)

    def __len__(self):
        return len(self.data[self.start_split:self.end_split])

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def get_block_size(self):
        # Length of the longest tokenized string in the data; every example is
        # padded to this length.
        return self.block_size

    def process(self, encoded=None):
      """
      Replace `self.data` (raw strings) with padded `(x, y)` token-id list pairs.

      :param encoded: optional list with the token ids of each string in `self.data`;
          when omitted the strings are encoded here.
      """
      if encoded is None:
        encoded = [self.tokenizer.encode(inp) for inp in self.data]
      block_size = self.get_block_size()
      new_data = []
      for inp, full_ids in zip(tqdm(self.data), encoded):
        if self.model_type == "EncDec":
          parts = inp.split("[SEP]")
          x = self.tokenizer.encode(parts[0] + "[SEP]")
          y = self.tokenizer.encode(parts[1])
        else:
          # Slicing copies, so the cached encoding is left untouched.
          x = full_ids[:-1]
          y = full_ids[1:]
        x = x + ([-1] * (block_size - len(x)))
        y = y + ([-1] * (block_size - len(y)))
        new_data.append((x, y))
      self.data = new_data

    def __getitem__(self, idx):
      x, y = self.data[self.start_split + idx]
      return torch.tensor(x), torch.tensor(y)

def format_review(row):
  """Join a translation pair into one training string: "<eng>[SEP]<engyay>[END]".

  :param row: mapping whose 'translation' entry holds the 'eng' and 'engyay' texts
  :returns: dict with the single key "text"
  """
  pair = row['translation']
  source, target = pair['eng'], pair['engyay']
  return {"text": f"{source}[SEP]{target}[END]"}

In [None]:
class Tokenizer():
  """
  Byte-level tokenizer: every UTF-8 byte is its own token (ids 0..255) and the
  markers "[SEP]" and "[END]" map to the extra ids 256 and 257.
  """

  def __init__(self):
    self.DELIM = "|[DELIM]|"
    special_texts = ["[SEP]", "[END]"]
    # Delimiter-joined byte strings of the markers, kept as a public attribute.
    self.special_tokens = [self.stringify(list(bytes(tok, "utf-8"))) for tok in special_texts]
    self.vocab_size = 256 + len(self.special_tokens)
    # Raw marker bytes <-> token id lookups used by encode/decode.
    self._special_ids = {bytes(tok, "utf-8"): 256 + i for i, tok in enumerate(special_texts)}
    self._special_bytes = {token_id: raw for raw, token_id in self._special_ids.items()}
    # The capturing group makes split() keep the markers as separate pieces.
    self._special_pattern = re.compile(
      b"(" + b"|".join(re.escape(raw) for raw in self._special_ids) + b")"
    )

  def stringify(self, b_enc):
    """Join the decimal form of each value with the delimiter."""
    s_enc = [str(b) for b in b_enc]
    return self.DELIM.join(s_enc)

  def get_vocab_size(self):
    return self.vocab_size

  def encode(self, inp):
    """
    :param inp: str
    :returns: list of int token ids; an empty string gives an empty list

    Matching is done on the raw bytes. Matching on the delimiter-joined decimal
    string (as before) could start in the middle of a number: byte 191 followed
    by "SEP]" contained the digits of "[SEP]" and produced the invalid id 1256.
    """
    ids = []
    for piece in self._special_pattern.split(inp.encode("utf-8")):
      special_id = self._special_ids.get(piece)
      if special_id is not None:
        ids.append(special_id)
      else:
        # Iterating bytes yields the integer value of each byte.
        ids.extend(piece)
    return ids

  def decode(self, inp):
    """
    :param inp: iterable of int token ids
    :returns: bytes, with marker ids expanded back to their text; empty input gives b""
    :raises ValueError: for ids outside the vocabulary
    """
    out = bytearray()
    for token in inp:
      token = int(token)
      if token in self._special_bytes:
        out += self._special_bytes[token]
      elif 0 <= token < 256:
        out.append(token)
      else:
        raise ValueError(f"token id {token} is outside the vocabulary")
    return bytes(out)

In [None]:
from tqdm import tqdm
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.set_default_device(DEVICE)
def train(data, model_type="Decoder",
          learning_rate = 5e-4,
          batch_size = 16,
          max_iters = 10000,
          dec_n_layer=1,
          dec_n_embd=52,
          dec_n_head = 1,
          enc_n_layer=None,
          enc_n_embd=None,
          enc_n_head=None):
  """
  Build a model, train it on `data`, and print a sample generation every 100 iterations.

  :param data: list of strings of the form "<source>[SEP]<target>[END]"
  :param model_type: "Decoder" for a decoder-only model; any other value builds an EncoderDecoder
  :param learning_rate: copied into the trainer config
  :param batch_size: copied into the trainer config
  :param max_iters: copied into the trainer config
  :param dec_n_layer: decoder depth
  :param dec_n_embd: decoder embedding width
  :param dec_n_head: decoder attention heads
  :param enc_n_layer: encoder depth; None falls back to the decoder value
  :param enc_n_embd: encoder embedding width; None falls back to the decoder value
  :param enc_n_head: encoder attention heads; None falls back to the decoder value
  :returns: (model, trainer)
  """
  # Model Setup
  tokenizer = Tokenizer()
  # Longest tokenized example; computed once and shared by both configs.
  block_size = max(len(tokenizer.encode(inp)) for inp in data)
  dec_config = DummyTransformer.get_default_config()
  dec_config.vocab_size = tokenizer.get_vocab_size()
  dec_config.block_size = block_size
  dec_config.n_layer = dec_n_layer
  dec_config.n_embd = dec_n_embd
  dec_config.n_head = dec_n_head
  if model_type == "Decoder":
    model = Decoder(dec_config)
  else:
    enc_config = DummyTransformer.get_default_config()
    enc_config.vocab_size = tokenizer.get_vocab_size()
    enc_config.block_size = block_size
    # Writing None into the config would make the encoder unbuildable, so
    # unspecified encoder sizes mirror the decoder's.
    enc_config.n_layer = dec_n_layer if enc_n_layer is None else enc_n_layer
    enc_config.n_embd = dec_n_embd if enc_n_embd is None else enc_n_embd
    enc_config.n_head = dec_n_head if enc_n_head is None else enc_n_head
    model = EncoderDecoder(enc_config, dec_config)

  # Training Config
  train_config = Trainer.get_default_config()
  train_config.learning_rate = learning_rate
  train_config.max_iters = max_iters
  train_config.batch_size = batch_size
  train_config.num_workers = 0
  train_config.device = DEVICE
  train_ds = LMDataset("train", data, tokenizer, model)
  # Training Loop
  trainer = Trainer(train_config, model, train_ds)
  def batch_end_callback(trainer):
      # Report the loss and a sample generation every 100 iterations.
      if trainer.iter_num % 100 != 0:
          return
      tqdm.write(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
      prefix = torch.tensor([tokenizer.encode("translate this to piglatin[SEP]")])
      # Sample with dropout disabled and without building an autograd graph,
      # then put the model back into whatever mode it was in.
      was_training = model.training
      model.eval()
      try:
          with torch.no_grad():
              if model_type == "Decoder":
                output = generate(model, prefix, 100, 0.1)
              else:
                output = prefix_generate(model, prefix, 100, 0.1)
      finally:
          model.train(was_training)
      print(tokenizer.decode(output.cpu().numpy()[0]).split(bytes("[END]", "utf-8"))[0])
  trainer.set_callback('on_batch_end', batch_end_callback)
  trainer.run()
  return model, trainer

In [None]:
# Keep the second and third columns of the raw file and name them 'label' and 'data'.
train_df = pd.read_csv('train.tsv', sep='\t', header=None).iloc[:, 1:3]
train_df.columns = ['label', 'data']
# One training string per row: "<data>[SEP]<label>[END]".
train_df['formatted_text'] = train_df.apply(
    lambda row: f"{row['data']}[SEP]{row['label']}[END]",
    axis=1,
)

data = train_df['formatted_text'].tolist()

In [None]:
# Train a decoder-only model with 4 layers, 128-dimensional embeddings and
# 4 attention heads; the encoder settings are unused for this model type.
model, trainer = train(
    data,
    model_type="Decoder",
    learning_rate=5e-4,
    batch_size=16,
    max_iters=10000,
    dec_n_layer=4,
    dec_n_embd=128,
    dec_n_head=4,
    enc_n_layer=None,
    enc_n_embd=None,
    enc_n_head=None,
)
# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()

number of parameters: 1.24M


100%|██████████| 10240/10240 [00:01<00:00, 5296.82it/s]


running on device cuda:0
iter_dt 0.00ms; iter 0: train loss 5.57106
b"translate this to piglatin[SEP]md\xb0'Le \x1bo\xc6\x9fg\x1c=\xd6\xc0O\x08enh\x94\x86\xc2\x9a*\xb2H\x17\xf4*EE\xcf\xb2\xb1:\xfc\xea\x05(T J\xb9\xbd\xc5 e]\xdeth\x8a\xfcrox$\xa1o\xe5\x11p\xcb\xb7\x84\xae\xf3\xea\xea\xb3\xda\xae\xc66\xf1\xda\xef\x84wr9\x92\x8fW\x82\x94\x86\xacse\x0fk\xb9C\xc5[SEP]\x08\xe3"
iter_dt 52.28ms; iter 100: train loss 2.57359
b'translate this to piglatin[SEP]ban the t the the the the the the the the the the the the the the the are the the the the t the the '
iter_dt 51.97ms; iter 200: train loss 2.39484
b'translate this to piglatin[SEP]me'
iter_dt 50.96ms; iter 300: train loss 2.39573
b'translate this to piglatin[SEP]haly-true'
iter_dt 70.26ms; iter 400: train loss 2.34575
b'translate this to piglatin[SEP]fare'
iter_dt 50.75ms; iter 500: train loss 2.26843
b'translate this to piglatin[SEP]faly-true'
iter_dt 50.80ms; iter 600: train loss 2.22815
b'translate this to piglatin[SEP]farue'
iter_dt 55

Decoder(
  (transformer): ModuleDict(
    (embedding): Embedding(
      (vocab_embeddings): Embedding(258, 128)
      (position_embeddings): Embedding(3205, 128)
    )
    (h): ModuleList(
      (0-3): 4 x TransformerBlock(
        (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): GenericSelfAttention(
          (k): Linear(in_features=128, out_features=128, bias=True)
          (v): Linear(in_features=128, out_features=128, bias=True)
          (q): Linear(in_features=128, out_features=128, bias=True)
          (c_proj): Linear(in_features=128, out_features=128, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (hidden_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=128, out_features=512, bias=True)
          (c_proj): Linear(in_features=512, out_features=128, bias=True)
          (act): NewGELU()

In [None]:
# Persist only the model's parameters (its state_dict) to the working directory.
torch.save(model.state_dict(), 'transformer_decoder.pth')

In [None]:
# Rebuild the decoder configuration explicitly. `dec_config` only existed as a
# local variable inside train(), so referring to it here raised a NameError on
# a fresh kernel. The sizes must match the ones the checkpoint was trained with
# (4 layers, 128-dimensional embeddings, 4 heads in the training cell).
reload_tokenizer = Tokenizer()
dec_config = DummyTransformer.get_default_config()
dec_config.vocab_size = reload_tokenizer.get_vocab_size()
dec_config.block_size = max(len(reload_tokenizer.encode(inp)) for inp in data)
dec_config.n_layer = 4
dec_config.n_embd = 128
dec_config.n_head = 4
model = Decoder(dec_config)

# Load the file written by the save cell ('transformer_decoder.pth'); this cell
# previously read 'trained_model.pth', which nothing in the notebook writes.
# map_location lets a GPU-trained checkpoint load on a CPU-only runtime.
model.load_state_dict(torch.load('transformer_decoder.pth', map_location=DEVICE))

model.eval()

In [None]:
from sacrebleu.metrics import BLEU

def eval(trainer, data, tokenizer, eval_model=None, start=10000, stop=10100):
    """
    Generate a completion for each example in `data[start:stop]` and report
    exact-match accuracy and corpus BLEU against the reference text.

    NOTE: this function shadows the builtin `eval`; the name is kept because
    the notebook calls it.

    :param trainer: unused; kept for call compatibility
    :param data: list of strings of the form "<source>[SEP]<target>[END]"
    :param tokenizer: object with `encode(str) -> list[int]` and `decode(ids) -> bytes`
    :param eval_model: model to evaluate; defaults to the notebook-level `model`
    :param start: index of the first evaluated example
    :param stop: index one past the last evaluated example
    :returns results: float Tensor with 1.0 per exact match and 0.0 otherwise
    """
    net = model if eval_model is None else eval_model
    bleu = BLEU()
    results = []
    references = []
    hypotheses = []
    for sent in tqdm(data[start:stop]):
        parts = sent.split("[SEP]")
        inp = torch.tensor([tokenizer.encode(parts[0] + "[SEP]")])
        reference = parts[1].split("[END]")[0]
        tgt = bytes(reference, "utf-8")
        # Generate up to the model's block size; the text after "[END]" is discarded below.
        cat = generate(net, inp, net.block_size-len(inp[0]), 0.1)
        tgt_candidate = tokenizer.decode(cat.cpu().numpy()[0])
        tgt_candidate = tgt_candidate.split(b"[END]")[0].split(b"[SEP]")[1]
        # BLEU needs real text: decode the generated bytes rather than taking
        # str() of a bytes object, which yields its "b'...'" repr.
        references.append(reference)
        hypotheses.append(tgt_candidate.decode("utf-8", errors="replace"))
        # compare the predicted sequence to the true sequence
        results.append(tgt == tgt_candidate)
    if not results:
        print("No examples in data[%d:%d]; nothing to evaluate." % (start, stop))
        return torch.tensor([]).type(torch.float)
    results = torch.tensor(results).type(torch.float)
    print("\n\nExact Match: %d/%d = %.2f%% correct" % (torch.sum(results), len(results), 100*torch.mean(results)))
    # sacreBLEU takes a list of reference streams, each holding one reference
    # per hypothesis. There is a single stream here, hence [references].
    score = bleu.corpus_score(hypotheses, [references])
    print(score)

    return results

# Evaluation needs no gradients, so run it with autograd disabled.
with torch.no_grad():
  results = eval(trainer, data, Tokenizer())

100%|██████████| 100/100 [59:54<00:00, 35.95s/it]



Exact Match: 21/100 = 21.00% correct
BLEU = 0.00 100.0/0.0/0.0/0.0 (BP = 1.000 ratio = 1.000 hyp_len = 1 ref_len = 1)



