# Install and import dependencies

In [None]:
# Install some required libraries
# Feel free to add more if you want
!pip install -q python-levenshtein torchsummaryX==1.1.0 wandb kaggle pytorch-nlp datasets tiktoken

In [None]:
import torch
import torchaudio
from torch import nn, Tensor
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
# import torchsummary
from tqdm.auto import tqdm
import numpy as np
import os

import gc
import time

import pandas as pd
from tqdm.notebook import tqdm as blue_tqdm
import matplotlib.pyplot as plt
import seaborn
import json

import math
from typing import Optional, List

import torchaudio.transforms as tat


# Optional imports for experiment logging, model summaries and edit distance.
# Only ImportError is tolerated so that unrelated failures (and Ctrl-C) are
# not silently swallowed; the missing module is reported to ease debugging.
try:
    import wandb
    import torchsummaryX
    import Levenshtein
except ImportError as err:
    print("Didnt install some/all imports")
    print(f"Import failure: {err}")

import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print("Device: ", DEVICE)

# Create dataset class and generate datasets

First, generate the `.bin` files containing the tokenized dataset for your task by running `{task}-prep.py`. Then load those files as NumPy arrays into `train_text` and `val_text`, respectively.

In [None]:
# Placeholders: these two assignments are intentionally left incomplete and
# must be filled in before the cell can run. Each name should be bound to the
# tokenized dataset loaded as a numpy array (see the instructions above);
# TextDataset below slices them and calls `.astype(np.int64)` on the slices.
train_text =
val_text =

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Non-overlapping fixed-length windows over an array of token ids.

    Item ``k`` is the pair ``(x, y)`` where ``x`` is
    ``text[k*window : (k+1)*window]`` and ``y`` is the same slice shifted one
    position to the right (the next-token targets). Both are int64 tensors.

    Args:
        text: numpy array of token ids (anything supporting ``len``, slicing
            and ``.astype`` on the slice).
        window: number of tokens per sample; must be positive.

    Raises:
        ValueError: if ``window`` is not positive.
    """

    def __init__(self, text, window): # Feel free to add more arguments
        if window <= 0:
            raise ValueError(f"window must be positive, got {window}")
        self.text = text
        self.window = window
        # One token is held back so the last window still has a full shifted
        # target; clamp at 0 so an empty array yields an empty dataset
        # instead of a negative length.
        self.length = max(0, (len(text) - 1) // window)

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        # Support negative indices like a regular sequence.
        if ind < 0:
            ind += self.length
        # Raising IndexError is what terminates plain `for item in dataset`
        # iteration, and prevents returning short/empty windows.
        if not 0 <= ind < self.length:
            raise IndexError(
                f"index {ind} out of range for dataset of length {self.length}"
            )

        i = ind * self.window

        x = torch.from_numpy(self.text[i:i + self.window].astype(np.int64))
        y = torch.from_numpy(self.text[i + 1:i + 1 + self.window].astype(np.int64))

        return x, y

In [None]:
# Wrap both token arrays in fixed-window datasets (512 tokens per sample).
train_data = TextDataset(train_text, 512)
val_data = TextDataset(val_text, 512)

In [None]:
# Training batches are shuffled; validation batches keep dataset order.
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=32, shuffle=True, num_workers=4, pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(
    val_data, batch_size=32, shuffle=False, num_workers=2, pin_memory=True,
)

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))

In [None]:
# Smoke test for the data pipeline: pull a single batch from the training
# loader, print the shapes of the input and target tensors, and stop.
for i, data in enumerate(train_loader):
    # Each batch is an (inputs, targets) pair as returned by TextDataset.
    x, y = data
    print(x.shape, y.shape)
    # One batch is enough to confirm the loader works.
    break

# Define model

In [None]:
# Model hyper-parameters shared by the modules defined below.
config = dict(
    context_len=512,   # maximum sequence length the positional table covers
    vocab_size=50304,  # size of the embedding table / output layer
    num_heads=8,       # attention heads per decoder block
    d_model=512,       # embedding width
)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal mask.

    Args:
        n_embd: embedding width of the input and output.
        n_head: number of attention heads; must divide ``n_embd``.
        dropout: dropout probability applied to the attention weights
            (training only) and to the output projection. Defaults to 0.1,
            the value that was previously hard-coded.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()
        # Fail at construction time instead of with an opaque `view` error
        # during the first forward pass.
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        # Single projection producing queries, keys and values at once.
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=True)
        self.c_proj = nn.Linear(n_embd, n_embd, bias=True)
        self.drop = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout

    def forward(self, x):
        """Apply causal attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim) for each of q, k, v.
        q, k, v = (
            t.view(B, T, self.n_head, head_dim).transpose(1, 2)
            for t in (q, k, v)
        )

        # Attention dropout is only active in training mode.
        out = torch.nn.functional.scaled_dot_product_attention(
            q, k, v,
            attn_mask=None,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True,
        )
        # Merge the heads back into a single (B, T, C) tensor.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.drop(self.c_proj(out))
        return out

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, dropout."""

    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the MLP on ``x``; the last dimension is preserved."""
        projected = self.net(x)
        return projected

class DecoderBlock(nn.Module):
    """One decoder layer: self-attention then feed-forward, each residual.

    Layer normalization is applied to the input of each sub-layer and the
    sub-layer output is added back onto the un-normalized stream.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = CausalSelfAttention(n_embd, n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class PositionalEncoding(torch.nn.Module):
    """Adds fixed sine/cosine position signals to embeddings, then dropout.

    Args:
        d_model: embedding width (even or odd).
        max_seq_len: number of positions pre-computed in the table; defaults
            to ``config['context_len']`` as evaluated when the class is defined.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_seq_len= config['context_len'], dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        # Even columns get the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequencies to fit (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading batch dimension so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)
        # Buffer: follows .to(device) and is saved in the state dict, but is
        # not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` position rows to ``x`` of shape (B, T, d_model)."""
        x = x + self.pe[:, :x.size(1), :]
        x = self.dropout(x)
        return x

class Decoder(nn.Module):
    """Decoder-only language model built from the global ``config``.

    Pipeline: token embedding -> positional encoding -> ``n_layer`` decoder
    blocks -> final layer norm -> linear head producing vocabulary logits.

    Args:
        n_layer: number of stacked ``DecoderBlock`` layers.
    """

    def __init__(self, n_layer=5):
        super().__init__()
        self.token_embedding_table = nn.Embedding(config['vocab_size'], config['d_model'])
        self.positional_encoding = PositionalEncoding(config['d_model'])
        self.blocks = nn.Sequential(*[DecoderBlock(n_embd=config['d_model'], n_head=config['num_heads']) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(config['d_model'])
        self.lm_head = nn.Linear(config['d_model'], config['vocab_size'])

    def forward(self, toks, targets=None):
        """Return logits of shape (B, T, vocab_size) for token ids ``toks`` (B, T).

        ``targets`` is accepted for call-site compatibility but is not used;
        the loss is computed by the caller.
        """
        # Unpacking doubles as a check that the input is exactly 2-D.
        B, T = toks.shape
        tok_emb = self.token_embedding_table(toks)
        x = self.positional_encoding(tok_emb)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

    @torch.no_grad()
    def generate(self, tok, max_out):
        """Sample ``max_out`` new tokens after the prompt ``tok`` (B, T).

        Returns only the newly generated tokens, shape (B, max_out). Runs
        without gradient tracking so memory does not grow with each step.
        The module's train/eval mode is left as set by the caller.
        """
        prompt_len = tok.size(1)
        for _ in range(max_out):
            # Keep only the most recent `context_len` tokens along the
            # sequence dimension; the positional table has no rows beyond it.
            tok_cond = tok[:, -config['context_len']:]
            logits = self.forward(tok_cond)
            # Only the distribution at the last position is needed.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            tok_next = torch.multinomial(probs, num_samples=1)
            tok = torch.cat((tok, tok_next), dim=1)
        # Slice by prompt length so max_out == 0 yields an empty result
        # rather than the whole prompt.
        return tok[:, prompt_len:]


In [None]:
# Build the 5-layer decoder and move it to the selected device.
model = Decoder(n_layer=5)
model = model.to(DEVICE)

print(model)

# Dummy batch of token ids (32 sequences of 512 tokens) used only to trace
# the model for the summary table.
x = torch.randint(0, 100, size=(32, 512))

torchsummaryX.summary(model, x.to(DEVICE))

# Loss Function, Optimizers, Scheduler

In [None]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=3e-4,
    weight_decay=5e-3,
)

# NOTE(review): ignore_index=0 drops every target equal to token id 0 from
# the loss. The datasets above are fixed windows with no visible padding, so
# confirm that id 0 is really meant to be ignored for this tokenizer.
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

# Gradient scaler for mixed-precision training.
scaler = torch.cuda.amp.GradScaler()

# The scheduler state is stored in checkpoints by train(); note that train()
# itself never calls scheduler.step().
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=3,
)

# Train function

In [None]:
def calculate_loss(logits, targets, criterion):
    """Flatten per-token logits/targets and apply ``criterion``.

    Args:
        logits: tensor of shape (B, T, C).
        targets: tensor holding B*T class indices (typically shape (B, T)).
        criterion: callable taking ``(B*T, C)`` logits and ``(B*T,)`` targets.

    Returns:
        Whatever ``criterion`` returns for the flattened inputs.
    """
    B, T, C = logits.shape
    # reshape (not view) so non-contiguous inputs, e.g. transposed tensors,
    # are handled instead of raising.
    flat_logits = logits.reshape(B * T, C)
    flat_targets = targets.reshape(B * T)

    return criterion(flat_logits, flat_targets)

def _log_and_checkpoint(model, optimizer, mean_loss, epoch):
    """Report a window's mean training loss and save/upload a checkpoint.

    Uses the notebook-level ``scheduler``, ``wandb`` and ``run`` objects.
    """
    lr = optimizer.param_groups[0]['lr']
    print('[TRAIN] \tEpoch %d \tLoss: %.4f \tLr: %.6f'
              % (epoch, mean_loss, lr))

    wandb.log({
        'train_loss': mean_loss,
        'lr'        : lr
    })

    print("Saving model")
    torch.save(
        {'model_state_dict'         : model.state_dict(),
        'optimizer_state_dict'     : optimizer.state_dict(),
        'scheduler_state_dict'     : scheduler.state_dict(),
        'window_loss'                  : mean_loss,
        'epoch'                    : epoch},
        "./model.pth"
    )
    wandb.save("./model.pth")
    print("Saved best model")
    print("Saving artifact...")
    artifact = wandb.Artifact('model', type='model')
    artifact.add_file("./model.pth")
    run.log_artifact(artifact)


def train(model, dataloader, criterion, optimizer):
    """Run one pass over ``dataloader`` with mixed precision and grad clipping.

    Every 4000 batches (a "window") the mean loss of that window is logged
    and a checkpoint is saved; the trailing partial window is flushed the same
    way when the loader is exhausted. The ``epoch`` value stored in
    checkpoints counts completed windows, not passes over the dataset.

    Relies on the notebook-level ``DEVICE``, ``scaler``, ``scheduler``,
    ``wandb``, ``run`` and ``blue_tqdm``. The learning-rate scheduler is not
    stepped here. Returns None.
    """
    # Debug handles to the most recent batch, kept for notebook inspection.
    global x_g, y_g

    model.train()

    batch_bar = blue_tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    window = 4000
    window_loss = 0.0
    seen = 0      # batches accumulated in the current window
    epoch = 0     # number of completed windows

    for x, y in dataloader:
        # A full window has been accumulated: report it before continuing.
        # (Nothing is logged or saved before the first batch has trained.)
        if seen == window:
            epoch += 1
            _log_and_checkpoint(model, optimizer, window_loss / window, epoch)
            window_loss = 0.0
            seen = 0

        optimizer.zero_grad()

        x, y = x.to(DEVICE), y.to(DEVICE)
        x_g, y_g = x, y

        with torch.cuda.amp.autocast():
            raw_predictions = model(x, y)
            loss = calculate_loss(raw_predictions, y, criterion)

        window_loss += loss.item()
        seen += 1

        # Backward on the scaled loss.
        scaler.scale(loss).backward()

        # Unscale first so clipping operates on the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        scaler.step(optimizer)
        scaler.update()

        batch_bar.set_postfix(
            loss="{:.04f}".format(window_loss / seen),
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update()

        del x, y
        torch.cuda.empty_cache()

    # Flush the last (possibly partial) window so its loss and weights are
    # not lost when the loader length is not a multiple of the window size.
    if seen:
        epoch += 1
        _log_and_checkpoint(model, optimizer, window_loss / seen, epoch)

    batch_bar.close()



# Wandb


In [None]:
# Login to Wandb
# The API key is read from the WANDB_API_KEY environment variable so it never
# has to be pasted into (and accidentally committed with) the notebook. When
# the variable is unset, key=None lets wandb fall back to its own login flow.
import os
import wandb
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)
# Initialize your Wandb Run Here
# Save your model architecture in a txt file, and save the file to Wandb

In [None]:
# Start (or restart) the Wandb run used for logging and artifacts.
# To resume an earlier run instead, pass id="<run id>" together with
# resume="must" and drop reinit=True.
run = wandb.init(
    project="hw5",  # the project should already exist in your wandb account
    name="sdf",     # wandb picks a random run name if this is omitted
    reinit=True,    # allows re-initializing the run when this cell is re-run
)

In [None]:
### Save your model architecture as a string with str(model)
model_arch  = str(model)

### Save it in a txt file; the context manager closes the file even if the
### write fails.
with open("model_arch.txt", "w") as arch_file:
    arch_file.write(model_arch)

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

# Training loop

In [None]:


# Run a single training pass over the training loader.
train(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
)


## Download a checkpoint (optional)
Run the cell below to download a checkpoint from WandB, changing the model version as needed.

In [None]:
# Fetch a logged model artifact from Wandb through the active run.
# The artifact name includes the version tag; change it to pick another one.
artifact = run.use_artifact('ojjs/hw5/model:v59', type='model')
# Download the artifact's files and keep the returned location.
artifact_dir = artifact.download()
# Bare expression so the notebook displays the download location.
artifact_dir

Run the cell below to load the associated model that was downloaded above, changing the model number.

In [None]:
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only runtime.
checkpoint = torch.load('/content/artifacts/model:v59/model.pth', map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Restore the weights from the checkpoint written by train(). map_location
# remaps the saved tensors onto the current device, so a checkpoint written
# on a GPU runtime also loads on a CPU-only runtime.
checkpoint  = torch.load("./model.pth", map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

# Generate text

Use the code chunk below, indicating a starting prompt and max_token_len to generate text using your model.

In [None]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")

prompt = "Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \"golden anniversary\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \"Super Bowl L\"), so that the logo could prominently feature the Arabic numerals 50. Question: What color was used to emphasize the 50th anniversary of the Super Bowl? Answer: "
max_token_len = 128

# Tokenize the prompt string defined above into a batch of one sequence.
prompt = torch.tensor([enc.encode(prompt)]).to(DEVICE)
model.eval()
# Sampling needs no gradients; skipping them keeps memory flat.
with torch.no_grad():
    idx = model.generate(prompt, max_token_len)
print(enc.decode(idx[0].tolist()))

# Make predictions for the summarization task
Evaluated on the test set of the cnn_dailymail dataset

In [None]:
from datasets import load_dataset
# load_metric is no longer shipped by recent `datasets` releases and is not
# used in this cell; tolerate its absence instead of failing the whole cell.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None
import sacrebleu

def preprocess_data(examples, max_len=512):
    """Turn articles into left-padded prompts of token ids.

    Each article is tokenized with the global ``enc``, truncated, wrapped as
    ``"Article: " + article + " Summary: "`` and left-padded with the
    end-of-text token up to ``max_len`` tokens.

    Args:
        examples: mapping/dataset whose ``'article'`` entry is an iterable of
            strings.
        max_len: target prompt length in tokens (the model's input length).

    Returns:
        A list with one list of token ids per article.
    """
    # The tags are identical for every article, so tokenize them once.
    prefix = enc.encode_ordinary("Article: ")
    suffix = enc.encode_ordinary(" Summary: ")
    # Room left for the article itself; derived from the tag lengths rather
    # than hard-coded (previously a fixed 506) so prompts cannot exceed
    # max_len.
    budget = max(0, max_len - len(prefix) - len(suffix))
    pad = enc.eot_token

    zamps = []
    for article in examples['article']:
        ids = prefix + enc.encode_ordinary(article)[:budget] + suffix
        zamps.append([pad] * (max_len - len(ids)) + ids)
    return zamps

def generate_summary(batch, model):
    """Generate 50 tokens per prompt in ``batch`` and decode them to text.

    Each generated sequence is cut at its first end-of-text token (when one
    is present) before decoding with the global ``enc``.
    """
    with torch.no_grad():
        outputs = model.generate(batch, 50)

    stop_id = enc.eot_token
    trunc_outs = []
    for row in outputs:
        ids = row.tolist()
        cut = ids.index(stop_id) if stop_id in ids else len(ids)
        trunc_outs.append(ids[:cut])

    return enc.decode_batch(trunc_outs)

dataset = load_dataset('cnn_dailymail', '3.0.0')
data = preprocess_data(dataset['test'])

# Generation should run with dropout disabled.
model.eval()

predictions = []
# Step through the prompts in non-overlapping chunks of 32.
for i in tqdm(range(0, len(data), 32)):
    # preprocess_data returns plain lists; build the id tensor here.
    batch = torch.tensor(data[i:i+32], dtype=torch.long).to(DEVICE)
    summ = generate_summary(batch, model)
    predictions.extend(summ)

# One BLEU score per (prediction, reference highlight) pair, then the mean.
bleu_scores = [sacrebleu.raw_corpus_bleu([pred], [[ref]]) for pred, ref in zip(predictions, dataset['test']['highlights'])]
# Guard against an empty test split so the average never divides by zero.
average_bleu = sum(score.score for score in bleu_scores) / max(len(bleu_scores), 1)
print(f"Average BLEU Score: {average_bleu}")

# Create SQuAD predictions

First, download `dev-v2.0.json` from the SQuAD website, then run the script below. To see the scores, run the evaluation script `eval.py` on the dev set.

In [None]:
import json
# There is no module named `blue_tqdm`; that name is only the alias given to
# tqdm.notebook.tqdm at the top of the notebook. Import the progress bar
# directly so `tqdm(...)` below is callable.
from tqdm.notebook import tqdm
import torch

def generate_answer(context, question):
    """Generate an answer string for a SQuAD-style context/question pair.

    Builds the prompt ``"Context: ... Question: ... Answer: "``, samples up
    to 50 tokens from the global ``model`` and decodes them with the global
    ``enc``, stopping at the first end-of-text token if one was produced.
    """
    input_text = "Context: " + context
    input_text += " Question: " + question
    input_text += " Answer: "
    prompt = torch.tensor([enc.encode(input_text)]).to(DEVICE)

    model.eval()
    with torch.no_grad():
        # Decoder.generate returns only the newly sampled tokens, so the
        # answer starts at index 0 of its output (offsetting by the prompt
        # length would slice past the end and give an empty answer).
        model_out = model.generate(prompt, 50)[0].tolist()

    try:
        answer_end = model_out.index(enc.eot_token)
    except ValueError:
        # No end-of-text token was generated: keep everything.
        answer_end = len(model_out)

    return enc.decode(model_out[:answer_end])

# Load the SQuAD development dataset (explicit UTF-8: the contexts contain
# non-ASCII text and the platform default encoding is not guaranteed).
with open('/content/data/dev-v2.0.json', 'r', encoding='utf-8') as file:
    squad_data = json.load(file)

# Maps each question id to the cleaned generated answer.
answers = {}

# Process each article, paragraph, and question.
for article in tqdm(squad_data['data']):
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            gen_ans = generate_answer(context, qa['question'])
            # Drop any echoed 'Answer:' tag and surrounding whitespace here,
            # in the same pass; keying by qa['id'] directly also avoids
            # shadowing the builtin `id`.
            answers[qa['id']] = gen_ans.replace('Answer:', '').strip()

with open('dev-with-generated-answers.json', 'w', encoding='utf-8') as outfile:
    json.dump(answers, outfile, indent=4)