# ML Task

Hugging Face conveniently includes all the functionality needed to use GPT2 in classification tasks.

**Main idea**: Since GPT2 is a decoder transformer, the last token of the input sequence is used to make predictions about the next token that should follow the input. This means that the last token of the input sequence contains all the information needed for the prediction. With this in mind, we can use that information to make a prediction in a classification task instead of a generation task. Note that the code below loads `GPT2LMHeadModel` and fine-tunes it with a language-modelling loss, i.e. it trains GPT2 for text generation rather than classification.

### Install required Libraries


In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 8.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 49.2 MB/s 
[K     |████████████████████████████████| 596 kB 73.3 MB/s 
[K     |████████████████████████████████| 101 kB 12.1 MB/s 
[?25h

### Import important libraries

In [2]:
import os
import math
import time
import random

import numpy as np
import pandas as pd

import warnings

import torch

from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler

from transformers import (GPT2Tokenizer, 
                          GPT2LMHeadModel,
                          AdamW, 
                          get_linear_schedule_with_warmup)

### Configuration class

In [28]:
class Config:
    """Static settings for the run; read as class attributes, never instantiated here."""

    # --- model / checkpoint naming ---
    MODEL_NAME = "gpt2"
    save_file_name = "model_weights_gpt2"
    load_weight_path = ""

    # --- optimisation ---
    epochs = 10
    lr = 0.001
    batch_size = 256
    patience = 5

    # --- data loading ---
    num_workers = 2

    # --- hardware: use CUDA when PyTorch reports it as available ---
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Download Model

In [4]:
def get_model():
    """Build the tokenizer/model pair named by ``Config.MODEL_NAME``.

    The tokenizer is given a dedicated ``<pad>`` token, and the model's token
    embeddings are resized to the tokenizer's new length.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = GPT2Tokenizer.from_pretrained(Config.MODEL_NAME)
    # Register a padding token on the tokenizer.
    tok.add_special_tokens({'pad_token': '<pad>'})

    lm = GPT2LMHeadModel.from_pretrained(Config.MODEL_NAME)
    # Match the embedding table to the tokenizer length after adding the token.
    lm.resize_token_embeddings(len(tok))

    return lm, tok

### DataLoader

In [5]:
class SonnetDataset(Dataset):
    """Text samples tokenized to a fixed length for language-model fine-tuning.

    Each file in ``sonnet_files`` contributes one sample, wrapped as
    ``"Quote: <text> <|endoftext|>"``.

    Args:
        sonnet_files: Iterable of paths to text files.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sample is padded or truncated to.
    """

    # Label value used to exclude positions from the loss (the default
    # ``ignore_index`` of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, sonnet_files, tokenizer, max_length=256):
        self.sonnet_files = sonnet_files
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.eos_tok = "<|endoftext|>"
        self.sonnets = self.load_sonnets()

    def __getitem__(self, idx):
        """Return ``ids``, ``mask`` and ``target`` tensors for sample ``idx``."""
        encoded = self.tokenizer.encode_plus(
            self.sonnets[idx],
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        ids = torch.tensor(encoded["input_ids"], dtype=torch.long)
        mask = torch.tensor(encoded["attention_mask"], dtype=torch.long)

        # Padded positions must not contribute to the loss; otherwise the model
        # is mostly trained to predict the pad token on short samples.
        target = ids.masked_fill(mask == 0, self.IGNORE_INDEX)

        return {"ids": ids, "mask": mask, "target": target}

    def load_sonnets(self):
        """Read one sample per file and wrap it in the training template."""
        sonnets = []
        for sonnet_file in self.sonnet_files:
            # NOTE(review): only the FIRST line of each file is used, so a
            # multi-line file yields a single sample - confirm this is intended.
            with open(sonnet_file, "r") as handle:  # close the handle promptly
                first_line = handle.readline()
            sonnets.append(f"Quote: {first_line} {self.eos_tok}")
        return sonnets

    def __len__(self):
        return len(self.sonnets)

### Training Loop

In [29]:
class AverageMeter(object):
    """Running weighted mean that also keeps the most recent value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the weighted sum and the weight total."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and recompute the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

class Engine:
    """Train/validate loop with checkpointing and patience-based early stopping.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose first output is the loss.
        device: Device that batches are moved to before the forward pass.
        config: Object providing ``batch_size``, ``epochs``, ``lr`` and
            ``patience``.
        save_file_name: Checkpoint file name without the ``.pt`` extension.
        weight_path: Directory the checkpoint is written to.
    """

    def __init__(self, model, device, config, save_file_name = 'model_weights', weight_path='./'):
        self.train_loss = dict()   # epoch -> mean training loss
        self.valid_loss = dict()   # epoch -> mean validation loss
        self.model = model
        self.device = device
        self.config = config
        self.best_score = 0        # 0 means "no checkpoint saved yet"
        self.best_loss = 5000
        self.save_file_name = save_file_name
        self.weight_path = weight_path

    def fit(self, train_loader, valid_loader):
        """Train for ``config.epochs`` epochs, saving the best-validation model.

        Stops early once validation loss has failed to improve for
        ``config.patience`` epochs since the last improvement.
        """
        # len(train_loader) is already the number of batches per epoch;
        # dividing by batch_size again made the schedule hit LR 0 far too early.
        num_train_steps = len(train_loader) * self.config.epochs
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.config.lr)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
        )

        # Bind before the loop so it is defined even when best_score is
        # already set (e.g. fit() called a second time on the same engine).
        patience = self.config.patience

        for epoch in range(self.config.epochs):
            t = time.time()
            print("Training Started...")

            summary_loss = self.train_one_epoch(train_loader)
            self.train_loss[epoch] = summary_loss.avg

            print(f'Train : Epoch {epoch}: | Summary Loss: {summary_loss.avg} | Training time: {time.time() - t}')

            t = time.time()
            print("Validation Started...")

            summary_loss = self.validation(valid_loader)
            self.valid_loss[epoch] = summary_loss.avg

            print(f'Valid : Epoch {epoch}: | Summary Loss: {summary_loss.avg} | Validation time: {time.time() - t}')

            if not self.best_score:
                # First validated epoch: it is the best seen so far by definition.
                self.best_score = summary_loss.avg
                print(f'Saving model with lowest validation loss as {self.best_score}')
                patience = self.config.patience
                self._save_checkpoint(epoch)
                continue

            if summary_loss.avg <= self.best_score:
                self.best_score = summary_loss.avg
                patience = self.config.patience
                print('Improved model with lowest validation loss as {}'.format(self.best_score))
                self._save_checkpoint(epoch)
            else:
                patience -= 1
                print('Patience Reduced')
                # `<=` so a configured patience of 0 still terminates.
                if patience <= 0:
                    print(f'Early stopping. Lowest validation loss achieved: {self.best_score}')
                    break

    def train_one_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``; return its loss meter."""
        self.model.train()
        summary_loss = AverageMeter()

        for data in tqdm(train_loader):
            self.optimizer.zero_grad()
            loss, n_samples = self._forward_batch(data)
            loss.backward()

            self.optimizer.step()
            self.scheduler.step()

            # Weight by the real batch size; the last batch may be smaller.
            summary_loss.update(loss.detach().item(), n_samples)

        return summary_loss

    def validation(self, valid_loader):
        """Evaluate on ``valid_loader`` without gradients; return its loss meter."""
        self.model.eval()
        summary_loss = AverageMeter()

        with torch.no_grad():
            for data in tqdm(valid_loader):
                loss, n_samples = self._forward_batch(data)
                summary_loss.update(loss.detach().item(), n_samples)

        return summary_loss

    def _forward_batch(self, data):
        """Move one batch to the device and return ``(loss, batch_size)``."""
        ids = data["ids"].to(self.device, dtype=torch.long)
        mask = data["mask"].to(self.device, dtype=torch.long)
        labels = data["target"].to(self.device, dtype=torch.long)

        outputs = self.model(input_ids=ids, attention_mask=mask, labels=labels)
        return outputs[0], ids.size(0)

    def _save_checkpoint(self, epoch):
        """Write the model weights, best score and epoch to the checkpoint file."""
        # os.path.join keeps an empty weight_path relative instead of producing
        # "/<name>.pt" at the filesystem root.
        path = os.path.join(self.weight_path, f"{self.save_file_name}.pt")
        torch.save(
            {
                'model_state_dict': self.model.state_dict(),
                'best_score': self.best_score,
                'epoch': epoch,
            },
            path,
        )

### Evaluation Loop

In [30]:
def perform_run(data_path,config,model,tokenizer,weight_path='./',load_weights_path=None):
    """Build the data loaders, optionally restore weights, and train the model.

    Args:
        data_path: Path of the text file passed to ``SonnetDataset``.
        config: Settings object (``batch_size``, ``num_workers``, ``device``,
            ``save_file_name`` and whatever ``Engine`` reads).
        model: Model to train; moved to ``config.device``.
        tokenizer: Tokenizer handed to ``SonnetDataset``.
        weight_path: Directory where ``Engine`` writes checkpoints.
        load_weights_path: Directory containing ``<save_file_name>.pt`` to
            restore before training, or ``None`` to start from ``model`` as is.
    """
    sonnet_files = [data_path]
    datasett = SonnetDataset(sonnet_files, tokenizer)

    # Shuffle once, then hold out the first 30% of indices for validation.
    indices = list(range(len(datasett)))
    random.shuffle(indices)
    split = math.floor(0.3 * len(datasett))
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(datasett, batch_size=config.batch_size,
                              sampler=train_sampler, num_workers=config.num_workers)

    # Validation must use the held-out indices; it previously reused
    # train_sampler, so validation loss was measured on the training data.
    val_loader = DataLoader(datasett, batch_size=config.batch_size,
                            sampler=val_sampler, num_workers=config.num_workers)

    if load_weights_path is not None:
        # os.path.join works with or without a trailing separator.
        checkpoint_file = os.path.join(load_weights_path, f"{config.save_file_name}.pt")
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        checkpoint = torch.load(checkpoint_file, map_location=config.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        print("Weight loaded")

    engine = Engine(model=model.to(config.device), device=config.device,
                    config=config, save_file_name=config.save_file_name,
                    weight_path=weight_path)

    engine.fit(train_loader, val_loader)
    
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not just the current one
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms and can pick different
    # ones from run to run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False
    

### Train Model

In [None]:
seed_everything(42)
model, tokenizer = get_model()
# Pass the resume path by keyword: as the 5th positional argument it was bound
# to `weight_path`, so the empty default sent checkpoints to "/<name>.pt".
# An empty string is mapped to None, i.e. "do not load any weights".
perform_run(
    'Sonnets.txt',
    Config,
    model,
    tokenizer,
    load_weights_path=Config.load_weight_path or None,
)

In [None]:
# Sample from the model trained in the previous cell. Calling get_model() again
# here would replace it with the untouched pretrained weights, and leave the
# model off Config.device while the input tensor is moved onto it.
model.to(Config.device)
model.eval()

prompt = "To eat the world's due, by the grave and thee."
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(Config.device)

sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=3,
    # Sequences that finish early are padded; use the tokenizer's pad token.
    pad_token_id=tokenizer.pad_token_id,
)
for i, sample_output in enumerate(sample_outputs):
    print(f"{i}:\n\n{tokenizer.decode(sample_output, skip_special_tokens=True)}")