In [1]:
import json
from configuration_backpack_gpt2 import BackpackGPT2Config
from modeling_backpack_gpt2 import BackpackGPT2LMHeadModel
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling
from transformers import AutoConfig, AutoModelForCausalLM
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Custom Dataset Class

In [3]:
class CustomDataset(Dataset):
    
    def __init__(self, dir_path,tokenizer):
        self.dir_path = dir_path
        self.dataset = self._load_sentences()
        self.tokenizer = tokenizer

    def _load_sentences(self):
        dataset = []
        for file in os.listdir(self.dir_path):
            if file.endswith(".parquet"):
                file_path = os.path.join(self.dir_path, file)
                df = pd.read_parquet(file_path)
                sentences = df.values.flatten().tolist()
                non_empty_sentences = [sentence for sentence in sentences if sentence]
                dataset.extend(non_empty_sentences)
                
        return dataset
                
    def __len__(self):
        # Return the number of sentences in the dataset
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get a specific sentence by index
        text = self.dataset[idx]
        input_ = self.tokenizer.bos_token + text
        output_ = text + self.tokenizer.eos_token
        input_tokens = self.tokenizer(input_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        output_tokens = self.tokenizer(output_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        
        return input_tokens.input_ids.squeeze(0), output_tokens.input_ids.squeeze(0)
        

In [186]:
class OpenWebtext(Dataset):
    
    def __init__(self, dir_path,tokenizer):
        self.dir_path = dir_path
        self.files_path=[os.path.join(dir_path, each) for each in os.listdir(dir_path)][:1000]
        self.dataset = self._load_sentences()
        self.tokenizer = tokenizer

    def _load_sentences(self):
        dataset = []
        for file in self.files_path:
            sentences = [line.strip() for line in open(file, 'r') if line.strip()][1:]
        dataset.extend(sentences)
                        
        return dataset
                
    def __len__(self):
        # Return the number of sentences in the dataset
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get a specific sentence by index
        text = self.dataset[idx]
        input_ = self.tokenizer.bos_token + text
        output_ = text + self.tokenizer.eos_token
        input_tokens = self.tokenizer(input_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        output_tokens = self.tokenizer(output_, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        
        return input_tokens.input_ids.squeeze(0), output_tokens.input_ids.squeeze(0)
        

# Perplexity function

In [201]:
def compute_perplexity(model,dataloader,tokenizer,total_tokens):
    model.eval()
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss(reduction='mean', ignore_index=tokenizer.pad_token_id)
    total_loss = 0
    with torch.no_grad():
        for inputs, outputs in tqdm(dataloader):
            inputs = inputs.to(device)
            outputs = outputs.to(device)
            model_outputs = model(inputs)
            output_logits = model_outputs['logits']
            print(output_logits.view(-1, output_logits.size(-1)))
            loss = criterion(output_logits.view(-1, output_logits.size(-1)), outputs.view(-1))
            total_loss += loss.item()
            # print(loss.item())
            
    perplexity = np.exp(total_loss / total_tokens)
    return perplexity

# Defining Model

In [4]:
model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token = '<pad>')
model.eval()

  return self.fget.__get__(instance, owner)()


BackpackGPT2LMHeadModel(
  (backpack): BackpackGPT2Model(
    (gpt2_model): GPT2Model(
      (wte): Embedding(50264, 768)
      (wpe): Embedding(512, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (sense_network): BackpackSenseNetwork(
      (dropout): Dropout(p=0.1, inplace=False)
      

# Wikitext 103

In [6]:
wiki_text_103_dir_path = "/home/piyush/srinath/NLP/Project/NLP/dataset/Wikitext103/temp"
wiki_text_103_dataset = CustomDataset(wiki_text_103_dir_path, tokenizer)
len(wiki_text_103_dataset) 

2891

In [8]:
total_tokens = sum(len(tokenizer(wiki_text_103_dataset.dataset[i]).input_ids) + 2 for i in range(len(wiki_text_103_dataset)))
total_tokens

289069

In [9]:
dataloader = DataLoader(wiki_text_103_dataset, batch_size=64, shuffle=False)

In [None]:
wiki_text_103_perplexity = compute_perplexity(model, dataloader, tokenizer, total_tokens)

In [12]:
wiki_text_103_perplexity

61.80653746288182

# Lambada OpenAI

In [169]:
path = "/home/piyush/srinath/NLP/Project/NLP/dataset/lambada_openai"
lambada_dataset = CustomDataset(path, tokenizer)
len(lambada_dataset)

5153

In [170]:
total_tokens = sum(len(tokenizer(lambada_dataset.dataset[i]).input_ids) + 2 for i in range(len(lambada_dataset)))
total_tokens

440536

In [171]:
dataloader = DataLoader(lambada_dataset, batch_size=64, shuffle=False)

In [None]:
lambada_perplexity = compute_perplexity(model, dataloader, tokenizer, total_tokens)

In [173]:
lambada_perplexity

39.820683414843465