In [2]:
import json
# from configuration_backpack_gpt2 import BackpackGPT2Config
# from modeling_backpack_gpt2 import BackpackGPT2LMHeadModel
import os
import numpy as np
import pandas as pd
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel
from transformers import AutoConfig, AutoModelForCausalLM
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# !wget https://huggingface.co/datasets/imdb/resolve/main/plain_text/test-00000-of-00001.parquet?download=true
!wget https://huggingface.co/datasets/imdb/resolve/main/plain_text/test-00000-of-00001.parquet?download=true

--2024-03-13 22:17:59--  https://huggingface.co/datasets/imdb/resolve/main/plain_text/test-00000-of-00001.parquet?download=true
Resolving huggingface.co (huggingface.co)... 18.172.78.100, 18.172.78.5, 18.172.78.84, ...
Connecting to huggingface.co (huggingface.co)|18.172.78.100|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/imdb/b52e26e2f872d282ffac460bf9770b25ac6f102cda0e6ca7158df98c94e8b3da?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27test-00000-of-00001.parquet%3B+filename%3D%22test-00000-of-00001.parquet%22%3B&Expires=1710608771&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDYwODc3MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9pbWRiL2I1MmUyNmUyZjg3MmQyODJmZmFjNDYwYmY5NzcwYjI1YWM2ZjEwMmNkYTBlNmNhNzE1OGRmOThjOTRlOGIzZGE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=egkKRcfxG7URo46VDqPFVtN0Ab6Kvhm8RuEl-OVk

In [5]:
!wget https://huggingface.co/datasets/imdb/resolve/main/plain_text/train-00000-of-00001.parquet?download=true

--2024-03-13 22:18:03--  https://huggingface.co/datasets/imdb/resolve/main/plain_text/train-00000-of-00001.parquet?download=true
Resolving huggingface.co (huggingface.co)... 108.159.15.97, 108.159.15.3, 108.159.15.21, ...
Connecting to huggingface.co (huggingface.co)|108.159.15.97|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/imdb/db47d16b5c297cc0dd625e519c81319c24c9149e70e8496de5475f6fa928342c?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27train-00000-of-00001.parquet%3B+filename%3D%22train-00000-of-00001.parquet%22%3B&Expires=1710608775&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMDYwODc3NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9pbWRiL2RiNDdkMTZiNWMyOTdjYzBkZDYyNWU1MTljODEzMTljMjRjOTE0OWU3MGU4NDk2ZGU1NDc1ZjZmYTkyODM0MmM%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=eQ9uHw-ki2PPuuZC8e5LI9n%7E4F700Bu3L

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# IMDB Dataset

In [4]:
class IMDB_DATASET(Dataset):
    def __init__(self, train, tokenizer, max_length = 512):
        if train:
            self.path = '/data1/Code/shivam/NLP/train-00000-of-00001.parquet'
        else:
            self.path = '/data1/Code/shivam/NLP/test-00000-of-00001.parquet'
        self.X, self.y = self._load()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _load(self):
        df = pd.read_parquet(self.path)
        X = df['text'].values.flatten().tolist()
        y = df['label'].values.flatten().tolist()
        return X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = self.tokenizer.bos_token + ' ' + self.X[idx]
        label = self.y[idx]
        inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        label = torch.tensor(label, dtype=torch.long)
        text_len = inputs.attention_mask.sum().item()
        return inputs.input_ids.squeeze(0), label, text_len


In [4]:
class IMDB_DATASET_GPT2(Dataset):
    def __init__(self, train, tokenizer, max_length = 512):
        if train:
            self.path = '/content/train-00000-of-00001.parquet'
        else:
            self.path = '/content/test-00000-of-00001.parquet'
        self.X, self.y = self._load()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _load(self):
        df = pd.read_parquet(self.path)
        X = df['text'].values.flatten().tolist()
        y = df['label'].values.flatten().tolist()
        return X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = self.X[idx]
        label = self.y[idx]
        inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, truncation=True)
        label = torch.tensor(label, dtype=torch.long)
        return inputs.input_ids.squeeze(0), label


# Extracting embeddings of Backpack Model

In [6]:
def extract_features(model,dataloader):
    model.eval()
    model.to(device)
    embeddings = []
    labels = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            input_ids = batch[0].to(device)
            label = batch[1]
            text_len = batch[2]
            batch_size = input_ids.shape[0]
            output = model.backpack(input_ids,None).hidden_states
            for j in range(batch_size):
                output_emb = output[j,1:int(text_len[j]) + 1,:]
                embedding = output_emb.mean(dim=0)
                embeddings.append(embedding.cpu().numpy())
            labels.append(label.numpy())

    return np.array(embeddings), np.concatenate(labels,axis = 0)


# Extracting features from GPT2 Model

In [7]:
def extract_features2(model,dataloader):
    model.eval()
    model.to(device)
    embeddings = []
    labels = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            input_ids = batch[0].to(device)
            label = batch[1]
            text_len = batch[2]
            batch_size = input_ids.shape[0]
            output = model.transformer(input_ids)[0]
            for j in range(batch_size):
                output_emb = output[j,1:int(text_len[j]) + 1,:]
                embedding = output_emb.mean(dim=0)
                embeddings.append(embedding.cpu().numpy())
            labels.append(label.numpy())

    return np.array(embeddings), np.concatenate(labels,axis = 0)


# Evaluating Sentiment Analysis on Backpack Models

In [8]:
model_id = "stanfordnlp/backpack-gpt2"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, config=config, trust_remote_code=True)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token = '<pad>')
# model.eval()

In [9]:
imdb_dataset_train = IMDB_DATASET(train=True, tokenizer=GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token = '<pad>'))
imdb_dataset_test = IMDB_DATASET(train=False, tokenizer=GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token = '<pad>'))

In [10]:
train_dataloader = DataLoader(imdb_dataset_train, batch_size=100, shuffle=False, num_workers=2)
test_dataloader = DataLoader(imdb_dataset_test, batch_size=100, shuffle=False, num_workers=2)

In [11]:
X_train, y_train = extract_features(model,train_dataloader)

In [20]:
X_test, y_test = extract_features(model,test_dataloader)

100%|██████████| 250/250 [24:03<00:00,  5.77s/it]


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

In [38]:
log_reg_model = LogisticRegression(C = 0.5, max_iter = 1000)
log_reg_model.fit(X_train, y_train)

In [43]:
y_pred = log_reg_model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print("Train Accuracy:", accuracy)

f1 = f1_score(y_train, y_pred)
print("Train F1 Score:", f1)

Train Accuracy: 0.90376
Train F1 Score: 0.9036829463570857


In [42]:
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("Test F1 Score:", f1)

Test Accuracy: 0.885
Test F1 Score: 0.8847649204376928


# Evaluating on GPT2

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer
model_id = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2Tokenizer.from_pretrained(model_id)

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [9]:
imdb_dataset_train = IMDB_DATASET(train=True, tokenizer= tokenizer)
imdb_dataset_test = IMDB_DATASET(train=False, tokenizer=tokenizer)

In [10]:
train_dataloader = DataLoader(imdb_dataset_train, batch_size=50, shuffle=False, num_workers=2)
test_dataloader = DataLoader(imdb_dataset_test, batch_size=50, shuffle=False, num_workers=2)

In [13]:
X_train,y_train = extract_features2(model,train_dataloader)

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [05:18<00:00,  1.57it/s]


In [15]:
X_test,y_test = extract_features2(model,test_dataloader)

100%|██████████| 500/500 [05:19<00:00,  1.57it/s]


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

In [22]:
log_reg_model = LogisticRegression(C = 1, max_iter = 5000)
log_reg_model.fit(X_train, y_train)

In [23]:
y_pred = log_reg_model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print("Train Accuracy:", accuracy)

f1 = f1_score(y_train, y_pred)
print("Train F1 Score:", f1)

Train Accuracy: 0.90752
Train F1 Score: 0.907468182182022


In [24]:
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("Test F1 Score:", f1)

Test Accuracy: 0.8954
Test F1 Score: 0.8950262936052347
