# GPT2
This notebook performs training and testing of the GPT2 model for TSA on the IMDB movie review dataset

In [1]:
#import libraries
import os
import time
import datetime
import gc
import numpy as np 
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler 
from transformers import (GPT2Tokenizer,
                          AdamW, 
                          GPT2ForSequenceClassification)

In [3]:
def set_seed(seed = 0): # Define seed for reproducability
    '''
    set random seed
    '''
    # random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)
device = 'cuda'

## Utility functions

In [4]:
# process data into train test sets of tokens
def read_dataset(df):
    X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, shuffle=True) # train test split of 0.3
    
    # map labels to integers
    polarity_class = {"negative":0, "positive":1} # binary classes
    y_train = y_train.apply(lambda x: polarity_class[x])
    y_test = y_test.apply(lambda x: polarity_class[x])
    
    return X_train.tolist(), y_train, X_test.tolist(), y_test

# series to list
def pre_process_dataset(values):
    new_values = list()
    
    for value in values:
        new_values.append(value)
    return new_values

# tokenize data to input_ids, attention_masks
def data_process(data, labels):
    input_ids = []
    attention_masks = []
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    for sentence in data:
        inp = tokenizer(sentence, max_length = 150, padding='max_length', truncation=True, return_token_type_ids=False)

        input_ids.append(inp['input_ids'])
        attention_masks.append(inp['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels


def load_and_process(df):
    train_data, train_labels, test_data, test_labels = read_dataset(df)

    train_input_ids, train_attention_masks, train_labels = data_process(pre_process_dataset(train_data), train_labels)
    test_input_ids, test_attention_masks, test_labels = data_process(pre_process_dataset(test_data), test_labels)

    return train_input_ids, train_attention_masks, train_labels,\
           test_input_ids, test_attention_masks, test_labels

def load_data(df):
    # load dataset
    train_input_ids, train_attention_masks, train_labels,\
    test_input_ids, test_attention_masks, test_labels = load_and_process(df)

    train_df = pd.DataFrame(list(zip(train_input_ids, train_attention_masks)), columns=['input_ids', 'attention_masks'])
    test_df = pd.DataFrame(list(zip(test_input_ids, test_attention_masks)), columns=['input_ids', 'attention_masks'])
    
    # import Model and Tokenizer
    model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=2)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model.config.pad_token_id = model.config.eos_token_id
    # Tokenization 
    # for train set
    train_seq = torch.tensor(train_df['input_ids'].tolist())
    train_mask = torch.tensor(train_df['attention_masks'].tolist())
    train_y = torch.tensor(train_labels.tolist())

    # for test set
    test_seq = torch.tensor(test_df['input_ids'].tolist())
    test_mask = torch.tensor(test_df['attention_masks'].tolist())
    test_y = torch.tensor(test_labels.tolist())

    # Create DataLoaders 
    batch_size = 16 #32 
    train_data = TensorDataset(train_seq, train_mask, train_y) # wrap tensors
    train_sampler = RandomSampler(train_data) # sampler for sampling the data during training
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) # dataLoader for train set

    return model, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask

In [5]:
# Train Model Function
def train(model, batch_size, train_dataloader, optimizer):
    model.train()

    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    total = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):

        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>'  *(filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='') # accuracy={total_accuracy}

        sent_id, mask, labels = batch

        labels = labels.type(torch.LongTensor) # make sure it labels are int64 type

        del batch
        gc.collect()
        torch.cuda.empty_cache()
        
        model.zero_grad() # clear previously calculated gradients
        
        preds = model(sent_id.to(device), token_type_ids=None, attention_mask=mask.to(device), labels=labels.to(device)) # get model predictions for the current batch

        loss = preds[0]
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()

    gc.collect()
    torch.cuda.empty_cache()
    
    avg_loss = total_loss / (len(train_dataloader)*batch_size) # compute the training loss of the epoch

    return avg_loss, total_preds # returns the loss and predictions

## Training

In [6]:
df = pd.read_csv("./IMDB Dataset.csv") # read csv

# get data
gpt2, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask= load_data(df)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  train_seq = torch.tensor(train_df['input_ids'].tolist())


In [6]:
model = gpt2.to(device) # pass the pre-trained GPT2 model to our defined architecture

# define the optimizer
optimizer = AdamW(
    model.parameters(),
    lr = 5e-5, 
    eps = 1e-8
)

# train loop
best_loss = float('inf') # set initial loss to infinite
epochs = 3
epochs = epochs
current = 1
train_loss_list = []
while current <= epochs: # for each epoch

    print(f'\nEpoch {current} / {epochs}:')

    train_loss, _ = train(model, batch_size, train_dataloader, optimizer) # train model
    train_loss_list.append(train_loss)
    
    # save the best model
    if os.path.isfile('gpt2_3.pth') == False:
        torch.save(model.state_dict(), 'gpt2_3.pth')
        best_loss = train_loss_list[current-1]
        
    if len(train_loss_list) > 1:
        if train_loss_list[current-1] < best_loss:
            best_loss = train_loss_list[current-1]
            torch.save(model.state_dict(), 'gpt2_3.pth')
                
    print(f'\n\nTraining Loss: {train_loss:.3f}')

    current = current + 1

gc.collect()
torch.cuda.empty_cache()




Epoch 1 / 3:
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.02

Training Loss: 0.021

Epoch 2 / 3:
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.015

Epoch 3 / 3:
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.010


## Evaluation

In [7]:
gpt_model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=2) # init model
model = gpt_model.to(device)
model.load_state_dict(torch.load("./stored_weights/gpt2_epoch_3_batch_16_token_150.pth")) # load weights
model.config.pad_token_id = model.config.eos_token_id
model.eval()

batch_size = 16  
test_data = TensorDataset(test_seq, test_mask, test_y) # wrap tensors
test_dataloader = DataLoader(test_data, batch_size=batch_size) # dataLoader for train set

total_preds = []
true_labels = []
start = time.time()
for i, batch in enumerate(test_dataloader):
    batch = [r.to(device) for r in batch] # push the batch to gpu
    sent_id, mask, labels = batch
    
    with torch.no_grad():

        outputs = model(sent_id, 
                        token_type_ids=None, 
                        attention_mask=mask)
    
    logits = outputs[0]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    
    total_preds.append(logits)
    true_labels.append(label_ids)

end = time.time()
print(f'Time taken (Evaluation): {end-start}s') # eval time

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time taken (Evaluation): 233.0684745311737s


Classification metrics

In [8]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in total_preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

print(classification_report(flat_predictions,flat_true_labels))

              precision    recall  f1-score   support

           0       0.95      0.82      0.88      8554
           1       0.80      0.94      0.86      6446

    accuracy                           0.87     15000
   macro avg       0.87      0.88      0.87     15000
weighted avg       0.88      0.87      0.87     15000



Trainable parameters

In [4]:
gpt_model = GPT2ForSequenceClassification.from_pretrained('gpt2',num_labels=2)
model = gpt_model.to(device)
model.load_state_dict(torch.load("./stored_weights/gpt2_epoch_3_batch_16_token_150.pth"))
model.config.pad_token_id = model.config.eos_token_id
model.eval()

with torch.no_grad():
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Trainable Parameters (GPT2): {pytorch_total_params}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Trainable Parameters (GPT2): 124441344
