# GPT-2 Fine Tuning



In [None]:
# Mount Google Drive into the Colab runtime; the dataset pickle, model
# checkpoint and embedding files used below are all read from / written to
# paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 9.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 84.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 88.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 88.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
import os
import time
import datetime
from tqdm import tqdm
from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt
% matplotlib inline

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Show which GPU (if any) is attached to this runtime and how much memory is free.
!nvidia-smi

Sat Apr 30 15:12:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Load the pre-processed documents from Google Drive.
# NOTE(review): pd.read_pickle executes arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
# Alternative input (a different pickle in the parent data folder), currently disabled:
#documents_with_tokens = pd.read_pickle("./drive/MyDrive/Colab Notebooks/CE/data/documents_with_tokens.pickle")
documents_with_tokens_split = pd.read_pickle("./drive/MyDrive/Colab Notebooks/CE/data/hasDOI/documents_with_tokens_split_hep_th.pickle")

### Load Data and Create Dataset

In [None]:
# Select the "abstract" column; these texts are what the model is fine-tuned
# on and later embedded. The bare name on the last line displays it.
abstracts = documents_with_tokens_split["abstract"]
abstracts

In [None]:
# Load the pretrained GPT-2 tokenizer and register explicit begin-of-sequence,
# end-of-sequence and padding tokens. The model's embedding matrix must be
# resized to len(tokenizer) afterwards (done when the model is created below).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Report the tokenizer's maximum sequence length.
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))

# Report each special token together with its vocabulary id.
special_token_reports = (
    ("The beginning of sequence token {} token has the id {}", tokenizer.bos_token_id),
    ("The end of sequence token {} has the id {}", tokenizer.eos_token_id),
    ("The padding token {} has the id {}", tokenizer.pad_token_id),
)
for report_template, token_id in special_token_reports:
    print(report_template.format(tokenizer.convert_ids_to_tokens(token_id), token_id))

The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [None]:
class GPT2Dataset(Dataset):
  """Dataset of texts tokenized up front for GPT-2 fine-tuning.

  Every text is wrapped in '<|startoftext|>' ... '<|endoftext|>', then
  tokenized with truncation and padding to ``max_length``. Each item is an
  ``(input_ids, attention_mask)`` pair of 1-D tensors.

  Args:
    txt_list: iterable of strings to encode.
    tokenizer: callable returning a mapping with 'input_ids' and
      'attention_mask' entries (e.g. a GPT2Tokenizer).
    gpt2_type: accepted for interface compatibility; not used.
    max_length: length every encoded sequence is truncated/padded to.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for raw_text in txt_list:
      wrapped_text = '<|startoftext|>' + raw_text + '<|endoftext|>'
      encoded = tokenizer(
          wrapped_text,
          truncation=True,
          max_length=max_length,
          padding="max_length",
      )
      ids_tensor = torch.tensor(encoded['input_ids'])
      mask_tensor = torch.tensor(encoded['attention_mask'])
      self.input_ids.append(ids_tensor)
      self.attn_masks.append(mask_tensor)

  def __len__(self):
    """Number of encoded texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input_ids, attention_mask) tensors for item ``idx``."""
    ids_tensor = self.input_ids[idx]
    mask_tensor = self.attn_masks[idx]
    return ids_tensor, mask_tensor

In [None]:
# Tokenize every abstract once, up front.
dataset = GPT2Dataset(abstracts, tokenizer, max_length=768)

# 90% of the examples go to training, the remainder to validation.
num_examples = len(dataset)
train_size = int(0.9 * num_examples)
val_size = num_examples - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

for split_size, split_message in (
    (train_size, '{:>5,} training samples'),
    (val_size, '{:>5,} validation samples'),
):
    print(split_message.format(split_size))

103,912 training samples
11,546 validation samples


In [None]:
batch_size = 2

# Training batches are drawn in random order ...
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# ... while validation batches are read in dataset order.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=validation_sampler)

### Fine Tuning

In [None]:
# Start from the pretrained GPT-2 (small) configuration and weights.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# crashing on the hard-coded "cuda" device. The model is moved with the same
# `device` object that the batches are moved to later on.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
# Training hyper-parameters.
epochs = 10
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
sample_every = 100  # print a generated sample every N training batches

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 is passed explicitly because that was the default of the
# transformers optimizer, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = learning_rate,
                              eps = epsilon,
                              weight_decay = 0.0
                             )

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def mask_padding_labels(input_ids, attention_mask):
    """Build language-modelling labels that skip padded positions.

    Returns a copy of ``input_ids`` in which every position whose attention
    mask is 0 is replaced by -100, the label value the model's loss ignores.
    Without this the loss is also computed on the padding positions, so the
    padding that fills out each fixed-length sequence counts towards the
    training signal and the reported losses.
    """
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return labels


total_t0 = time.time()

# One dict of summary statistics is appended per epoch.
training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Labels are the inputs themselves, minus the padded positions.
        # NOTE: losses are therefore not comparable with runs that included
        # padding in the loss.
        b_labels = mask_padding_labels(b_input_ids, b_masks)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        # With labels supplied, the first output is the language-modelling loss.
        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            # Switch to eval mode for sampling, then back to train mode below.
            model.eval()

            # No prompt is given: generation starts from a randomly chosen
            # token id passed as the beginning-of-sequence token.
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        loss.backward()

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding-aware labels as in training so both losses are comparable.
        b_labels = mask_padding_labels(b_input_ids, b_masks)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
                             attention_mask = b_masks,
                             labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

# Persist the fine-tuned weights to Google Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/CE/models/GPT-2/model_hep-th.pt")

In [None]:

# Write the model weights to Google Drive. This targets the same file as the
# save at the end of the training cell, so running it simply rewrites that
# checkpoint with the model's current weights.
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/CE/models/GPT-2/model_hep-th.pt")

### Extract Embeddings

In [None]:
# Display the module tree to see which layers' outputs are available for
# building document embeddings.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
# Feed the entire list of abstracts (train and validation together) through
# the model one at a time, in dataset order, so that embedding i belongs to
# abstract i.
sequential_sampler = SequentialSampler(dataset)
single_dataloader = DataLoader(dataset, batch_size=1, sampler=sequential_sampler)

This section outlines six methods for extracting a document embedding from GPT-2. One abstract at a time (a batch size of 1) is fed into the network and the hidden states are returned. From those hidden states a document embedding can be derived in any number of ways; I chose six:

* The output of the first hidden layer (the embedding layer) is a tensor of shape (batch size x number of embeddings x length of feature map). In this case that is (1 x 768 x 768).
  1. Take the mean of all embeddings resulting in 1x1x768 length document embedding.
  2. Take the average of each embedding resulting in a 1x768x1 document embedding.
* Take the mean of the last 4 hidden layers.
  1. Take the mean of averaged embeddings resulting in 1x1x768 length document embedding.
  2. Take the average of averaged embedding resulting in a 1x768x1 document embedding.
* Concatenate the last 4 hidden layers, resulting in a (1 x 768 x 3072) tensor.
  1. First perform pooling with kernel size 4, then take the mean of all concatenated embeddings resulting in 1x1x768 length document embedding.
  2. Take the average of each embedding resulting in a 1x768x1 document embedding.


So there are 3 methods for taking or combining hidden layer outputs, and for each of them I use 2 methods for extracting an embedding.

In [None]:
# Pooling layers shared by the extraction variants. AvgPool1d averages over
# windows of the LAST tensor dimension (the feature dimension here).
pool1d_768 = torch.nn.AvgPool1d(768)
pool1d_4 = torch.nn.AvgPool1d(4)
pool1d_4x768 = torch.nn.AvgPool1d(4*768)

# Which variant to compute in this run. One of:
#   "l1pool", "l1mean", "last4avgpool", "last4avgmean", "last4catpool", "last4catmean"
EMBEDDING_METHOD = "l1pool"


def extract_document_embedding(hidden_states, method):
    """Reduce the hidden states of ONE document to a 768-element vector.

    Args:
        hidden_states: tuple of tensors as returned by the model with
            ``output_hidden_states=True``; index 0 is the first entry and the
            last four entries are used by the "last4*" variants. Each tensor
            is expected to be (1 x 768 x 768) as in this notebook.
        method: name of the reduction to apply:
            "l1pool"       - first layer, features of each token pooled to one value
            "l1mean"       - first layer, averaged over the tokens
            "last4avgpool" - mean of last 4 layers, then pooled as in "l1pool"
            "last4avgmean" - mean of last 4 layers, then averaged as in "l1mean"
            "last4catpool" - last 4 layers concatenated along the feature
                             dimension, each token pooled to one value
            "last4catmean" - last 4 layers concatenated, pooled with kernel 4,
                             then averaged over the tokens

    Returns:
        A 1-D tensor with 768 elements.

    Raises:
        ValueError: if ``method`` is not one of the names above.
    """
    # NOTE(review): none of the variants excludes padded positions, so padding
    # contributes to every embedding - confirm this is intended.
    if method == "l1pool":
        return pool1d_768(hidden_states[0]).view(768)
    if method == "l1mean":
        return torch.mean(hidden_states[0], dim=1).view(768)

    if method in ("last4avgpool", "last4avgmean"):
        last_four_layer = torch.stack(list(hidden_states[-4:]), dim=0)
        mean_layer = torch.mean(last_four_layer, dim=0)
        if method == "last4avgpool":
            return pool1d_768(mean_layer).view(768)
        return torch.mean(mean_layer, dim=1).view(768)

    if method in ("last4catpool", "last4catmean"):
        last_four_layers_cat = torch.cat(hidden_states[-4:], dim=2)
        if method == "last4catpool":
            return pool1d_4x768(last_four_layers_cat).view(768)
        # NOTE(review): kernel-4 pooling averages 4 ADJACENT columns of the
        # concatenated tensor, i.e. neighbouring features of the same layer.
        last_four_layers_pooled = pool1d_4(last_four_layers_cat)
        return torch.mean(last_four_layers_pooled, dim=1).view(768)

    raise ValueError("Unknown embedding method: {}".format(method))


# Dropout must be off so the embeddings are deterministic, regardless of what
# ran before this cell.
model.eval()

gpt2_embeddings = [0] * len(single_dataloader)

for step, batch in enumerate(tqdm(single_dataloader)):

  b_input_ids = batch[0].to(device)
  b_masks = batch[1].to(device)

  with torch.no_grad():
      # No labels are passed: only the hidden states are needed, so the
      # language-modelling loss is not computed.
      outputs = model(b_input_ids,
                      attention_mask = b_masks,
                      use_cache=False,
                      output_hidden_states=True
                     )

      document_embedding = extract_document_embedding(outputs.hidden_states, EMBEDDING_METHOD)

  gpt2_embeddings[step] = document_embedding.cpu().numpy()



100%|██████████| 115458/115458 [2:36:12<00:00, 12.32it/s]


In [None]:
# `gpt2_embeddings` holds the result of exactly ONE extraction variant - the
# one that was active when the extraction cell ran (the first-layer pooled
# variant, "l1pool", as the notebook stands). Writing that same array under
# every variant's file name would mislabel the data and overwrite previously
# saved, correct files, so only the matching file is written.
# To save another variant, re-run the extraction cell with that variant active
# and set SAVED_VARIANT to its name ("l1mean", "last4catpool", "last4catmean").
EMBEDDINGS_DIR = "/content/drive/MyDrive/Colab Notebooks/CE/data/hasDOI/embeddings_hep_th"
SAVED_VARIANT = "l1pool"

np.savez_compressed("{}/gpt2_{}_embeddings.npz".format(EMBEDDINGS_DIR, SAVED_VARIANT), embedding=gpt2_embeddings)


Here we just load and test an embedding to make sure it works.

In [None]:
# Re-open the saved archive to check that it can be read back.
# NOTE(review): allow_pickle=True permits loading pickled objects from the
# file; the recorded output below shows a plain float32 array, so it is
# presumably not needed here - confirm and drop it if so.
embeddings_test = np.load("/content/drive/MyDrive/Colab Notebooks/CE/data/hasDOI/embeddings_hep_th/gpt2_l1pool_embeddings.npz", allow_pickle=True)

In [None]:
# Display the array stored under the "embedding" key.
# NOTE(review): in the recorded output the rows agree in nearly every visible
# column, so the documents are almost indistinguishable - presumably because
# padded positions are not excluded during extraction; verify before using
# these embeddings downstream.
embeddings_test["embedding"]

array([[-0.00502737, -0.01228824, -0.00704627, ...,  0.00379606,
         0.00330601,  0.00294847],
       [-0.00502737, -0.01228824, -0.01255046, ...,  0.00379606,
         0.00330601,  0.00294847],
       [-0.00502737, -0.01228824, -0.00704627, ...,  0.00379606,
         0.00330601,  0.00294847],
       ...,
       [-0.00502737, -0.01228824, -0.00704627, ...,  0.00379606,
         0.00330601,  0.00294847],
       [-0.00502737, -0.01228824, -0.01151356, ...,  0.00379606,
         0.00330601,  0.00294847],
       [-0.00502737, -0.01228824, -0.00704627, ...,  0.00379606,
         0.00330601,  0.00294847]], dtype=float32)