## Conditional Text Generation with GPT-2

### Install and import libraries

In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 4.2 MB 41.4 MB/s 
[K     |████████████████████████████████| 596 kB 46.4 MB/s 
[K     |████████████████████████████████| 86 kB 4.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 30.8 MB/s 
[?25h

In [1]:
!nvidia-smi

Fri Jun 10 15:55:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import json
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output
from sklearn.model_selection import train_test_split

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.11.0+cu113


### Configurations

In [3]:
DEBUG           = False

INPUT_DIR       = 'data'

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768  #{768, 1024, 1280, 1600}

TEST_SIZE       = 0.2

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

### Load Dataset

In [5]:
data_df = pd.read_csv("train.csv")
print(f"df size: {len(data_df) :,}")
display(data_df.head(2))

with open('train_ners.json', "r", encoding="utf-8") as json_file:
    all_ners = json.load(json_file)
print(f"number of ners:{len(all_ners) :,}")


df size: 6,966


Unnamed: 0,topic,title,description,ners
0,sport,jets chairman christopher johnson won't fine p...,“i never want to put restrictions on the speec...,[]
1,sport,trump posthumously pardons boxer jack johnson,the pardoning of the black heavyweight boxer i...,[]


number of ners:3,428


### Datasets and loaders

In [15]:
class CommentGeneratorDataset(Dataset):

    def __init__(self, data, tokenizer, randomize=True):

        self.title = data['title'].tolist()
        self.desc  = data['description'].tolist()
        self.ners = [eval(ners) for ners in data['ners'].tolist()]
        self.randomize = randomize
        self.tokenizer = tokenizer 

    @staticmethod
    def join_ners(ners, randomize=True):
        N = len(ners)
        if randomize: 
            M = random.choice(range(N+1))
            ners = ners[:M]
            random.shuffle(ners)
        return ','.join(ners)

    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, i):
        ners = self.ners[i].copy()
        ners = self.join_ners(ners, self.randomize)
        
        input = SPECIAL_TOKENS['bos_token'] + self.title[i] + \
                SPECIAL_TOKENS['sep_token'] + ners + SPECIAL_TOKENS['sep_token'] + \
                str(self.desc[i]) + SPECIAL_TOKENS['eos_token']

        encodings_dict = tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [7]:
def split_data(data, test_size, seed):
    train_data, val_data = train_test_split(data, test_size=test_size, random_state=seed)   
    return train_data, val_data

### Loading Tokenizer, Config and Model

In [8]:
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

In [9]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  #load_model_path='pytorch_model.bin'
                  )

Special tokens added


In [10]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

In [16]:
train_data, val_data = split_data(data=data_df, 
                                  test_size=TEST_SIZE, 
                                  seed=SEED)

train_dataset = CommentGeneratorDataset(train_data, tokenizer)
val_dataset = CommentGeneratorDataset(val_data, tokenizer, randomize=False)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 5,572 samples for training, and 1,394 samples for validation testing'

### Fine-tune GPT2 using Trainer

In [17]:
# %%time

training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    # load_best_model_at_end=True,     
)

trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


trainer.train()
trainer.save_model()    

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
***** Running training *****
  Num examples = 5572
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 348


Epoch,Training Loss,Validation Loss
0,No log,0.213963
1,No log,0.189727
2,No log,0.186127
3,No log,0.185579


***** Running Evaluation *****
  Num examples = 1394
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1394
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1394
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1394
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to model
Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin
tokenizer config file saved in model/tokenizer_config.json
Special tokens file saved in model/special_tokens_map.json


In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
# Save to G-Drive ----------------------------------#
!cp -r 'model/pytorch_model.bin' 'drive/MyDrive/ai-bees/pytorch_model.bin'

In [16]:
# !cp -r 'drive/MyDrive/ai-bees/pytorch_model.bin' 'pytorch_model.bin'

In [44]:
!zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/training_args.bin (deflated 48%)
  adding: model/tokenizer_config.json (deflated 40%)
  adding: model/pytorch_model.bin (deflated 9%)
  adding: model/runs/ (stored 0%)
  adding: model/runs/Jun10_15-59-17_6bbc91189393/ (stored 0%)
  adding: model/runs/Jun10_15-59-17_6bbc91189393/events.out.tfevents.1654876757.6bbc91189393.656.0 (deflated 59%)
  adding: model/runs/Jun10_15-59-17_6bbc91189393/1654876757.4139614/ (stored 0%)
  adding: model/runs/Jun10_15-59-17_6bbc91189393/1654876757.4139614/events.out.tfevents.1654876757.6bbc91189393.656.1 (deflated 62%)
  adding: model/runs/Jun10_16-00-15_6bbc91189393/ (stored 0%)
  adding: model/runs/Jun10_16-00-15_6bbc91189393/1654876815.292745/ (stored 0%)
  adding: model/runs/Jun10_16-00-15_6bbc91189393/1654876815.292745/events.out.tfevents.1654876815.6bbc91189393.656.3 (deflated 62%)
  adding: model/runs/Jun10_16-00-15_6bbc91189393/events.out.tfevents.1654876815.6bbc91189393.656.2 (deflated 57%)
  adding:

### Generating text with Fine-tuned GPT-2 model

In [19]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='model/pytorch_model.bin')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

Special tokens added


loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50260,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "sum

In [36]:
title = data_df['title'].tolist()[0]
ners = eval(data_df['ners'].tolist()[0])

ners = CommentGeneratorDataset.join_ners(ners, randomize=False)

prompt = SPECIAL_TOKENS['bos_token'] + ners + SPECIAL_TOKENS['sep_token'] + \
         title + SPECIAL_TOKENS['eos_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [None]:
# Top-p (nucleus) text generation (10 samples):
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=10, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.3,        
                                temperature=0.2,
                                repetition_penalty=0.5,
                                num_return_sequences=2
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(text) + len(','.join(ners)) 
    print("{}: {}\n\n".format(i+1,  text[a:]))

In [26]:
MAXLEN

768

In [23]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(text) + len(','.join(ners))
    print("{}: {}\n\n".format(i+1,  text[a:]))

1: 




### Generating text with raw GPT2

In [24]:
tokenizer = get_tokenier()
model = get_model(tokenizer)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

In [40]:
# title = data_df['title'].tolist()[0][:10]
# ners = eval(data_df['ners'].tolist()[0])

prompt = title + '  ' + ners

In [42]:
# prompt = title

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval()
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=50,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0: jets chairman christopher johnson won't fine players for anthem protests   - http://www.youtube.com/watch?v=Xq8xHmVfJ0k&feature=youtu.be
Posted by




In [29]:
prompt

'jets chair  '