# **GPT 2**

The following notebook is inspired by and uses parts of these two Hugging Face notebooks:

* [Train your tokenizer from scratch](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb)
* [Train your language model from scratch](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch-tf.ipynb)

## **Imports**

In [None]:
import csv
from tqdm.auto import tqdm 

import tensorflow as tf

import transformers
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AdamWeightDecay
from transformers import GPT2TokenizerFast
from transformers import DefaultDataCollator
from transformers import AutoConfig, TFAutoModelForCausalLM
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, AddedToken


from datasets import Dataset
from datasets import load_dataset

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot') 

# Root of the cloned repository; adapt this to your own setup. It is the base
# for the custom-module directory, the dataset CSV and the evaluation output
# file used further down in this notebook.
path = "YourPathHere"

# Put the repository's `Modules` directory on the import path so that the
# later `helper_functions` and `evaluation` imports can resolve
# (presumably both modules live in that directory -- verify against the repo).
import sys
sys.path.append(f'{path}/Modules')

Mount Google Drive:

In [None]:
# Colab-only step: mount Google Drive under /content/drive.
# `force_remount=True` asks for a fresh mount even if the drive is already
# mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

## **Dataset**

In [None]:
from helper_functions import dataset_cleanup

In [None]:
# Sentence-length bounds handed to the cleanup helper.
# NOTE(review): presumably counted in words per sentence -- confirm in helper_functions.
min_sent_len, max_sent_len = 10, 28

# Preprocessed news corpus inside the cloned repository.
data_path = f"{path}/Dataset/news_data_preprocessed.csv"

# The helper returns the cleaned corpus together with the maximum sequence
# length, which is reused later as the generation length limit.
cleaned_data, max_seq_length = dataset_cleanup(
    data_path=data_path,
    min_sent_len=min_sent_len,
    max_sent_len=max_sent_len,
)

## **Tokenizer**

In [None]:
# Target vocabulary size, passed both to the trainer object and to
# `train_new_from_iterator` below.
vocab_size=6803

# Build a word-level tokenizer that splits on whitespace only.
tokenizer = Tokenizer(models.WordLevel())
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit(add_prefix_space=True)
# NOTE(review): `trainer` is not referenced again in the cells visible here;
# the training itself goes through `train_new_from_iterator` -- confirm
# whether this object is still needed.
trainer = trainers.WordLevelTrainer(vocab_size=vocab_size)
# Wrap the raw tokenizer in the transformers fast-tokenizer class so it can be
# used with the pipeline / dataset code below. `tokenizer` is rebound here.
tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, add_prefix_space=True)
# Register the sentence boundary markers as the BOS/EOS special tokens.
tokenizer.add_special_tokens({"eos_token": "<End>", "bos_token": "<Start>"})
# Train on the cleaned corpus and rebind `tokenizer` to the trained tokenizer.
# Assumes `cleaned_data` is an iterable of already word-split sentences -- TODO confirm.
tokenizer = tokenizer.train_new_from_iterator(cleaned_data, vocab_size=vocab_size, is_split_into_words=True)

## **Data Pipeline**

Last cleanup and convert data to dicts:

In [None]:
# 85 % of the cleaned sentences go to training, the remainder to validation.
split_index = int(0.85 * len(cleaned_data))

# Work on the slice taken from `cleaned_data` and drop two entries by position.
# The second index is applied AFTER the first deletion has shifted the list,
# exactly as in the original cell order.
# NOTE(review): the reason these two entries are removed is not visible here.
train_sentences = cleaned_data[:split_index]
del train_sentences[115814]
del train_sentences[121909]

train_data = {"text": train_sentences}
validation_data = {"text": cleaned_data[split_index:]}

# Wrap both splits as `datasets.Dataset` objects with a single "text" column.
train_dataset = Dataset.from_dict(train_data)
validation_dataset = Dataset.from_dict(validation_data)

Tokenize already tokenized data:

(necessary step for huggingface transformer and doesn't alter the data)

In [None]:
def tokenize_function(examples):
    """Run the notebook-level `tokenizer` over the "text" entry of a batch.

    The value stored under "text" is forwarded with
    `is_split_into_words=True`, and the tokenizer's output is returned as is.
    """
    pre_split_sentences = examples["text"]
    return tokenizer(pre_split_sentences, is_split_into_words=True)

In [None]:
# Identical mapping options for both splits: process in batches, single
# process, and drop the raw "text" column from the result.
tokenize_map_options = dict(batched=True, num_proc=1, remove_columns=["text"])

tokenized_train_dataset = train_dataset.map(tokenize_function, **tokenize_map_options)
tokenized_validation_dataset = validation_dataset.map(tokenize_function, **tokenize_map_options)

Add labels for data collator and convert to tf dataset:

(They simply need to be a copy of the input ids: the shift by one position is applied at runtime inside the causal language model when it computes the loss, not by the data collator.)

In [None]:
def add_labels(result):
    """Attach a "labels" entry that duplicates the batch's "input_ids".

    The duplicate is made with `.copy()` on the "input_ids" value (a shallow
    copy), and the same mapping that was passed in is returned after being
    updated in place.
    """
    input_ids = result["input_ids"]
    result["labels"] = input_ids.copy()
    return result

In [None]:
# Same mapping options for both splits: batches of 256 examples, one process.
label_map_options = dict(batched=True, batch_size=256, num_proc=1)

# Note that `train_data` / `validation_data` are rebound here: from this cell
# on they are the tokenized datasets including a "labels" column.
train_data = tokenized_train_dataset.map(add_labels, **label_map_options)
validation_data = tokenized_validation_dataset.map(add_labels, **label_map_options)

In [None]:
# Collator that turns lists of examples into TensorFlow tensors.
# NOTE(review): no padding is requested anywhere in this notebook, so this
# presumably relies on all sequences already having equal length -- confirm.
data_collator = DefaultDataCollator(return_tensors="tf")

# Shared conversion settings: fixed column set, original order (no shuffling),
# batches of 16 examples.
tf_dataset_options = dict(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

# `train_dataset` / `validation_dataset` are rebound to the TensorFlow datasets.
train_dataset = train_data.to_tf_dataset(**tf_dataset_options)
validation_dataset = validation_data.to_tf_dataset(**tf_dataset_options)

## **Model**

Create randomly initialized GPT2-small model for language modelling:

In [None]:
# Start from the "gpt2" configuration, overriding the vocabulary size and the
# BOS/EOS ids with the values of the custom tokenizer.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Build the model from the configuration only (no pretrained weights are
# requested here).
News_GPT2 = TFAutoModelForCausalLM.from_config(config)

Model summary:

In [None]:
# Call the model once on its own dummy inputs before asking for the summary
# (presumably to build the layers so that summary() can list them -- verify).
News_GPT2(News_GPT2.dummy_inputs)  
News_GPT2.summary()

## **Training**

Set some hyperparameters:

In [None]:
# Optimizer hyperparameters; the weight-decay rate passed on is 0.0.
learning_rate = 2e-5
weight_decay = 0.0
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No `loss` argument is passed to compile().
# NOTE(review): presumably the model then uses its internal language-modeling
# loss computed from the "labels" column -- confirm for the installed version.
News_GPT2.compile(optimizer=optimizer)

Calculate initial train and test losses:

In [None]:
# Loss of the untrained model on the training set; this becomes the first
# point of the training curve plotted after training.
train_losses = [News_GPT2.evaluate(train_dataset)]

In [None]:
# Loss of the untrained model on the validation set; this becomes the first
# point of the "test" curve plotted after training.
test_losses = [News_GPT2.evaluate(validation_dataset)]

Train the model:

In [None]:
# Train for 10 epochs, evaluating on the validation set during training.
# The returned object's `.history` dict ('loss' / 'val_loss') is read by the
# plotting cell further down.
News_GPT2_losses = News_GPT2.fit(train_dataset, validation_data=validation_dataset, epochs=10)

Save the model:

In [None]:
# Persist the trained model under the relative path "GPT-2_Small"; the
# evaluation section reloads it from this same location.
News_GPT2.save_pretrained("GPT-2_Small")

Plot loss values for training and test:

In [None]:
# Combine the pre-training evaluation loss (first entry of each list) with the
# per-epoch losses recorded by fit(). The lists are rebuilt from that first
# entry instead of being appended to, so re-running this cell does not add the
# training history a second time.
history = News_GPT2_losses.history
train_losses = train_losses[:1] + list(history['loss'])
test_losses = test_losses[:1] + list(history['val_loss'])

# Derive the epoch count for the title from the recorded history rather than
# hard-coding it.
num_epochs = len(history['loss'])

plt.style.use('ggplot')
fig1, ax1 = plt.subplots(nrows=1, ncols=1, figsize = (10, 6))
ax1.plot(train_losses, label='training')
ax1.plot(test_losses, label='test')
ax1.set(ylabel='Loss', xlabel='Epochs', title=f'Average loss over {num_epochs} epochs')
ax1.legend()
# Save a transparent and an opaque PNG version of the figure; the file names
# are used verbatim (no extension is appended).
plt.savefig("news_gpt2_loss_plot10_transparent", dpi=500.0, format="png", transparent=True)
plt.savefig("news_gpt2_loss_plot10", dpi=500.0, format="png")
plt.show()

## **Evaluation**

Import evaluation module:

In [None]:
import evaluation

Load the trained model:

In [None]:
# Reload the model from the "GPT-2_Small" directory written by the save cell,
# so the evaluation section can be run without repeating the training.
News_GPT2 = TFAutoModelForCausalLM.from_pretrained("GPT-2_Small")

Create csv containing sentences for InferSent:

In [None]:
# Sample 10000 sequences from the trained model, starting from the "<Start> "
# prompt, and write them to the CSV used for the InferSent-based evaluation
# (one sentence per row, one decoded token per column).
generator_truncated = pipeline("text-generation", model=News_GPT2, tokenizer=tokenizer, return_full_text=False)
outputs = generator_truncated("<Start> ", max_length=max_seq_length, pad_token_id=4, num_return_sequences=10000, return_tensors=True)

# Current transformers releases document the return value as a list of dicts,
# one per generated sequence, each holding 'generated_token_ids'. The original
# cell indexed the result directly as a single dict. Accept both shapes so the
# cell does not depend on the installed version.
if isinstance(outputs, dict):
    sents = outputs['generated_token_ids']
else:
    sents = [output['generated_token_ids'] for output in outputs]

sentences = []
for sent in sents:
    temp = []
    # Skip the first id of every sequence and stop at the first id 4, the same
    # id that is passed as `pad_token_id` above.
    # NOTE(review): presumably the first id is the "<Start>" prompt token and
    # 4 marks the end of the sentence -- confirm against the tokenizer vocab.
    for token_id in sent[1:]:
        if token_id == 4:
            break
        temp.append(tokenizer.decode(token_id))
    sentences.append(temp)

with open(f"{path}/Evaluation/FID/GPT-2_Small_InferSent.csv", "w", encoding='utf8', newline="") as output_file:
    writer = csv.writer(output_file)
    writer.writerows(sentences)

In [None]:
# Generate a single sequence from the prompt "yesterday" and print its decoded
# tokens as a quick qualitative check.
generator_truncated = pipeline("text-generation", model=News_GPT2, tokenizer=tokenizer, return_full_text=False)
outputs = generator_truncated("yesterday", max_length=max_seq_length, pad_token_id=4, num_return_sequences=1, return_tensors=True)

# Current transformers releases document the return value as a list of dicts,
# one per generated sequence, each holding 'generated_token_ids'. The original
# cell indexed the result directly as a single dict. Accept both shapes so the
# cell does not depend on the installed version.
if isinstance(outputs, dict):
    sents = outputs['generated_token_ids']
else:
    sents = [output['generated_token_ids'] for output in outputs]

sentences = []
for sent in sents:
    temp = []
    # Skip the first id of the sequence and stop at the first id 4, the same
    # id that is passed as `pad_token_id` above.
    # NOTE(review): skipping the first id presumably drops the prompt word
    # itself from the printed sentence -- confirm that this is intended.
    for token_id in sent[1:]:
        if token_id == 4:
            break
        temp.append(tokenizer.decode(token_id))
    sentences.append(temp)

print(sentences)

Generate Sentences:

In [None]:
# Generate 10 sentences with the trained model and have the evaluation module
# print them (`print_sentences=True`).
evaluation.generate_sentences(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    print_sentences=True,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    num_sent=10,
)

Average sentence length:

In [None]:
# Same call as for the printed samples, but with 10000 sentences and printing
# switched off.
# NOTE(review): the average sentence length named in the heading is presumably
# reported inside evaluation.generate_sentences -- not visible from here.
evaluation.generate_sentences(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    print_sentences=False,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    num_sent=10000,
)

Prepare the reference data used for Bleu, Self-Bleu and Word Frequency calculations:

In [None]:
# Decode the validation sequences back into lists of token strings, using the
# same rule as for the generated sentences: skip the first id of every
# sequence and stop at the first id 4.
# NOTE(review): presumably the first id is "<Start>" and 4 is the end/padding
# id -- confirm against the tokenizer vocabulary.
reference_data = []

for token_ids in validation_data["input_ids"]:
    decoded_tokens = []
    for current_id in token_ids[1:]:
        if current_id == 4:
            break
        decoded_tokens.append(tokenizer.decode(current_id))
    reference_data.append(decoded_tokens)

Calculate the Jensen-Shannon distance for the sentence length frequencies and the word counts:

In [None]:
# Compare generated text against the reference data; the evaluation module
# returns one distance for sentence-length frequencies and one for word counts.
jsd_sents, jsd_words = evaluation.js_distance(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    reference_data=reference_data,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    max_seq_length=max_seq_length,
)

print(f"Jensen-Shannon distance for the sentence length frequencies: {jsd_sents}")
print(f"Jensen-Shannon distance for the word counts: {jsd_words}")

Calculate Bleu-4 Score:

In [None]:
# BLEU with 4-grams over 10000 generated sentences, scored against the
# reference data prepared from the validation split.
evaluation.bleu_score(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    reference_data=reference_data,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    num_sent=10000,
    n_grams=4,
    max_seq_length=max_seq_length,
)

Calculate Self-Bleu-4 Score:

In [None]:
# Self-BLEU with 4-grams over 10000 generated sentences; no reference data is
# passed to this call.
evaluation.self_bleu_score(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    num_sent=10000,
    n_grams=4,
    max_seq_length=max_seq_length,
)

Count Word Frequency:

In [None]:
# Number of entries shown in the frequency listings and plots below.
top_k = 12

# Word frequencies for the reference data and for generated text, returned as
# two separate mappings.
ref_freq, gen_freq = evaluation.word_freq(
    model=News_GPT2,
    index_decoder=tokenizer.decode,
    reference_data=reference_data,
    tokenizer=tokenizer,
    model_name="News_GPT2",
    max_seq_length=max_seq_length,
)

In [None]:
# Show the first `top_k` (word, count) entries of the reference frequencies.
# NOTE(review): assumes the mapping is already ordered by frequency -- confirm in evaluation.word_freq.
list(ref_freq.items())[:top_k]

In [None]:
# Show the first `top_k` (word, count) entries of the generated-text frequencies.
# NOTE(review): assumes the mapping is already ordered by frequency -- confirm in evaluation.word_freq.
list(gen_freq.items())[:top_k]

Word frequency plot:

In [None]:
# Destination for the plots; adapt before enabling `save_plots`. With
# `save_plots=False` the path is still passed on to the evaluation module.
save_path = "YourPathHere"

evaluation.word_freq_plots(
    reference_freq_dict=ref_freq,
    generated_freq_dict=gen_freq,
    top_k=top_k,
    save_plots=False,
    save_path=save_path,
)