# H.P. Lovecraft Language Generation using GPT-2
## Dr. Tristan Behrens (https://www.linkedin.com/in/dr-tristan-behrens-734967a2/)

Trains a Neural Network on the collected works by H.P. Lovecraft.

# TODOs.

- [ ] Do I need to set the sequence length in the tokenizer?
- [ ] Is the model big enough?
- [ ] Is the implementation sound?

- https://towardsdatascience.com/train-gpt-2-in-your-own-language-fc6ad4d60171
- https://discuss.huggingface.co/t/gpt2-training-from-scratch-in-german/1157

In [1]:
!pip install --upgrade transformers datasets

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.4 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 75.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 52.6 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 63.6 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB

## Count the number of available GPUs.

In [None]:
import torch

# Training and generation below run on PyTorch, so ask PyTorch which CUDA
# devices it can use; what another framework detects is not what matters here.
physical_devices = [
    torch.cuda.get_device_name(device_index)
    for device_index in range(torch.cuda.device_count())
]
print("Num GPUs:", len(physical_devices))

## Download the corpus.

In [None]:
import os

if not os.path.exists("lovecraftcorpus"):
    !git clone https://github.com/vilmibm/lovecraftcorpus

In [None]:
# List the contents of the cloned corpus directory.
!ls lovecraftcorpus

## Create the tokenizer.

In [None]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace
from transformers import GPT2TokenizerFast
import os
import glob

# Collect the corpus files.
# glob returns files in arbitrary, filesystem-dependent order; sorting makes
# `paths` (and the train/test split derived from it later) identical on every run.
# TODO: Is this enough data?
paths = sorted(glob.glob("lovecraftcorpus/*.txt"))
if not paths:
    raise FileNotFoundError(
        "No .txt files found in 'lovecraftcorpus/'; run the download cell first.")
print(paths)

# Create the tokenizer used for training the vocabulary.
# Keep its built-in byte-level pre-tokenization: GPT2TokenizerFast applies
# byte-level pre-tokenization when the vocabulary is loaded again below, so
# the vocabulary has to be trained the same way. Do not swap in a Whitespace
# pre-tokenizer here.
bpe_tokenizer = ByteLevelBPETokenizer()

# Train the tokenizer.
# "<|endoftext|>" must be part of the vocabulary: it is used as the padding
# token below and is the default bos/eos/unk token of the GPT-2 tokenizer.
print("Training tokenizer...")
bpe_tokenizer.train(
    files=paths,
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["<|endoftext|>"])
print("Done.")

# Save the vocabulary and merges.
# Only these model files are written, so a truncation setting made on
# `bpe_tokenizer` would not survive the reload; the maximum length is set on
# the reloaded tokenizer instead.
tokenizer_path = "tokenizer"
os.makedirs(tokenizer_path, exist_ok=True)
bpe_tokenizer.save_model(tokenizer_path)

# Turn into pretrained tokenizer that can be used during training.
tokenizer = GPT2TokenizerFast.from_pretrained(
    tokenizer_path,
    pad_token="<|endoftext|>")
# Maximum sequence length; matches `n_positions` of the model configuration.
tokenizer.model_max_length = 1024

## Tokenize some text.

In [None]:
# Encode a sample sentence with the tokenizer created above.
text = "Today is a great day to be creative."
indices = tokenizer.encode(text)
# Display the character count, the token count, and the token ids.
len(text), len(indices), indices

## Prepare the datasets.

In [None]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

# Split dataset into train and test, file-wise: the first 90% of `paths` are
# used for training, the remaining files for evaluation.
split_index = int(0.9 * len(paths))
train_paths = paths[:split_index]
test_paths = paths[split_index:]
assert train_paths and test_paths, "need at least two corpus files to build a train/test split"
dataset = load_dataset("text", data_files={"train": train_paths, "test": test_paths})

# The "text" loader produces one example per line and keeps blank lines as
# empty examples. Those contain no tokens to learn from, so drop them.
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)

# Inspect.
print("Dataset:", dataset)
print("Sample:", dataset["train"][10])

# Tokenize the dataset.
# No padding at this stage: the data collator below pads every training batch
# to the longest sequence in that batch, which avoids storing (and training
# on) long runs of padding tokens.
dataset = dataset.map(lambda batch: tokenizer(batch["text"], truncation=True), batched=True)

# Set pytorch format.
dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Inspect.
print("Dataset:", dataset)
print("Sample:", dataset["train"][10])

# Create a data collator.
# It is required: with mlm=False it pads each batch and derives the `labels`
# from `input_ids`, which the model needs in order to compute the
# language-modeling loss.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

## Create the model.

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, set_seed

# Seed the random number generators before the model is built so that the
# randomly initialised weights are the same on every run.
set_seed(42)

# Configuration of a small GPT-2 that is trained from scratch.
# - vocab_size uses len(tokenizer) so that tokens added on top of the base
#   vocabulary are covered by the embedding matrix as well.
# - pad_token_id is set explicitly so that padding is handled consistently
#   during generation.
# A smaller alternative configuration: n_head=4, n_layer=4, n_embd=128,
# n_positions=512.
config = GPT2Config(
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    n_head=8,
    n_layer=6,
    n_embd=512,
    n_positions=1024
)
print(config)

# Create the (untrained) model from the configuration.
model = GPT2LMHeadModel(config)

## Train the model.

In [None]:
from transformers import Trainer, TrainingArguments

# Evaluation and checkpointing happen on the same step interval.
STEPS_BETWEEN_CHECKPOINTS = 500

training_arguments = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./output",
    overwrite_output_dir=True,
    save_steps=STEPS_BETWEEN_CHECKPOINTS,
    save_total_limit=20,
    # Evaluation schedule.
    evaluation_strategy="steps",
    eval_steps=STEPS_BETWEEN_CHECKPOINTS,
    # Training length and batch size.
    num_train_epochs=50,
    per_device_train_batch_size=10,
)
print(training_arguments)

# Wire the model, the data and the collator together.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

trainer.train()

## Generate some text.

In [None]:
text = "A"

# The model is typically still in training mode after trainer.train();
# switch to evaluation mode so that dropout does not perturb the predictions.
model.eval()

# Move the prompt to the device the model lives on instead of assuming CUDA,
# so the cell also works on a CPU-only machine.
input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)

# Generate a continuation of the prompt.
# `temperature` only has an effect when sampling is enabled; without
# do_sample=True the decoding is greedy and the temperature is ignored.
# Beam-search options (num_beams, no_repeat_ngram_size, num_return_sequences)
# can be passed here as an alternative.
beam_outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    temperature=1.01,
    pad_token_id=tokenizer.pad_token_id,
)

# Decode and print every generated sequence.
for beam_output in beam_outputs:
    print(tokenizer.decode(beam_output))
    print("")