This notebook is designed to be run either locally or in Google Colab. For usage in Google Colab, the following files/folders should be uploaded to the default directory:
- utils.py
- /data/
    
Resources:
- https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_clm.py
- https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py
- https://huggingface.co/transformers/main_classes/trainer.html
- https://huggingface.co/transformers/training.html
- https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing#scrollTo=D6TKgyUzPIQc
- https://towardsdatascience.com/train-gpt-2-in-your-own-language-fc6ad4d60171
- https://jalammar.github.io/illustrated-gpt2/
- https://jalammar.github.io/illustrated-transformer/
- https://nlp.seas.harvard.edu/2018/04/03/attention.html
- https://towardsdatascience.com/fine-tuning-gpt2-for-text-generation-using-pytorch-2ee61a4f1ba7

# Install Prerequisites

In [None]:
!mkdir data/bbc/tech
!mkdir src/

In [1]:
import os
import sys
if 'google.colab' in str(get_ipython()):
    !pip install datasets
    !pip install transformers
else:
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))
    
from importlib import reload
import src.utils
reload(src.utils)

from src.utils import DataReader

# Load Data

In [None]:
from src.utils import DataReader
# from src.utils import pass_sliding_window

In [None]:
text = DataReader.read_bbc_tech()
text = ' '.join(sentence for sentence in text) # join into large string
text = text.split(' ') # split into words

# Load Model
Must load before before continiuing with data prep since we use the tokeniser in the data prep section.

In [2]:
from src.utils import TransformerLoader

In [3]:
model = 'gpt2'
tokenizer, model = TransformerLoader.from_huggingface(model, framework='pt')

# Prepare Data

In [None]:
from datasets import Dataset
import math
import random
from sklearn.model_selection import train_test_split

In [None]:
# split into seq_length lists of tokens
seq_length = 10
features = [text[x:x+seq_length] for x in range(0, len(text), seq_length)] # chunk string
print('{} non-overlapping sequences of length {} generated.'.format(len(features), seq_length))

In [None]:
# split into train test val
features_train, _ = train_test_split(features, shuffle=True, random_state=0, train_size=0.90)
features_test, features_val = train_test_split(_, shuffle=True, random_state=0, train_size=0.5)
print('{} rows in the train dataset.'.format(len(features_train)))
print('{} rows in the test dataset.'.format(len(features_test)))
print('{} rows in the validation dataset.'.format(len(features_val)))

In [None]:
# tokenise
def tokenize(features):
    features = tokenizer(features, padding='max_length', max_length=seq_length, is_split_into_words=True, truncation=True)
    features['labels'] = features['input_ids'].copy()
    return features

features_train = tokenize(features_train)
features_test = tokenize(features_test)
features_val = tokenize(features_val)

In [None]:
# build dataset
dataset_train = Dataset.from_dict(features_train)
dataset_test = Dataset.from_dict(features_test)
dataset_val = Dataset.from_dict(features_val)

In [None]:
dataset_train

# Train Model

In [None]:
from transformers import Trainer, TFTrainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
#     num_train_epochs=3,              # total # of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs,
    run_name='hf-pt-bbc-tech',
)


# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer,
)

trainer.train(
#     model_path='./lol'
)

In [None]:
trainer.evaluate()

# Generate new sequences

In [None]:
# user_text = 'Well hello there miss turnip, what a'
# user_text = 'And the stock market has hit'
# user_text = 'not many companies can'
# user_text = 'Then the little leaves'
user_text = 'and the current generation of mobiles using flash technology can'

In [None]:
# sequences = tokenizer.encode(user_text, return_tensors='pt')
# model.to('cuda')
# sequences = sequences.to('cuda')

In [None]:
# output_sequences = model.generate(
#   # input_ids=sequences['input_ids'],
#   input_ids=sequences,
#   max_length=50,
#   # temperature=args.temperature,
#   # top_k=args.k,
#   # top_p=args.p,
#   # repetition_penalty=args.repetition_penalty,
#   do_sample=True,
#   # num_return_sequences=0,
# )

In [None]:
# # Remove the batch dimension when returning multiple sequences
# if len(output_sequences.shape) > 2:
#     output_sequences.squeeze_()

# generated_sequences = []

# for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
#     print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
#     generated_sequence = generated_sequence.tolist()

#     # Decode text
#     text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

#     print(text)

In [6]:
def generate_sequence(text, model, tokenizer):
    sequences = tokenizer.encode(text, return_tensors='pt')
    try:
        model.to('cuda')
        sequences = sequences.to('cuda')
    except Exception as ex:
        print(ex)
        
    output_sequences = model.generate(
      input_ids=sequences,
      max_length=50,
      # temperature=args.temperature,
      # top_k=args.k,
      # top_p=args.p,
      # repetition_penalty=args.repetition_penalty,
      do_sample=True,
      # num_return_sequences=0,
    )
    
    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        print(text)

In [None]:
user_text = 'and the current generation of mobiles using flash technology can'
generate_sequence(user_text, model, tokenizer)

In [None]:
user_text = 'and the current generation of mobiles using flash technology can'
generate_sequence(user_text, model, tokenizer)

In [4]:
tokenizer_base, model_base = TransformerLoader.from_huggingface('gpt2', framework='pt')

In [7]:
user_text = 'and the current generation of mobiles using flash technology can'
generate_sequence(user_text, model_base, tokenizer_base)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


Torch not compiled with CUDA enabled
=== GENERATED SEQUENCE 1 ===
 and the current generation of mobiles using flash technology can be considered to have been produced, but since most of the flash used today are for smartphones, that does not necessarily mean that all mobiles were made by a single vendor. (It could be
