In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.17 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.10.2


# Import required libraries

In [2]:
import numpy as np
import torch
from transformers import GPT2LMHeadModel , GPT2Tokenizer

# Download the tokenizer and model

In [3]:
# Load the pretrained 'gpt2-large' tokenizer and language-model head (downloaded on first use).
# The tokenizer's end-of-sequence id is handed to the model as its padding id.
checkpoint = 'gpt2-large'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(
    checkpoint,
    pad_token_id=tokenizer.eos_token_id,
)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

# Test tokenizer encoding and decoding

In [5]:
# Tokenize the prompt into vocabulary indices (token ids).
# return_tensors='pt' requests the ids as a PyTorch tensor.
topic = "The weather is nice"
input_ids = tokenizer.encode(topic, return_tensors='pt')
# Bare last expression so the notebook displays the resulting tensor
input_ids

tensor([[ 464, 6193,  318, 3621]])

In [6]:
# Map the token ids of the first (and only) sequence back to text to check the round trip
first_sequence = input_ids[0]
tokenizer.decode(first_sequence)

'The weather is nice'

# Generate text 

## Beam Search generation

In [None]:
# Beam-search generation through GPT2LMHeadModel.generate
# Background reading: https://huggingface.co/blog/how-to-generate
#   max_length           - upper bound on the length of the returned sequence, counted in tokens
#                          (not words); the prompt tokens count towards it
#   num_beams            - number of hypotheses kept alive at each time step; the hypothesis with the
#                          highest overall probability is chosen at the end, which reduces the risk of
#                          missing a high-probability word sequence hidden behind an unlikely word
#   no_repeat_ngram_size - beam output still tends to repeat word sequences, so penalise n-grams:
#                          no n-gram of this size (here 2 tokens) may appear twice
#   early_stopping       - stop the search once num_beams finished hypotheses are available
output_beam = model.generate(
    input_ids,
    max_length=500,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

In [None]:
# Convert the first returned sequence back to text, leaving out special tokens
beam_text = tokenizer.decode(output_beam[0], skip_special_tokens=True)
print(beam_text)

## Top-K sampling

In [None]:
# Set random seeds so the sampled text is reproducible.
# model.generate draws its samples with PyTorch, so seeding NumPy alone does not pin the
# output; PyTorch's global generator must be seeded as well.
np.random.seed(0)
torch.manual_seed(0)

In [None]:
# Top-K sampling through GPT2LMHeadModel.generate:
# at each step only the K most likely next tokens are kept and the probability mass is
# redistributed among those K before one is drawn at random (the method adopted by GPT2)
output_topk = model.generate(
    input_ids,
    do_sample=True,
    max_length=500,
    top_k=50,
)

In [None]:
# Convert the sampled sequence back to text, leaving out special tokens
topk_text = tokenizer.decode(output_topk[0], skip_special_tokens=True)
print(topk_text)