In [1]:
from transformers import pipeline, set_seed

# Build a GPT-2 text-generation pipeline, fix the RNG seed, then sample five
# continuations of the prompt (each capped at 30 tokens in total).
generator = pipeline(task="text-generation", model="gpt2")
set_seed(42)
generator(
    "Hello, I'm a language model,",
    num_return_sequences=5,
    max_length=30,
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, but what I'm really doing is making a human-readable document. There are other languages, but those are"},
 {'generated_text': "Hello, I'm a language model, not a syntax model. That's why I like it. I've done a lot of programming projects.\n"},
 {'generated_text': "Hello, I'm a language model, and I'll do it in no time!\n\nOne of the things we learned from talking to my friend"},
 {'generated_text': "Hello, I'm a language model, not a command line tool.\n\nIf my code is simple enough:\n\nif (use (string"},
 {'generated_text': "Hello, I'm a language model, I've been using Language in all my work. Just a small example, let's see a simplified example."}]

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the pretrained GPT-2 tokenizer and the bare GPT-2 model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Encode one sample sentence as PyTorch tensors and run a single forward pass.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
# This pass is only inspected, never trained on: disable autograd so no
# computation graph is recorded or kept alive through `output`.
with torch.no_grad():
    output = model(**encoded_input)


In [10]:
import datasets

# Load the first 10% of the TinyStories training split and report its row count.
train_10 = datasets.load_dataset(
    "roneneldan/TinyStories",
    split="train[:10%]",
)
train_10.num_rows


Repo card metadata block was not found. Setting CardData to empty.


211972

In [5]:
def tokenization(example, max_length=None):
    """Tokenize the ``"text"`` field of a (batched) example with the notebook ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry; with ``batched=True`` in
            ``Dataset.map`` this is a list of strings.
        max_length: Truncation length in tokens. When ``None`` (the default),
            ``tokenizer.model_max_length`` is used, which is 1024 for the GPT-2
            tokenizer loaded in this notebook.

    Returns:
        Whatever ``tokenizer(...)`` returns; the attention mask is not requested.

    The limit is read from the tokenizer rather than from the global ``model``:
    ``model`` is rebound to a different object later in the notebook, and cells
    here are run out of order, so depending on it made the result depend on
    kernel history.
    NOTE(review): referencing ``model`` also likely forces ``datasets`` to
    serialize the whole network along with this function when ``map`` hashes it
    or runs it with ``num_proc`` workers -- confirm against the datasets docs.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length
    return tokenizer(
        example["text"],
        max_length=max_length,
        truncation=True,
        return_attention_mask=False,
    )


# Tokenize the subset in batches across 8 worker processes. The resulting
# input_ids can be fed into the model for outputs/logits to be softmaxed.
ds = train_10.map(function=tokenization, num_proc=8, batched=True)


Map (num_proc=8):   0%|          | 0/211972 [00:00<?, ? examples/s]

In [6]:
# Display the tokenized dataset summary (feature names and row count).
ds


Dataset({
    features: ['text', 'input_ids'],
    num_rows: 211972
})

In [25]:
# First example: the raw story text followed by its token ids.
tuple(ds[0][column] for column in ("text", "input_ids"))


('One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
 [3198,
  1110,
  11,
  257,
  1310,
  2576,
  3706,
  20037,
  1043,
  257,
  17598,
  287,
  607,
  2119,
  13,
  1375,
  2993,
  340,
  373,
  2408,
  284,
  711,
  351,
  340,
  780,
  340,
  373,
  7786,
  13,
  20037,
  2227,
  284,
  2648,
  262,
  17598,
  351,
  607,
  1995,
  11

In [9]:
# Number of token ids vs. number of space-separated pieces in the first story
# (splitting on " " always yields one more piece than there are spaces).
len(ds[0]["input_ids"]), ds[0]["text"].count(" ") + 1


(162, 132)

In [11]:
# Display the tokenizer output (input ids and attention mask) for the sample sentence.
encoded_input


{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [33]:
# Tokenize each word on its own, then the three-word phrase, and finish with
# the tokenizer object itself so its configuration is shown.
(
    *(tokenizer(snippet) for snippet in ("Replace", "me", "by", "Replace me by")),
    tokenizer,
)


Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


({'input_ids': [3041, 5372], 'attention_mask': [1, 1]},
 {'input_ids': [1326], 'attention_mask': [1]},
 {'input_ids': [1525], 'attention_mask': [1]},
 {'input_ids': [3041, 5372, 502, 416], 'attention_mask': [1, 1, 1, 1]},
 GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
 })

In [24]:
# Map token ids back to their token strings: first a hand-written id list,
# then the ids of the encoded sample sentence.
tuple(
    tokenizer.convert_ids_to_tokens(ids)
    for ids in (
        [3041, 5372, 502, 416, 597, 2420],
        encoded_input["input_ids"][0],
    )
)


(['Re', 'place', 'Ġme', 'Ġby', 'Ġany', 'Ġtext'],
 ['Re', 'place', 'Ġme', 'Ġby', 'Ġany', 'Ġtext', 'Ġyou', "'d", 'Ġlike', '.'])

In [26]:
# Sample five continuations of the opening sentence of the first story
# (each capped at 30 tokens in total).
generator(
    "One day, a little girl named Lily found a needle in her room.",
    num_return_sequences=5,
    max_length=30,
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'One day, a little girl named Lily found a needle in her room. She jumped out, grabbed a few items, pulled up the cover, got'},
 {'generated_text': 'One day, a little girl named Lily found a needle in her room. It was an old tube in her back, with her eyes closed, her'},
 {'generated_text': 'One day, a little girl named Lily found a needle in her room. She immediately had a good dream — a very different one.\n\nHer'},
 {'generated_text': 'One day, a little girl named Lily found a needle in her room. She was taken to the hospital and treated with antibiotics.\n\nShe has'},
 {'generated_text': 'One day, a little girl named Lily found a needle in her room.\n\n"What," she said, holding out her hand to stop the'}]

In [44]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.special` is loaded on every SciPy version.
import scipy.special

# Hidden-state vector of the first token of the first sequence, as a NumPy array.
tmp = output.last_hidden_state[0][0].detach().numpy()
# Exploratory: softmax over a hidden-state vector (these are not vocabulary
# logits). Computed once and reused for both the sum and the argmax.
tmp_softmax = scipy.special.softmax(tmp)
(
    output.last_hidden_state.shape,
    tmp.max(),
    tmp.min(),
    ((tmp > -1) & (tmp < 1)).sum(),  # how many components lie strictly inside (-1, 1)
    tmp_softmax.sum(),
    tmp_softmax.argmax(),
)


(torch.Size([1, 10, 768]), 46.90026, -12.414162, 755, 1.0, 496)

In [65]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# GPT-2 with a language-modelling head; generate() returns a structured output
# object (sequences, scores, ...) instead of a bare tensor.
model = AutoModelForCausalLM.from_pretrained("gpt2", return_dict_in_generate=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

encoded_prompt = tokenizer("This is a scary story.", return_tensors="pt")
input_ids = encoded_prompt.input_ids
# Adjust beams.
# Pass the attention mask and an explicit pad token id: without them generate()
# warns that results may be unreliable and falls back to eos_token_id itself.
outputs = model.generate(
    input_ids,
    attention_mask=encoded_prompt.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    num_beams=2,
    num_return_sequences=2,
    output_scores=True,
    length_penalty=0,
)
outputs


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


BeamSearchDecoderOnlyOutput(sequences=tensor([[ 1212,   318,   257, 14343,  1621,    13,   198,   198,    40,  1101,
           407,  1654,   644,   284,   787,   286,   340,    13,   198,   198],
        [ 1212,   318,   257, 14343,  1621,    13,   198,   198,    40,  1101,
           407,  1654,   644,   284,   787,   286,   340,    13,   314,  1101]]), sequences_scores=tensor([-17.3244, -19.0733]), scores=(tensor([[-12.6087, -11.0740, -12.2497,  ..., -23.2802, -27.1528,  -5.2005],
        [-12.6087, -11.0740, -12.2497,  ..., -23.2802, -27.1529,  -5.2005]]), tensor([[-13.2075, -12.2395, -17.4515,  ..., -21.1790, -14.0837, -14.2907],
        [-15.6107,  -8.7284, -13.3961,  ..., -32.1316, -39.7618, -10.3586]]), tensor([[-12.7155,  -2.6471,  -6.8343,  ..., -26.4551, -30.0658, -13.9922],
        [-14.3286, -12.3563, -17.0929,  ..., -19.5023, -18.9123, -14.7603]]), tensor([[-13.0612,  -8.4863, -10.2133,  ..., -22.5126, -26.4204,  -9.6500],
        [-12.1657, -11.5912, -15.1831,  ..., -20.

In [66]:
# Token strings of the two returned beams, followed by the length of the first.
(
    *(tokenizer.convert_ids_to_tokens(outputs["sequences"][beam]) for beam in (0, 1)),
    len(outputs["sequences"][0]),
)


(['This',
  'Ġis',
  'Ġa',
  'Ġscary',
  'Ġstory',
  '.',
  'Ċ',
  'Ċ',
  'I',
  "'m",
  'Ġnot',
  'Ġsure',
  'Ġwhat',
  'Ġto',
  'Ġmake',
  'Ġof',
  'Ġit',
  '.',
  'Ċ',
  'Ċ'],
 ['This',
  'Ġis',
  'Ġa',
  'Ġscary',
  'Ġstory',
  '.',
  'Ċ',
  'Ċ',
  'I',
  "'m",
  'Ġnot',
  'Ġsure',
  'Ġwhat',
  'Ġto',
  'Ġmake',
  'Ġof',
  'Ġit',
  '.',
  'ĠI',
  "'m"],
 20)

In [68]:
# Inspect the per-step beam scores: the raw tuple, its length, the shape of the
# first step, the arg-max id of the first row at step 0, that id as a token for
# steps 0 and 1, and finally the same lookup for the first five steps.
(
    outputs["scores"],
    len(outputs["scores"]),
    outputs["scores"][0].shape,
    outputs["scores"][0][0].argmax(),
    *(
        tokenizer.convert_ids_to_tokens([outputs["scores"][step][0].argmax().item()])
        for step in (0, 1)
    ),
    [
        tokenizer.convert_ids_to_tokens([outputs["scores"][step][0].argmax().item()])
        for step in range(5)
    ],
)


((tensor([[-12.6087, -11.0740, -12.2497,  ..., -23.2802, -27.1528,  -5.2005],
          [-12.6087, -11.0740, -12.2497,  ..., -23.2802, -27.1529,  -5.2005]]),
  tensor([[-13.2075, -12.2395, -17.4515,  ..., -21.1790, -14.0837, -14.2907],
          [-15.6107,  -8.7284, -13.3961,  ..., -32.1316, -39.7618, -10.3586]]),
  tensor([[-12.7155,  -2.6471,  -6.8343,  ..., -26.4551, -30.0658, -13.9922],
          [-14.3286, -12.3563, -17.0929,  ..., -19.5023, -18.9123, -14.7603]]),
  tensor([[-13.0612,  -8.4863, -10.2133,  ..., -22.5126, -26.4204,  -9.6500],
          [-12.1657, -11.5912, -15.1831,  ..., -20.0144, -13.1187, -12.7388]]),
  tensor([[-13.2222, -10.9737, -17.2112,  ..., -20.3124, -13.6227, -14.1369],
          [-13.7217, -12.3673, -15.8846,  ..., -19.0205, -18.6040, -13.4203]]),
  tensor([[-12.2772, -12.5668, -21.4743,  ..., -18.6896, -20.5301, -13.5424],
          [-13.9401, -11.3949, -16.4059,  ..., -18.3805, -18.4310, -14.6433]]),
  tensor([[ -9.8580, -10.6353, -18.2077,  ..., -17.4