In [1]:
import torch as th
import einops

# Huggingface rundown
Hugging Face is an open-source ecosystem that provides API access to many pre-trained, open-source AI models, algorithms, and datasets. These are shared by the community (and are therefore open-source).


If I want to upload something to Hugging Face (a model, a dataset, etc.), I need to follow a certain format/standard set by them. This is because they provide an API that acts as the middleman between the user and the uploaded models/data. For this API to recognize my model/data, I need to specify metadata.


The most popular libraries are **transformers** and **datasets**.

In [2]:
from transformers import GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Instantiate the pretrained GPT-2 tokenizer identified by the name "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Vocabulary as a dictionary mapping each token string to its integer id
vocab = tokenizer.get_vocab()

# Show how many tokens the vocabulary holds (50257 for GPT-2)
vocab_size = len(vocab)
print(vocab_size)

# Show a 20-entry window of (token, id) pairs taken from the dictionary's
# iteration order; tokens starting with "Ġ" are ones preceded by a space
vocab_entries = list(vocab.items())
print(vocab_entries[10000:10020])

50257
[('Ġpocket', 10000), ('ĠInv', 10001), ('Download', 10002), ('ĠCrime', 10003), ('Ġbene', 10004), ('ĠGuide', 10005), ('ĠAssembly', 10006), ('Ġparameters', 10007), ('IE', 10008), ('ĠAlexander', 10009), ('Ġconcert', 10010), ('ĠSche', 10011), ('Ġshoes', 10012), ('Ġvisiting', 10013), ('Ġrecall', 10014), ('Ġbub', 10015), ('Ġrural', 10016), ('Ġconcrete', 10017), ('ĠRos', 10018), ('Next', 10019)]


{'input_ids': [15496, 11, 703, 389, 345, 30], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
# Sequences in a batch must share one length so they can be stacked into a
# single fixed-size tensor, so the shorter ones are filled up with a padding
# token. Here the end-of-sequence token is reused as that padding token.
tokenizer.pad_token = tokenizer.eos_token

# The returned "attention_mask" indicates which positions hold actual data (1)
# and which ones are padding (0). It is eventually used in the attention
# computation: scores for positions whose mask value is 0 are set to -inf
# before the softmax, so padding receives zero attention weight.
batch_sentences = ["Hello, how are you?", "Hello World"]
tokenizer(batch_sentences, padding=True, return_tensors="pt")

{'input_ids': tensor([[15496,    11,   703,   389,   345,    30],
        [15496,  2159, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 0, 0, 0, 0]])}