In [1]:
!pip install tiktoken emoji --no-cache-dir



In [2]:
# init the GPT-4 Tokenizer
# `enc` is the shared tokenizer object that later cells use to encode text.
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.n_vocab) # number of tokens in total

100277


In [3]:
# Build the emoji pool: every key of emoji.EMOJI_DATA, shuffled reproducibly.
import random

import emoji

emojis = list(emoji.EMOJI_DATA)
random.seed(15)  # fixed seed so the shuffle is repeatable for the same key list
random.shuffle(emojis)
print(len(emoji.EMOJI_DATA))  # number of possible emoji

5034


In [4]:
# Sanity check: pool size, then the first ten and last ten shuffled entries.
print(len(emojis))
print([*emojis[:10], *emojis[-10:]])

5034
['🧎\u200d♀', '📏', '🤾🏻\u200d♀️', '🧑🏾\u200d🦼\u200d➡️', '🍏', '🙍\u200d♀️', '✌🏿', '🤙🏻', '🈴', '🧑\u200d🦼\u200d➡️', '🦜', '🧯', '🍌', '🇯🇲', '👷🏼\u200d♂️', '👧', '🇵🇲', '🤦\u200d♀', '🇪🇹', '👩🏿\u200d❤️\u200d💋\u200d👩🏼']


In [5]:
def text_to_tokens(text, max_per_row=10):
    """Render ``text`` as rows of emoji, one emoji per token.

    The text is tokenized with the module-level ``enc``; each distinct token id
    is paired with one entry of the module-level ``emojis`` list, so a token
    that repeats is always drawn with the same emoji.

    Args:
        text: The string to tokenize.
        max_per_row: Maximum number of tokens (emoji) placed on one output line.

    Returns:
        The emoji rendering, with rows joined by newlines. An input that
        produces no tokens yields an empty string.

    Raises:
        ValueError: If ``max_per_row`` is smaller than 1, or if the text
            contains more distinct tokens than there are entries in ``emojis``.
    """
    if max_per_row < 1:
        # A negative step would silently produce no rows at all.
        raise ValueError(f"max_per_row must be at least 1, got {max_per_row}")

    ids = enc.encode(text)
    unique_tokens = set(ids)

    # zip() stops at the shorter input, so surplus tokens would be left without
    # an emoji and only fail later with a bare KeyError; fail early instead.
    if len(unique_tokens) > len(emojis):
        raise ValueError(
            f"text has {len(unique_tokens)} distinct tokens but only "
            f"{len(emojis)} emoji are available"
        )

    # map all tokens we see to a unique emoji (pairing follows set iteration order)
    id_to_emoji = dict(zip(unique_tokens, emojis))

    # do the translation, max_per_row tokens per line
    lines = [
        ''.join(id_to_emoji[token_id] for token_id in ids[start:start + max_per_row])
        for start in range(0, len(ids), max_per_row)
    ]
    return '\n'.join(lines)

In [6]:
# Sample passage (a short "words vs tokens" explainer) used as the input text
# for the tokenization demo that follows in this cell.
text = """Words vs Tokens
A word is most likely what you think it is - the most simple form or unit of language as understood by humans. In the sentence, “I like cats”, there are three words - “I”, “like”, and “cats.” We can think of words as the primary building blocks of language; the fundamental pieces of language that we are taught from a very young age.

A token is a bit more complex. Tokenization is the process of converting pieces of language into bits of data that are usable for a program, and a tokenizer is an algorithm or function that performs this process, i.e., takes language and converts it into these usable bits of data. Thus, a token is a unit of text that is intentionally segmented for a large language model to process efficiently. These units can be words or any other subset of language - parts of words, combinations of words, or punctuation.

There are a variety of different tokenizers out there which reflect a variety of trade offs. Well-known tokenizers include NLTK (Natural Language Toolkit), Spacy, BERT tokenizer and Keras. Whether or not to select one of these or a different tokenizer depends upon your specific use case. On average, there are roughly 0.75 words per token, but there can be meaningful differences among tokenizers."""

# Show the passage as emoji (15 tokens per row), then the raw token ids and
# how many there are.
print(text_to_tokens(text, max_per_row=15))
token_ids = enc.encode(text)  # encode once and reuse for both prints
print(token_ids)
print(len(token_ids))

🧍🏿🏃🏾‍➡👩🏻‍🤝‍👩🏼🕴🏾🏋🏿🚶🏼‍♂️‍➡🏋🏾‍♀🇼🇸🤝🏾❎🫷👩🏾‍🦯🖖🏾🏋🏾‍♀👨🏻‍🦲
🏌‍♂️🇼🇸👳‍♂️🧑🏽‍❤‍🧑🏾🫁🧑🏽‍🤝‍🧑🏽🎗️🔓👱‍♀👩🏻‍❤️‍💋‍👨🏾🪱🧵🤾🏻‍♀️🧑🏽‍⚖🏌‍♂️
🔣🧎‍♀🧙🏽‍♀️👮🏾🇪🇷🤙🏻🧑🏼‍❤️‍💋‍🧑🏾🤎🍏👨‍👩‍👦🧑🏼‍🏫👨🏻‍🦲🧙🏽‍♀️👮🏾🧑🏼‍❤️‍💋‍🧑🏾
🧙🏽‍♀️🧑🏾‍🦯‍➡🧑🏼‍❤️‍💋‍🧑🏾👨🏾‍🦲🧙🏽‍♀️🍱🧚🏻‍♀️👩🏻‍❤‍💋‍👩🏾🇳🇫👩🏾‍🦯🎗️🧑🏼‍🏫👱‍♀🏌‍♂️📏
👩🏻‍🦲↕️🎗️🔓🧑‍🦼‍➡️🏌‍♂️👨🏿‍❤️‍💋‍👨🏻👩🏿‍❤️‍👩🏻🎗️🔓🇷🇪🇨🇦🍏👁️‍🗨️🏃🏽‍♂️‍➡
🖲☪️👱🏾‍♂🚶🏾‍♀‍➡🤫🏋🏿🇬🇹🏋🏾‍♀🖲🌫🧑🏻‍🦯‍➡️🎅🏼🤾🏻‍♀️🏄🏾‍♂✌🏿
🏋🏾‍♀🏌‍♂️🔃🎗️🧑🏾‍❤‍💋‍🧑🏿👩🏿‍❤️‍👩🏻🎗️🔓🥘🧑🏻‍🎓🎗️📮🇷🇪🍏🕵🏼‍♀️
☺🖲🈴🧎‍♀👨🏾‍🦲🖲📕🏋🏾‍♀🌶🫱🫁🇫🇷🇷🇪😕👩‍🦽‍➡
🔃🧎‍♀🦸🏾‍♂️🧑🏽‍⚕🤸🏾‍♂⏰🔓👨🏾‍🦲🧔🏼🖖🏾🥘🏊🕵🏼‍♀️🧑🏻‍🎓🎗️
📮🤾🏻‍♀️💇🏾🧎‍♀🖲🇬🇹🏋🏾‍♀🖲🧑🏽‍🤝‍🧑🏽🎗️🚶🏻‍♀‍➡🇷🇪🏋🏾‍♀🧔🏽‍♀👩🏿‍❤️‍💋‍👨🏻
☺🖲🏌🔓🧎🏾‍♀‍➡🇰🇾🔃〰️🤾🏻‍♀️👨🏽‍❤‍👨🏽🗜️🇳🇫🕓🧑🏼‍🏫🫁
🥖🏄🏼💆🏽‍♀️🎗️🔓👨🏻‍🦲🤲🏿🎗️🧑🏼‍🏫🧎‍♀♀️🎗️🧑🏼‍🏫🧎‍♀🫁
🧑‍🍼🤫🇦🇫🍏🖲🧑🏾‍🦼‍➡️🎗️🌭🇬🇹♑🔶🤎🛸🧑🏼‍🦽‍➡️🖲
🧑🏾‍🦼‍➡️🎗️🤾‍♀🌱🤾🏻‍♀️👔🧗🏼‍♀️🇬🇹♑🇵🇾🇹🇯👩🏿‍🎓📚🙋🏻‍♀🏾
🙇🏿‍♂️📆🧚🏿‍♂✋🏼🧎‍♀👨🏽‍🌾🤸🏼‍♀️📕👨🏾‍🦲👨‍🦽‍➡️🧗🏾‍♂🤾🏻‍♀️🇳🇨🫁🧑‍🦽
🇰🇾⏳🤵🏼‍♀️🎗️🏊🫁🖲🌭📕👨‍🍼🗣👩🏿‍🦯🪖🏃🏿‍♀‍➡️🧗🏻‍♀
🤾🏻‍♀️🖊🦹🏼‍♂️🧎‍♀🤎🍏🚶🏿‍♂️🤵‍♀🙍‍♀️🤾🏻‍♀️💂🧑🏼‍🏫🪤🇬🇹🧎‍♀
💌🤎🇳🇫🕓⬆️👩🏾‍🦳🙅‍♀🇬🇹♑🤾🏻‍♀️
[24390, 6296, 59266, 198, 32, 3492, 374, 1455, 4461, 1148, 499, 1781, 433, 374, 482, 279, 1455, 4382, 1376, 477, 5089, 315, 4221, 439, 16365, 555, 12966, 13, 763, 279, 11914, 11, 1054, 4

In [7]:
# Tokenize a short question to show how a single word can split into several
# tokens: emoji view first, then the token count and the raw ids.
text = """How many letters 'r' in the word 'strawberry'?"""
print(text_to_tokens(text, max_per_row=20))
token_ids = enc.encode(text)  # encode once and reuse for both prints
print(len(token_ids))
print(token_ids)

🧑‍🦽👩🏿‍❤️‍💋‍👨🏻🤾🏻‍♀️🙍‍♀️🤙🏻🧑🏾‍🦼‍➡️✌🏿💂📏🙍‍♀️🈴🧎‍♀🍏🧑‍🦼‍➡️
14
[4438, 1690, 12197, 364, 81, 6, 304, 279, 3492, 364, 496, 675, 15717, 71090]
