In [19]:
from transformers import AutoTokenizer
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [20]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [23]:
example = "My name is Pranav Jha. I love coding"
encoding = tokenizer(example)

In [24]:
print(encoding.tokens())

['[CLS]', 'My', 'name', 'is', 'P', '##rana', '##v', 'J', '##ha', '.', 'I', 'love', 'coding', '[SEP]']


In [25]:
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 4, 4, 5, 6, 7, 8, None]

In [26]:
encoding.sequence_ids()

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None]

In [27]:
encoding.word_to_chars(3)

CharSpan(start=11, end=17)

In [28]:
example[11:17]


'Pranav'

In [29]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 16),
 (16, 17),
 (18, 19),
 (19, 21),
 (21, 22),
 (23, 24),
 (25, 29),
 (30, 36),
 (0, 0)]

In [30]:
from transformers import pipeline

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796021,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [31]:
from transformers import AutoTokenizer,AutoModelForTokenClassification
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint,)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
encode = tokenizer(example,return_tensors="pt")
output = model(**encode)

In [33]:
output.logits.shape

torch.Size([1, 14, 9])

In [34]:
labels = model.config.id2label
labels

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [35]:
import torch

probabilities = torch.nn.functional.softmax(output.logits, dim=-1)[0].tolist()
predictions = output.logits.argmax(dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0]


In [36]:
results = []
tokens = encode.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        results.append(
            {"entity": label, "score": probabilities[idx][pred], "word": tokens[idx]}
        )

print(results)

[{'entity': 'I-PER', 'score': 0.9993104934692383, 'word': 'P'}, {'entity': 'I-PER', 'score': 0.9922254085540771, 'word': '##rana'}, {'entity': 'I-PER', 'score': 0.9980796575546265, 'word': '##v'}, {'entity': 'I-PER', 'score': 0.999276340007782, 'word': 'J'}, {'entity': 'I-PER', 'score': 0.9982074499130249, 'word': '##ha'}]


In [37]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 16),
 (16, 17),
 (18, 19),
 (19, 21),
 (21, 22),
 (23, 24),
 (25, 29),
 (30, 36),
 (0, 0)]

In [38]:
example[12:16]

'rana'

In [39]:
print(tokenizer.convert_ids_to_tokens(encode.input_ids.flatten()))

['[CLS]', 'My', 'name', 'is', 'P', '##rana', '##v', 'J', '##ha', '.', 'I', 'love', 'coding', '[SEP]']


In [40]:
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != "O":
        # Remove the B- or I-
        label = label[2:]
        start, _ = offsets[idx]

        # Grab all the tokens labeled with I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]] == f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1

        # The score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
    idx += 1

print(results)

[{'entity_group': 'PER', 'score': 0.9974198698997497, 'word': 'Pranav Jha', 'start': 11, 'end': 21}]


In [41]:
example[11:21]

'Pranav Jha'

In [42]:
## Fast tokenizers in the QA pipeline

In [43]:
from transformers import pipeline

question_answerer = pipeline("question-answering")
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9802601933479309,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [44]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [45]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-cased-distilled-squad', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [46]:
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
print("Context length : ",len(context.split()))
print("Tokens Length : ",len(tokenizer.tokenize(context)))


Context length :  40
Tokens Length :  56


In [47]:
result = model(**tokenizer(question,context,return_tensors='pt'))

In [48]:
result.start_logits.shape

torch.Size([1, 67])

In [49]:
result.end_logits.shape

torch.Size([1, 67])

In [107]:
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

Hugging Face is a technology company based in New York City. It is known for its contributions to the field of natural language processing (NLP) and its open-source library, Transformers. The Transformers library provides pre-trained models that have achieved state-of-the-art performance on various NLP tasks, such as text classification, language generation, and named entity recognition.

Hugging Face was founded in 2016 by Clément Delangue, Julien Chaumond, and Thomas Wolf. The company has gained significant recognition for democratizing access to powerful NLP models through its user-friendly API and easy-to-use Python library. Developers and researchers around the world use Hugging Face's models to build a wide range of applications, from chatbots to language translation systems.

The Transformers library includes a diverse set of models, each designed for specific NLP tasks. Some of the popular models in the library include BERT (Bidirectional Encoder Representations from Transformers), GPT (Generative Pre-trained Transformer), RoBERTa, and T5 (Text-to-Text Transfer Transformer). These models have been fine-tuned on large datasets and can be further fine-tuned on domain-specific data to achieve even better performance.

Hugging Face's commitment to open-source and collaboration has made it a central hub for the NLP community. The library has a vast user base, and the company actively engages with developers and researchers through its forums, tutorials, and educational resources. Hugging Face continues to push the boundaries of NLP research and innovation, making it a leading force in the field.

The company's headquarters in New York City serves as a hub for its operations, with a dedicated team of experts working on improving the library, developing new models, and exploring the latest advancements in NLP. Hugging Face's impact on the NLP landscape is undeniable, and it continues to shape the future of language technology.

with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow.
"""
print("Context Length : ",len(long_context.split()))
print("Tokens Length : ",len(tokenizer.tokenize(long_context)))

Context Length :  608
Tokens Length :  868


In [108]:
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=long_context)

{'score': 0.9420106410980225,
 'start': 3998,
 'end': 4025,
 'answer': 'Jax, PyTorch and TensorFlow'}

In [109]:
print("Context Length : ",len(long_context))

Context Length :  4027


In [99]:
868 * 868

753424

In [52]:
result = model(**tokenizer(question, context,return_tensors='pt'))

In [53]:
result.start_logits

tensor([[-4.4952, -6.4454, -4.7115, -7.0968, -7.0726, -7.4981, -5.5397, -4.1368,
         -5.9199, -5.4193, -1.5920, -1.0857, -5.0981, -2.9331, -3.4070,  2.2467,
          5.1563, -1.3602, -2.2209, -0.9686, -4.8112, -2.2527,  1.4383, 10.1211,
         -1.5311,  2.2685, -1.8952, -2.2108, -4.2142, -2.5571, -2.3252, -2.6046,
          1.7047, -1.9867, -1.7211, -0.5415, -2.0239, -4.4246, -5.1012, -4.4966,
         -7.8940, -6.7200, -4.6759, -6.3279, -4.8339, -5.1839, -3.3724, -7.4120,
         -8.1542, -4.4871, -7.4659, -4.3293, -4.2293, -3.1903, -7.9467, -5.2665,
         -7.5902, -5.0570, -7.4476, -7.9083, -6.5951, -7.4061, -8.8821, -7.6749,
         -6.9879, -7.0466, -5.4193]], grad_fn=<CloneBackward0>)

512

In [130]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
question = "Which deep learning libraries back 🤗 Transformers?"
inputs = tokenizer(
    question,
    long_context,
    stride=128,
    max_length=model.config.max_position_embeddings,
    padding="longest",
    truncation="only_second",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_tensors="pt"
)
_ = inputs.pop("overflow_to_sample_mapping")
offsets = inputs.pop("offset_mapping")
outputs = model(**inputs)

In [131]:
print(inputs.tokens())

['[CLS]', 'Which', 'deep', 'learning', 'libraries', 'back', '[UNK]', 'Transformers', '?', '[SEP]', '[UNK]', 'Transformers', ':', 'State', 'of', 'the', 'Art', 'NL', '##P', '[UNK]', 'Transformers', 'provides', 'thousands', 'of', 'pre', '##tra', '##ined', 'models', 'to', 'perform', 'tasks', 'on', 'texts', 'such', 'as', 'classification', ',', 'information', 'extraction', ',', 'question', 'answering', ',', 'sum', '##mar', '##ization', ',', 'translation', ',', 'text', 'generation', 'and', 'more', 'in', 'over', '100', 'languages', '.', 'Its', 'aim', 'is', 'to', 'make', 'cutting', '-', 'edge', 'NL', '##P', 'easier', 'to', 'use', 'for', 'everyone', '.', '[UNK]', 'Transformers', 'provides', 'API', '##s', 'to', 'quickly', 'download', 'and', 'use', 'those', 'pre', '##tra', '##ined', 'models', 'on', 'a', 'given', 'text', ',', 'fine', '-', 'tune', 'them', 'on', 'your', 'own', 'data', '##sets', 'and', 'then', 'share', 'them', 'with', 'the', 'community', 'on', 'our', 'model', 'hub', '.', 'At', 'the', 

In [132]:
print(inputs.sequence_ids())

[None, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [133]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

torch.Size([2, 512]) torch.Size([2, 512])


In [138]:
import torch

sequence_ids = inputs.sequence_ids()
# Mask everything apart from the tokens of the context
mask = [i != 1 for i in sequence_ids]
# Unmask the [CLS] token
mask[0] = False

In [140]:
mask = torch.logical_or(torch.tensor(mask)[None], (inputs["attention_mask"] == 0))

start_logits[mask] = -10000
end_logits[mask] = -10000

In [141]:
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)[0]
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]

In [142]:
scores = start_probabilities[:, None] * end_probabilities[None, :]

In [151]:
end_probabilities.shape

torch.Size([512])

In [144]:
scores = torch.triu(scores)

In [145]:
max_index = scores.argmax().item()
start_index = max_index // scores.shape[1]
end_index = max_index % scores.shape[1]

In [146]:
scores[start_index,end_index]

tensor(0.5692, grad_fn=<SelectBackward0>)

In [147]:
inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
offsets = inputs_with_offsets["offset_mapping"]

start_char, _ = offsets[start_index]
_, end_char = offsets[end_index]
answer = context[start_char:end_char]

In [149]:
offsets[start_index],offsets[end_index]

((19, 25), (44, 51))

In [154]:
inputs['input_ids'].shape

torch.Size([2, 512])

In [155]:
#####

In [167]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer.backend_tokenizer))

<class 'tokenizers.Tokenizer'>


In [168]:
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [169]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str('How are   you ?')

[('How', (0, 3)), ('are', (4, 7)), ('you', (10, 13)), ('?', (14, 15))]

In [170]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are   you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('Ġhow', (6, 10)),
 ('Ġare', (10, 14)),
 ('ĠĠ', (14, 16)),
 ('Ġyou', (16, 20)),
 ('?', (20, 21))]

In [171]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

[('▁Hello,', (0, 6)),
 ('▁how', (7, 10)),
 ('▁are', (11, 14)),
 ('▁you?', (16, 20))]

In [175]:
## Byte Pair Encoding Tokenizer - Implementing BPE

corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [176]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")