## Inside the pipeline

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel

from transformers import AutoModelForSequenceClassification

# Checkpoint name passed to every `from_pretrained(checkpoint)` call in the
# cells that follow (tokenizer, bare model, and classification model).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"


In [2]:
# Text to numbers: load the checkpoint's tokenizer and encode a two-sentence batch

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_sentences = ['He will buy', 'he will not buy']
tokenized = tokenizer(
    raw_sentences,
    padding=True,          # pad the shorter sentence so both rows have equal length
    truncation=True,
    return_tensors='pt',   # return PyTorch tensors
)
tokenized

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'input_ids': tensor([[ 101, 2002, 2097, 4965,  102,    0],
        [ 101, 2002, 2097, 2025, 4965,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1]])}

In [3]:
# Last hidden state
#
# The output read here is `.last_hidden_state` -- one hidden vector per input
# token -- not classification logits, so the variable is named accordingly.
model = AutoModel.from_pretrained(checkpoint)
last_hidden_state = model(
    input_ids=tokenized['input_ids'],
    attention_mask=tokenized['attention_mask'],
).last_hidden_state

# Display the tensor's shape as the cell result.
last_hidden_state.shape

torch.Size([2, 6, 768])

In [5]:
# Round-trip the current model through a local directory: save, then reload.
save_dir = 'model'
model.save_pretrained(save_dir)
model2 = AutoModel.from_pretrained(save_dir)

In [116]:
# Classification head: load the same checkpoint through the
# sequence-classification class and read its logits.

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
classifier_output = model(
    input_ids=tokenized['input_ids'],
    attention_mask=tokenized['attention_mask'],
)
logits = classifier_output.logits

# Show both the values and the tensor shape.
(logits, logits.shape)

(tensor([[-3.5745,  3.7256],
         [ 4.1175, -3.3430]], grad_fn=<AddmmBackward0>),
 torch.Size([2, 2]))

## Scratch models with Config

In [25]:
from transformers import BertConfig, BertModel, AutoTokenizer

# Model built from a default config alone: no pretrained weights are loaded.
# It is kept under its own name so it is not silently discarded by the
# pretrained load below.
conf = BertConfig()
scratch_model = BertModel(conf)

# Pretrained weights; `model` is the name the following cells use.
model = BertModel.from_pretrained('bert-base-cased')

In [36]:
# Vocabulary size recorded in the config of the currently bound `model`.
model.config.vocab_size

28996

In [44]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-cased' model loaded earlier.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Encode one sentence as PyTorch tensors.
token = tokenizer('France is [MASK]', return_tensors='pt')

# Run the model on the ids only and show the hidden states of the first
# (and only) sequence in the batch.
encoder_output = model(token['input_ids'])
encoder_output.last_hidden_state[0]


tensor([[ 0.3368,  0.0216,  0.0380,  ..., -0.1186,  0.5137,  0.0122],
        [ 0.4598, -0.2675,  0.4053,  ..., -0.3439,  0.5116,  0.2068],
        [ 0.1169, -0.5137,  0.3464,  ...,  0.0287,  0.2505,  0.3026],
        [ 0.1842, -0.4429,  0.1636,  ...,  0.1074,  0.3006, -0.2138],
        [ 0.6525, -0.3284, -0.8536,  ..., -0.2074,  1.5712, -0.2596]],
       grad_fn=<SelectBackward0>)

In [90]:
# Walk through the tokenizer's steps one at a time:
# text -> tokens -> vocabulary ids -> text.
text = 'Hello Hamza'
tokens = tokenizer.tokenize(text)
print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# The decoded result is a string, so it gets its own name instead of
# overwriting the `tokens` list.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

['Hello', 'Ham', '##za']
[8667, 13030, 3293]
Hello Hamza


In [96]:
# Two sentences, encoded individually and as one padded batch.
text1 = 'Hello Ali'
text2 = 'He is going to school'
text = [text1, text2]

# Individual encodings (no padding).
token1 = tokenizer(text1)
token2 = tokenizer(text2)

# Batched encoding, padded so both rows have the same length.
tokens = tokenizer(text, padding=True)

In [98]:
from transformers import AutoModelForSequenceClassification

# Load 'bert-base-cased' through the sequence-classification class.
# NOTE: the load warning printed for this cell reports that
# `classifier.weight` / `classifier.bias` are newly initialized, so the
# logits produced by this model are not trained predictions.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
import torch
# Token ids of the first sentence as a 1-D tensor (no batch dimension yet).
torch.tensor(token1['input_ids'])

tensor([ 101, 8667, 4149,  102])

In [105]:
# First sentence alone: add a leading batch dimension, then run the model.
model(torch.tensor(token1['input_ids']).unsqueeze(0))

SequenceClassifierOutput(loss=None, logits=tensor([[0.7100, 0.1312]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [106]:
# Second sentence alone: add a leading batch dimension, then run the model.
model(torch.tensor(token2['input_ids']).unsqueeze(0))

SequenceClassifierOutput(loss=None, logits=tensor([[0.7300, 0.2339]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [109]:
# Both sentences as one padded batch; the attention mask is passed along
# with the ids.
batch_ids = torch.tensor(tokens['input_ids'])
batch_mask = torch.tensor(tokens['attention_mask'])
model(batch_ids, attention_mask=batch_mask)

SequenceClassifierOutput(loss=None, logits=tensor([[0.7100, 0.1312],
        [0.7300, 0.2339]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Additional parameters passed to the tokenizer

In [None]:
sequences = ['Hello hello','his name is Ali']

# Each variant below is stored under its own name so the encodings can be
# inspected and compared, instead of every call overwriting `model_inputs`.


# Padding

# Pads every sequence up to the length of the longest sequence in the batch
padded_to_longest = tokenizer(sequences, padding="longest")

# Pads every sequence up to the model max length
# (512 for BERT or DistilBERT)
padded_to_model_max = tokenizer(sequences, padding="max_length")

# Pads every sequence up to the specified max length
padded_to_8 = tokenizer(sequences, padding="max_length", max_length=8)



# Truncation

# Truncates the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
truncated_to_model_max = tokenizer(sequences, truncation=True)

# Truncates the sequences that are longer than the specified max length
truncated_to_8 = tokenizer(sequences, max_length=8, truncation=True)

# `model_inputs` ends up bound to the last encoding, exactly as before.
model_inputs = truncated_to_8