# What is fed to the model? <br>
We do not feed the model a bare sequence of token ids; we first put the sequence in a batch container. <br>
sequence: tensor([2023,  3185, 19237,   999]) <br>
batch container: tensor([[2023,  3185, 19237,   999]]) <br>

In [77]:
# This cell fails on purpose at the final `model(input_ids)` call: the input tensor is 1D
# (a single sequence of token ids), not a 2D batch.

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

checkpoint = "allevelly/Movie_Review_Sentiment_Analysis"
raw_text1 = "This movie is scary!"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
token = tokenizer.tokenize(raw_text1)
token_ids = tokenizer.convert_tokens_to_ids(token)   # one-dimensional list of ids
input_ids = torch.tensor(token_ids)                  # one-dimensional tensor
print(token_ids, input_ids, sep='\n')
model(input_ids)      # raises IndexError: we fed the model a single 1D sequence instead of a 2D batch

#==============================================
# Batching means putting multiple sentences in one container and passing them through the model all at once.
# Even if you only have one sentence, you still need to put it in a batch container.
#==============================================

[2023, 3185, 2003, 12459, 999]
tensor([ 2023,  3185,  2003, 12459,   999])


IndexError: too many indices for tensor of dimension 1

In [75]:
# Fix the error by giving the input tensor a leading batch dimension.
input_ids = torch.tensor(token_ids).unsqueeze(0)   # shape (1, seq_len): a batch holding one sequence
print(input_ids)                                   # input_ids is now a 2D tensor, i.e. a batch
model(input_ids).logits

tensor([[ 2023,  3185,  2003, 12459,   999]])


tensor([[-1.6172,  1.9445]], grad_fn=<AddmmBackward0>)

# Attention Mask


In [72]:
# Compare the logits of each sentence run on its own with the logits of the
# same two sentences run together as one padded batch (no attention mask yet).

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

checkpoint = "allevelly/Movie_Review_Sentiment_Analysis"
raw_text1 = "This movie is scary! Great!"
raw_text2 = "This movie is scary!"
raw_text_batch = [raw_text1, raw_text2]

# Individual inputs: tokenize by hand and wrap the tokens in [CLS] ... [SEP]
# so the ids match what tokenizer() produces, then nest the ids in a list to
# get a batch of one.
token_text1 = ["[CLS]", *tokenizer.tokenize(raw_text1), "[SEP]"]
token_ids_text1 = tokenizer.convert_tokens_to_ids(token_text1)
input_ids_text1 = torch.tensor([token_ids_text1])

token_text2 = ["[CLS]", *tokenizer.tokenize(raw_text2), "[SEP]"]
token_ids_text2 = tokenizer.convert_tokens_to_ids(token_text2)
input_ids_text2 = torch.tensor([token_ids_text2])

# Batched input: let the tokenizer pad both sentences to a common length.
output = tokenizer(raw_text_batch, padding=True)
input_ids_batch = torch.tensor(output["input_ids"])
print(tokenizer.convert_ids_to_tokens(input_ids_batch[0]))

print('---------------input tensors----------------')
for input_tensor in (input_ids_text1, input_ids_text2, input_ids_batch):
    print(input_tensor)

print('---------------model logits----------------')
for input_tensor in (input_ids_text1, input_ids_text2, input_ids_batch):
    print(model(input_tensor).logits)

['[CLS]', 'this', 'movie', 'is', 'scary', '!', 'great', '!', '[SEP]']
---------------input tensors----------------
tensor([[  101,  2023,  3185,  2003, 12459,   999,  2307,   999,   102]])
tensor([[  101,  2023,  3185,  2003, 12459,   999,   102]])
tensor([[  101,  2023,  3185,  2003, 12459,   999,  2307,   999,   102],
        [  101,  2023,  3185,  2003, 12459,   999,   102,     0,     0]])
---------------model logits----------------
tensor([[-2.3800,  2.7929]], grad_fn=<AddmmBackward0>)
tensor([[-1.7340,  2.0677]], grad_fn=<AddmmBackward0>)
tensor([[-2.3800,  2.7929],
        [-1.5437,  1.8513]], grad_fn=<AddmmBackward0>)


From the printed output above, we conclude:
1. raw_text1 gets the same classification result, because its input tensor is identical whether it is run individually or in the batch.
2. raw_text2 gets different classification results, because its row in the batch has extra trailing zeros compared with its individual input tensor.
The zeros are padding; to make the model ignore the padding, we need an attention mask.

In [73]:
# Add an attention mask to the batch so that the padded raw_text2 row gives the
# same logits as when it is run on its own.
#
# The mask is 1 for real tokens and 0 for padding. Padding is detected by
# comparing against the tokenizer's own pad id rather than a hard-coded 0, and
# the comparison is a single vectorized tensor op instead of a Python loop with
# per-element .item() calls. It is converted back to nested lists so that the
# printed mask reads as plain 0/1 rows.
# NOTE: tokenizer(raw_text_batch, padding=True) is documented to return an
# equivalent `attention_mask` alongside `input_ids`; it is rebuilt here only to
# show how the mask is derived.
attention_mask = input_ids_batch.ne(tokenizer.pad_token_id).long().tolist()
print(attention_mask)

print(model(input_ids_batch,
            attention_mask=torch.tensor(attention_mask)).logits)

[[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0]]
tensor([[-2.3800,  2.7929],
        [-1.7340,  2.0677]], grad_fn=<AddmmBackward0>)
