# Notebook for running predictions on text data

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [4]:
# Load the fine-tuned token-classification checkpoint and its tokenizer from
# a local directory. Both names below are bound to the same path string.
model_checkpoint = "../../Models/ROB_0.89F1_16B_100000DAT"
model_name = model_checkpoint

# NOTE(review): add_prefix_space=True is presumably set so the tokenizer can be
# called with pre-split words (is_split_into_words=True) — confirm for this
# checkpoint's tokenizer type.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [5]:
# Tag set used to decode predictions: a predicted class id is used directly as
# an index into this list.
# NOTE(review): the order presumably matches the checkpoint's id2label mapping
# — confirm against the model config.
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT']

In [7]:

def predict_labels(text, model, tokenizer, label_list, max_length=512):
    """Predict one label per word for a sentence that is already split.

    Args:
        text: Sequence of words (e.g. the result of ``str.split()``).
        model: Token-classification model; called as ``model(**inputs)`` and
            expected to return an object with a ``logits`` attribute.
        tokenizer: Tokenizer whose encoding exposes ``word_ids()``.
        label_list: Labels indexed by predicted class id.
        max_length: Maximum number of sub-tokens; longer inputs are truncated.

    Returns:
        A list with ``len(text)`` labels. Each word gets the label predicted
        for its *first* sub-token. Words that received no sub-token (for
        example because they were cut off by truncation) stay ``'O'``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids[i] is the index of the word sub-token i came from, or None for
    # tokens that do not belong to any word (special tokens).
    word_ids = inputs.word_ids()

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single sequence is encoded, so take batch item 0 and convert the
    # class ids to plain ints before indexing label_list.
    predicted_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

    labels = ['O'] * len(text)
    labelled_words = set()

    for word_id, class_id in zip(word_ids, predicted_ids):
        # Skip special tokens and continuation pieces: letting later pieces
        # overwrite the label would, e.g., turn B-PRODUCT into I-PRODUCT for
        # a word that is split into several sub-tokens.
        if word_id is None or word_id in labelled_words:
            continue
        labelled_words.add(word_id)
        labels[word_id] = label_list[class_id]

    return labels
    

In [10]:
# Sample input in the tagged layout ([URL] / [TITLE] / [TEXT] sections).
text = """[URL] <NO_URL> [URL] [TITLE] <NO_TITLE> [TITLE]
[TEXT]  
Soft sofa for Sleeping and Lounging, Price  $ 100.00, 
 [TEXT] """

# Split on whitespace once and reuse the words for prediction and display.
words = text.split()
labels = predict_labels(words, model, tokenizer, label_list)

# Print each word next to its predicted label.
for token, label in zip(words, labels):
    print(f"{token:10}: {label}")

[URL]     : O
<NO_URL>  : O
[URL]     : O
[TITLE]   : O
<NO_TITLE>: O
[TITLE]   : O
[TEXT]    : O
Soft      : O
sofa      : O
for       : O
Sleeping  : O
and       : O
Lounging, : O
Price     : O
$         : O
100.00,   : O
[TEXT]    : O


In [11]:


# Show, for every sub-token of the raw (un-split) text, the predicted label
# together with the probability the model assigns to each class.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

# model prediction (inference only, so gradient tracking is disabled)
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

# Normalise the scores along dim 2 so each token's class scores sum to 1.
probabilities = torch.softmax(logits, dim=2)

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
word_ids = inputs.word_ids()

# A single sequence was encoded, so work on batch item 0: one row per token,
# one column per class.
token_probabilities = probabilities[0]

# Look up each class column once instead of once per token.
o_tag_probs = token_probabilities[:, label_list.index('O')].tolist()
b_product_probs = token_probabilities[:, label_list.index('B-PRODUCT')].tolist()
i_product_probs = token_probabilities[:, label_list.index('I-PRODUCT')].tolist()

# Most probable class per token, decoded through label_list.
predicted_labels = [
    label_list[class_id]
    for class_id in token_probabilities.argmax(dim=1).tolist()
]

for token, label, o_prob, b_prob, i_prob in zip(tokens, predicted_labels, o_tag_probs, b_product_probs, i_product_probs):
    print(f"{token:10}: {label:10} O: {o_prob:.2f} B-PRODUCT: {b_prob:.2f} I-PRODUCT: {i_prob:.2f}")

<s>       : O          O: 0.83 B-PRODUCT: 0.06 I-PRODUCT: 0.11
[URL]     : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
<NO_URL>  : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[URL]     : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TITLE]   : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
<NO_TITLE>: O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TITLE]   : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ċ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TEXT]    : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUC