# Notebook for running predictions on text data

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [14]:
# Relative path to the locally saved checkpoint; both names point at the same directory.
model_checkpoint = "../../Models/ROB_0.89F1_16B_100000DAT"
model_name = model_checkpoint

# Token-classification head plus the matching tokenizer, both read from the checkpoint.
model = AutoModelForTokenClassification.from_pretrained(model_name)
# NOTE(review): add_prefix_space=True is presumably set because the tokenizer is later
# fed pre-split words (is_split_into_words=True) - confirm against the tokenizer type.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [15]:
# Tag names for the classifier; a label's position in the list is its class id.
label_list = ["O", "B-PRODUCT", "I-PRODUCT"]

In [16]:

# Takes a sentence that is already split into words and returns one label per word
def predict_labels(text, model, tokenizer, label_list, max_length=512):
    """Predict one label per word for a sentence that is already split into words.

    The words are tokenized into sub-tokens, the model scores every sub-token,
    and each word receives the label predicted for its FIRST sub-token. Later
    sub-tokens of the same word are ignored: with the usual token-classification
    training setup only the first sub-token carries a label, so predictions on
    the remaining pieces are not reliable.

    Args:
        text: Sequence of words, e.g. ``sentence.split()``.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``logits`` attribute indexed as (batch, token, label).
        tokenizer: Callable tokenizer whose result provides ``word_ids()``.
        label_list: Label names; position in the list is the class id.
        max_length: Maximum number of tokens; longer input is truncated.

    Returns:
        A list with ``len(text)`` label strings. Words that produced no
        sub-token (for example because they were truncated away) keep the
        default ``'O'``.
    """
    # Nothing to label; skip the tokenizer and the forward pass entirely.
    if len(text) == 0:
        return []

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        is_split_into_words=True,
    )
    # Maps every sub-token position to the index of its source word
    # (None for special tokens).
    word_ids = inputs.word_ids()

    with torch.no_grad():
        logits = model(**inputs).logits

    # Best class id per sub-token of the single sequence, as plain Python ints.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    labels = ['O'] * len(text)
    labelled_words = set()
    for word_index, class_id in zip(word_ids, predicted_ids):
        # Skip special tokens and every sub-token after a word's first one.
        if word_index is None or word_index in labelled_words:
            continue
        labelled_words.add(word_index)
        labels[word_index] = label_list[class_id]

    return labels
    

In [23]:
# Sample input used by the prediction cells. The [URL]/[TITLE]/[TEXT] markers and the
# <NO_URL>/<NO_TITLE> placeholders are part of the string itself - presumably the
# layout the model was trained on (TODO confirm).
text = """[URL] <NO_URL> [URL] [TITLE] <NO_TITLE> [TITLE]
[TEXT]  
Calgary
Calgary storage with drawer
Choose your design


Material
matt ash grey lacquered

Change


Leg
matt ash grey structure lacquered

Change

Material
matt ash grey lacquered

Leg
matt ash grey structure lacquered

Rec. retail price

$2,819.00

From $2,819.00

Add to cart
Expected delivery

Contact store

See deliver

 [TEXT] """

# Split on whitespace once; the same word list feeds the model and the printout.
words = text.split()
labels = predict_labels(words, model, tokenizer, label_list)

# One "word: label" line per input word, with the word padded to 10 columns.
for token, label in zip(words, labels):
    print(f"{token:10}: {label}")

[URL]     : O
<NO_URL>  : O
[URL]     : O
[TITLE]   : O
<NO_TITLE>: O
[TITLE]   : O
[TEXT]    : O
Calgary   : O
Calgary   : O
storage   : O
with      : O
drawer    : O
Choose    : O
your      : O
design    : O
Material  : B-PRODUCT
matt      : I-PRODUCT
ash       : I-PRODUCT
grey      : I-PRODUCT
lacquered : I-PRODUCT
Change    : O
Leg       : B-PRODUCT
matt      : O
ash       : I-PRODUCT
grey      : I-PRODUCT
structure : I-PRODUCT
lacquered : I-PRODUCT
Change    : B-PRODUCT
Material  : O
matt      : I-PRODUCT
ash       : I-PRODUCT
grey      : I-PRODUCT
lacquered : I-PRODUCT
Leg       : O
matt      : O
ash       : O
grey      : O
structure : O
lacquered : O
Rec.      : O
retail    : O
price     : O
$2,819.00 : O
From      : O
$2,819.00 : O
Add       : O
to        : O
cart      : O
Expected  : O
delivery  : O
Contact   : O
store     : O
See       : O
deliver   : O
[TEXT]    : O


In [24]:


# Tokenize the raw string (not pre-split into words) and run one forward pass.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# Raw scores, then softmax across the label dimension to get probabilities.
logits = outputs.logits
probabilities = torch.softmax(logits, dim=2)

# Token strings for display, plus the sub-token -> word mapping.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
word_ids = inputs.word_ids()

# Only one sequence was fed in, so work on batch element 0: one row per token,
# one column per entry of label_list.
sequence_probs = probabilities[0]

# Per-token probability of each tag, read column-wise.
o_tag_probs = sequence_probs[:, label_list.index('O')].tolist()
b_product_probs = sequence_probs[:, label_list.index('B-PRODUCT')].tolist()
i_product_probs = sequence_probs[:, label_list.index('I-PRODUCT')].tolist()

# Predicted tag per token = the label with the highest probability.
predicted_labels = [
    label_list[class_id] for class_id in sequence_probs.argmax(dim=1).tolist()
]

rows = zip(tokens, predicted_labels, o_tag_probs, b_product_probs, i_product_probs)
for token, label, o_prob, b_prob, i_prob in rows:
    print(f"{token:10}: {label:10} O: {o_prob:.2f} B-PRODUCT: {b_prob:.2f} I-PRODUCT: {i_prob:.2f}")

<s>       : O          O: 0.98 B-PRODUCT: 0.01 I-PRODUCT: 0.01
[URL]     : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
<NO_URL>  : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[URL]     : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TITLE]   : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
<NO_TITLE>: O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TITLE]   : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ċ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUCT: 0.00
[TEXT]    : O          O: 0.99 B-PRODUCT: 0.00 I-PRODUCT: 0.00
Ġ         : O          O: 1.00 B-PRODUCT: 0.00 I-PRODUC