# Notebook for running predictions on text data

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [4]:
# Checkpoint identifier handed to from_pretrained(); it is kept under both
# names (model_checkpoint and model_name), which refer to the same string.
model_checkpoint = "distilbert-base-uncased-for-product-extraction/full_text_strictly_labeled_86000_0.87"
model_name = model_checkpoint

# Load the tokenizer first, then the token-classification model, both from
# that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [5]:
# Label names for the tagger. predict_labels() indexes this list with the
# predicted class id, so the order here has to match the model's output order.
# (Presumably a BIO scheme: outside / begin-product / inside-product.)
label_list = [
    "O",
    "B-PRODUCT",
    "I-PRODUCT",
]

In [6]:

# Takes a split sentence (a list of words) and returns one label per word
def predict_labels(text, model, tokenizer, label_list, max_length=512):
    """Predict one label for every word of a pre-split sentence.

    Args:
        text: Sequence of words, e.g. ``sentence.split()``. A plain string is
            also accepted and is split on whitespace before tokenizing.
        model: Token-classification model. It is called as ``model(**inputs)``
            and must return an object exposing ``logits``.
        tokenizer: Tokenizer called with ``is_split_into_words=True`` whose
            returned encoding provides ``word_ids()``.
        label_list: Label names, indexed by the predicted class id.
        max_length: Maximum number of tokens given to the model; longer
            inputs are truncated.

    Returns:
        A list of labels with the same length as ``text``. A word that is
        broken into several sub-tokens receives the label predicted for its
        FIRST sub-token. Words that were cut off by truncation keep the
        default ``'O'`` label.
    """
    if isinstance(text, str):
        # is_split_into_words=True expects a list of words; without this a raw
        # string would also make len(text) count characters instead of words.
        text = text.split()

    labels = ['O'] * len(text)
    if not text:
        # Nothing to label: skip the tokenizer and the model entirely.
        return labels

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        is_split_into_words=True,
    )
    # For each token: index of the word it came from, or None for special tokens.
    word_ids = inputs.word_ids()

    with torch.no_grad():
        logits = model(**inputs).logits

    # Most likely class id per token of the single sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Only the first sub-token of a word decides the word's label. Letting
    # later pieces overwrite it (e.g. trailing punctuation, or a continuation
    # piece tagged I-/O) would replace the label predicted for the word start.
    labelled_words = set()
    for word_index, class_id in zip(word_ids, predicted_ids):
        if word_index is None or word_index in labelled_words:
            continue
        labelled_words.add(word_index)
        labels[word_index] = label_list[class_id]

    return labels
    

In [12]:
# Demo input: one string using [URL] / [TITLE] / [TEXT] markers, with
# <NO_URL> and <NO_TITLE> as placeholders.
# NOTE(review): the recorded output of this cell lists words that do not occur
# in this literal, so the output was presumably produced with different text
# between the [TEXT] markers - confirm before relying on it.
text = """[URL] <NO_URL> [URL] [TITLE] <NO_TITLE> [TITLE]
[TEXT]  [TEXT]
 [TEXT] """

# Split on whitespace and predict one label per resulting token.
labels = predict_labels(text.split(), model, tokenizer, label_list)

# Print each token (padded to at least 10 characters) next to its label.
for token, label in zip(text.split(), labels):
    print(f"{token:10}: {label}")

[URL]     : O
<NO_URL>  : O
[URL]     : O
[TITLE]   : O
<NO_TITLE>: O
[TITLE]   : O
[TEXT]    : O
Gaming    : B-PRODUCT
desk      : I-PRODUCT
with      : O
a         : O
lot       : O
of        : O
space     : O
for       : O
your      : O
gaming    : O
setup.    : O
The       : O
desk      : O
is        : O
made      : O
of        : O
high      : O
quality   : O
materials : O
and       : O
has       : O
a         : O
modern    : O
design.   : O
The       : O
desk      : O
is        : O
perfect   : O
for       : O
gamers    : O
who       : O
need      : O
a         : O
lot       : O
of        : O
space     : O
for       : O
their     : O
gaming    : O
setup.    : O
The       : O
desk      : O
is        : O
easy      : O
to        : O
assemble  : O
and       : O
comes     : O
with      : O
all       : O
the       : O
necessary : O
tools.    : O
The       : O
desk      : O
is        : O
also      : O
very      : O
durable   : O
and       : O
will      : O
last      : O
for       : O
a   

In [7]:


# inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
# 
# # Get model predictions
# with torch.no_grad():
#     outputs = model(**inputs)
# 
# # Get the logits (raw prediction scores)
# logits = outputs.logits
# 
# # Apply softmax to get the probabilities
# probabilities = torch.softmax(logits, dim=2)
# 
# # Convert token IDs to words
# tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
# word_ids = inputs.word_ids()
# 
# 
# # Get probabilities for O, B-PRODUCT, and I-PRODUCT
# predicted_labels = []
# o_tag_probs = []
# b_product_probs = []
# i_product_probs = []
# 
# for i, token_probs in enumerate(probabilities[0]):
#     o_tag_prob = token_probs[label_list.index('O')].item()
#     b_product_prob = token_probs[label_list.index('B-PRODUCT')].item()
#     i_product_prob = token_probs[label_list.index('I-PRODUCT')].item()
#     
#     # Get the predicted label (max probability)
#     max_prob, predicted_idx = torch.max(token_probs, dim=0)  # Get the max probability and its index
#     predicted_label = label_list[predicted_idx]
# 
#     predicted_labels.append(predicted_label)
#     o_tag_probs.append(o_tag_prob)
#     b_product_probs.append(b_product_prob)
#     i_product_probs.append(i_product_prob)
#     
# 
# for token, label, o_prob, b_prob, i_prob in zip(tokens, predicted_labels, o_tag_probs, b_product_probs, i_product_probs):
#     print(f"{token:10}: {label:10} O: {o_prob:.2f} B-PRODUCT: {b_prob:.2f} I-PRODUCT: {i_prob:.2f}")

[URL]     : O
<NO_URL>  : O
[URL]     : O
[TITLE]   : O
[TITLE]   : O
[TEXT]    : O
The       : O
Best      : O
Sleeper   : O
Sofas     : O
for       : O
Small     : O
Spaces    : O
[TEXT]    : O
