In [11]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import re
from transformers import CLIPProcessor, CLIPModel


In [12]:
# Load the pre-trained CLIP model and tokenizer
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [13]:
'''# Preprocess the image
image_path = 'receipt1.jpg'
image = Image.open(image_path)
inputs = processor(text=None, images=image, return_tensors="pt")
print(inputs)'''
# Preprocess the image
image_path = 'receipt.jpg'
image = Image.open(image_path)
image_input = processor(
    images=image,
    return_tensors="pt",
    padding=True,
    max_length=512,
    truncation=True
)

In [4]:
'''# Encode the image
with torch.no_grad():
    image_features = model.get_image_features(**inputs)'''
# Encode the image
with torch.no_grad():
    image_features = model.get_image_features(**image_input)

In [5]:
# Define the text prompt
prompt = "Total amount payable"

In [6]:
# Encode the prompt
with torch.no_grad():
    prompt_inputs = processor(text=prompt, return_tensors="pt")
    prompt_features = model.get_text_features(**prompt_inputs)


In [7]:
# Calculate the similarity scores between the image and the prompt
with torch.no_grad():
    similarity_scores = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
print(similarity_scores)

tensor([[1.]])


In [8]:
# Get the top matching tokens
tokens = []
for i in similarity_scores.argsort(descending=True):
    token = processor.tokenizer.decode(i).strip()
    if len(token) > 0:
        tokens.append(token)
    if len(tokens) >= 5:
        break
print(tokens)

['!']


In [9]:
# Extract the total cost from the tokens
for token in tokens:
    match = re.match(r'\d+\.\d+', token)
    if match is not None:
        total_cost = float(match.group())
        break
else:
    # Define total_cost if no match is found
    total_cost = 0.0

In [10]:
print(f'Total cost: {total_cost}')

Total cost: 0.0
