In [250]:
import os
import pandas as pd
import easyocr
import ssl
import requests
from PIL import Image
import re
import constants
from constants import unit_abbreviation_map

ssl._create_default_https_context = ssl._create_unverified_context


In [251]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

sample_train = train.iloc[:80]
sample_train


num = 78
img_link = sample_train['image_link'][num]
entity_name = sample_train['entity_name'][num] 
actual_pred = sample_train['entity_value'][num] 

img_link, entity_name, actual_pred

('https://m.media-amazon.com/images/I/718EdwGgyVL.jpg',
 'item_weight',
 '200.0 gram')

In [252]:
def ocr(img_link):
    reader = easyocr.Reader(['en'])
    get_from_net = requests.get(img_link, stream=True).raw
    image = Image.open(get_from_net)
    texts = reader.readtext(image)
    return texts

In [253]:
def clean1(texts):
    new_entries = []
    to_delete = []

    # Function to clean and normalize the input
    def clean_and_normalize(value):
        # Replace comma with period
        return value.replace(',', '.')

    # Loop over the list using enumerate to track index
    for i, text in enumerate(texts):
        # Check if '/' is in text[1]
        if '/' in text[1]:
            # Clean and normalize the text[1]
            cleaned_text = clean_and_normalize(text[1])
            # Split the cleaned text[1] by '/'
            todo = cleaned_text.split('/')
            
            # Add each part as a new tuple
            for k in todo:
                new_entries.append(
                    (text[0], k.strip(), text[-1])
                )
            
            # Mark the original entry for deletion
            to_delete.append(i)

    # Remove the original entries from texts in reverse order to avoid index issues
    for index in sorted(to_delete, reverse=True):
        del texts[index]

    # Extend the original texts list with the new entries
    texts.extend(new_entries)

    texts_dict = [{"text": text[1], "pred": text[-1]} for text in texts]
    texts_dict
    return texts_dict

In [254]:
def extract_and_replace_units(text, units_dict):
    # Create a regex pattern to match metrics and units
    pattern = r'(\d+\.?\d*)\s*(' + '|'.join(re.escape(key) for key in units_dict.keys()) + r')\b'
    
    # Find matches
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    
    # Extract metric and unit
    extracted_units = [{'metric': match[0], 'unit': units_dict.get(match[1].lower(), match[1])} for match in matches]
    
    # Replace units in the text
    def replace_match(match):
        return f"{match.group(1)} {units_dict.get(match.group(2).lower(), match.group(2))}"
    
    updated_text = re.sub(pattern, replace_match, text, flags=re.IGNORECASE)
    
    return {'text': updated_text, 'extracted_units': extracted_units}


In [255]:
# Function to normalize units
def normalize_unit(unit):
    return unit_abbreviation_map[entity_name].get(unit.lower(), unit)

# Function to convert input to standardized form
def convert_to_standard_form(input_str):
    # Regex to capture number and unit (with or without space between them)
    match = re.match(r"([0-9.]+)\s*([a-zA-Z]+)", input_str)
    if match:
        value = match.group(1)
        unit = match.group(2)
        
        # Normalize unit
        normalized_unit = normalize_unit(unit)
        
        # Return standardized form: number + normalized unit
        return f"{value} {normalized_unit}"
    
    # If no match, return the input string unchanged
    return input_str

In [256]:
def is_unit_in_list(input_str, unit_list):
    # Regex to capture number and unit (ignoring the number here)
    match = re.match(r"([0-9.]+)\s*([a-zA-Z\s]+)", input_str)
    if match:
        unit = match.group(2).strip()
        # Normalize the unit
        normalized_unit = normalize_unit(unit)
        # Check if the normalized unit is in the provided list
        return normalized_unit in unit_list
    return False

In [257]:
def isin(string, words):
    pattern = '|'.join(re.escape(word) for word in words)
    if re.search(pattern, string):
        return True
    else:
        return False

In [258]:
def extract_and_format(text):
    # Define a regex pattern to match the numeric part and the unit
    pattern = r'(\d+\.?\d*)\s*([a-zA-Z]+)'
    
    # Find the match
    match = re.search(pattern, text)
    
    if match:
        # Extract numeric part and unit
        numeric_part = float(match.group(1))  # Convert to float
        unit = match.group(2)
        return numeric_part, unit
    else:
        return None, None

In [259]:
texts = ocr(img_link)
texts[:5]

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


[([[728, 266], [1278, 266], [1278, 370], [728, 370]],
  'Available in',
  0.5693073721273247),
 ([[390, 377], [1616, 377], [1616, 488], [390, 488]],
  'TWO OTHER DIFFERENT',
  0.8562295871174908),
 ([[717, 495], [1287, 495], [1287, 605], [717, 605]],
  'FLAVOURS',
  0.9922197129267067),
 ([[571, 922], [732, 922], [732, 978], [571, 978]],
  'Qi344p',
  0.0631006310430465),
 ([[980, 929], [1141, 929], [1141, 965], [980, 965]],
  'Quantum',
  0.9648586875162135)]

In [260]:
texts_dict = clean1(texts)
texts_dict[:5]

[{'text': 'Available in', 'pred': 0.5693073721273247},
 {'text': 'TWO OTHER DIFFERENT', 'pred': 0.8562295871174908},
 {'text': 'FLAVOURS', 'pred': 0.9922197129267067},
 {'text': 'Qi344p', 'pred': 0.0631006310430465},
 {'text': 'Quantum', 'pred': 0.9648586875162135}]

In [261]:
cons = list(constants.entity_unit_map[entity_name])
cons

['pound', 'milligram', 'kilogram', 'gram', 'ton', 'microgram', 'ounce']

In [262]:
texts_dict

[{'text': 'Available in', 'pred': 0.5693073721273247},
 {'text': 'TWO OTHER DIFFERENT', 'pred': 0.8562295871174908},
 {'text': 'FLAVOURS', 'pred': 0.9922197129267067},
 {'text': 'Qi344p', 'pred': 0.0631006310430465},
 {'text': 'Quantum', 'pred': 0.9648586875162135},
 {'text': 'Quantum', 'pred': 0.9818917890357461},
 {'text': 'Naturals', 'pred': 0.9777094463335578},
 {'text': 'Naturals', 'pred': 0.9779276779277533},
 {'text': 'OuantoCikill', 'pred': 0.16430114314629382},
 {'text': 'QuantcClilldex', 'pred': 0.24975247081860708},
 {'text': 'MILLET', 'pred': 0.998769187570498},
 {'text': 'MILLET SHORTBREAD', 'pred': 0.793868897657066},
 {'text': 'With Psyllium Husk and', 'pred': 0.7044825834464848},
 {'text': 'With Psyllium Husk -', 'pred': 0.4595989936851875},
 {'text': 'Husk', 'pred': 0.9793620941235592},
 {'text': 'Fennel', 'pred': 0.9999903152587154},
 {'text': 'Enhanced', 'pred': 0.9999210155138248},
 {'text': 'Date SyIup', 'pred': 0.28167090356289015},
 {'text': 'Enhanced_', 'pred': 

In [263]:
for sub_text in texts_dict:
    print(sub_text)

{'text': 'Available in', 'pred': 0.5693073721273247}
{'text': 'TWO OTHER DIFFERENT', 'pred': 0.8562295871174908}
{'text': 'FLAVOURS', 'pred': 0.9922197129267067}
{'text': 'Qi344p', 'pred': 0.0631006310430465}
{'text': 'Quantum', 'pred': 0.9648586875162135}
{'text': 'Quantum', 'pred': 0.9818917890357461}
{'text': 'Naturals', 'pred': 0.9777094463335578}
{'text': 'Naturals', 'pred': 0.9779276779277533}
{'text': 'OuantoCikill', 'pred': 0.16430114314629382}
{'text': 'QuantcClilldex', 'pred': 0.24975247081860708}
{'text': 'MILLET', 'pred': 0.998769187570498}
{'text': 'MILLET SHORTBREAD', 'pred': 0.793868897657066}
{'text': 'With Psyllium Husk and', 'pred': 0.7044825834464848}
{'text': 'With Psyllium Husk -', 'pred': 0.4595989936851875}
{'text': 'Husk', 'pred': 0.9793620941235592}
{'text': 'Fennel', 'pred': 0.9999903152587154}
{'text': 'Enhanced', 'pred': 0.9999210155138248}
{'text': 'Date SyIup', 'pred': 0.28167090356289015}
{'text': 'Enhanced_', 'pred': 0.9580738461329938}
{'text': 'Coco Bu

In [264]:
new = []

for sub_text in texts_dict:
    result = extract_and_replace_units(sub_text['text'], constants.unit_abbreviation_map[entity_name])
    result['pred'] = sub_text['pred']  # Include the prediction value
    for stuff in result['extracted_units']:
        pred = result['pred']
        if stuff['metric'].isdigit():
            all = str(float(int(stuff['metric']))) + ' ' + stuff['unit']
        else: all = stuff['metric'] + ' ' + stuff['unit']
        new.append({'text': all, 'pred': pred})
for n in new:
    if n != []:
        texts_dict.append(n)

In [265]:
texts_dict

[{'text': 'Available in', 'pred': 0.5693073721273247},
 {'text': 'TWO OTHER DIFFERENT', 'pred': 0.8562295871174908},
 {'text': 'FLAVOURS', 'pred': 0.9922197129267067},
 {'text': 'Qi344p', 'pred': 0.0631006310430465},
 {'text': 'Quantum', 'pred': 0.9648586875162135},
 {'text': 'Quantum', 'pred': 0.9818917890357461},
 {'text': 'Naturals', 'pred': 0.9777094463335578},
 {'text': 'Naturals', 'pred': 0.9779276779277533},
 {'text': 'OuantoCikill', 'pred': 0.16430114314629382},
 {'text': 'QuantcClilldex', 'pred': 0.24975247081860708},
 {'text': 'MILLET', 'pred': 0.998769187570498},
 {'text': 'MILLET SHORTBREAD', 'pred': 0.793868897657066},
 {'text': 'With Psyllium Husk and', 'pred': 0.7044825834464848},
 {'text': 'With Psyllium Husk -', 'pred': 0.4595989936851875},
 {'text': 'Husk', 'pred': 0.9793620941235592},
 {'text': 'Fennel', 'pred': 0.9999903152587154},
 {'text': 'Enhanced', 'pred': 0.9999210155138248},
 {'text': 'Date SyIup', 'pred': 0.28167090356289015},
 {'text': 'Enhanced_', 'pred': 

In [266]:
cons = list(constants.entity_unit_map[entity_name])
prob = []
for row in texts_dict:
    if row['text'].lower():
        normalised = convert_to_standard_form(row['text'])
        if isin(normalised, cons):
            print(normalised)
            prob.append([normalised, row['pred']])

if not prob:
    prediction = ""
    print(prediction)
elif len(prob) > 1:
    
    m_idx = 0
    max_pred = -1  

    for idx, item in enumerate(prob):
        if item[1] > max_pred:  
            m_idx = idx
            max_pred = item[1]  

    prediction = prob[m_idx][0]  
    print(prediction)
else:
    prediction = prob[0][0]  
    print(prediction)


Swcetoned
Swcetoned


In [267]:
prediction

'Swcetoned'

In [268]:
numeric_part, unit = extract_and_format(prediction)

# Combine numeric part with unit
formatted_text = f"{numeric_part} {unit}"

# Print results
print(f"Numeric Part: {numeric_part}")
print(f"Unit: {unit}")
print(f"Formatted Text: {formatted_text}")

print(f"\nFinal: {formatted_text}")

Numeric Part: None
Unit: None
Formatted Text: None None

Final: None None


In [269]:
formatted_text, actual_pred

('None None', '200.0 gram')

In [270]:
formatted_text == actual_pred

False