In [272]:
import os
import pandas as pd
import easyocr
import ssl
import requests
from PIL import Image
import re
import constants
from constants import unit_abbreviation_map

# NOTE(review): this swaps the standard library's default HTTPS context factory
# for the unverified one, so stdlib HTTPS connections opened from this kernel
# skip certificate validation. Presumably a workaround for certificate errors
# during model/image downloads -- confirm, and prefer repairing the local CA
# bundle over disabling verification for the whole process.
ssl._create_default_https_context = ssl._create_unverified_context


In [273]:
# Load the CSV files from the dataset folder (relative path).
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

# Work on the first 80 training rows only.
sample_train = train.iloc[:80]
sample_train


# Pick one row to walk through the pipeline by hand.
num = 48
img_link = sample_train['image_link'][num]
entity_name = sample_train['entity_name'][num] 
# Ground-truth value for this row; compared against the prediction at the end.
actual_pred = sample_train['entity_value'][num] 

img_link, entity_name, actual_pred

('https://m.media-amazon.com/images/I/817vo3DcCNL.jpg',
 'wattage',
 '250.0 watt')

In [274]:
# One EasyOCR reader per language tuple. Constructing a Reader loads the models,
# so it is built once and reused across calls instead of once per image.
_OCR_READERS = {}


def _get_ocr_reader(languages=('en',)):
    """Return a cached ``easyocr.Reader`` for ``languages``, creating it on first use."""
    key = tuple(languages)
    if key not in _OCR_READERS:
        _OCR_READERS[key] = easyocr.Reader(list(key))
    return _OCR_READERS[key]


def ocr(img_link, timeout=30):
    """Download an image and run EasyOCR (English) on it.

    Parameters
    ----------
    img_link : str
        URL of the image to read.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook forever. Defaults to 30 seconds.

    Returns
    -------
    list
        The result of ``Reader.readtext``; in this notebook each item is a
        ``(bounding_box, text, confidence)`` tuple.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        On connection problems or timeouts.
    """
    reader = _get_ocr_reader(('en',))

    # The context manager releases the connection even when decoding fails.
    with requests.get(img_link, stream=True, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of handing an HTML error page to PIL.
        response.raise_for_status()
        # Have urllib3 undo any gzip/deflate content encoding on the raw stream.
        response.raw.decode_content = True
        image = Image.open(response.raw)
        # Force the pixel data to be read while the connection is still open.
        image.load()

    return reader.readtext(image)

In [275]:
def clean1(texts):
    """Split '/'-separated OCR detections and reshape everything into dicts.

    Parameters
    ----------
    texts : list
        OCR detections. Each item is indexable, with the recognised string at
        position ``1`` and the confidence at the last position.

    Returns
    -------
    list of dict
        ``{"text": ..., "pred": ...}`` records. Detections without a '/' come
        first, in their original order. Detections containing '/' follow, one
        record per '/'-separated part (stripped, with ',' replaced by '.'),
        each carrying the confidence of the detection it came from.

    Notes
    -----
    The input list is left untouched, so the cell that calls this function can
    be re-run safely on the same ``texts``.
    """
    kept = []
    split_parts = []

    for entry in texts:
        raw_text, confidence = entry[1], entry[-1]

        if '/' not in raw_text:
            kept.append({"text": raw_text, "pred": confidence})
            continue

        # Comma -> period normalisation only applies to the entries being split.
        for part in raw_text.replace(',', '.').split('/'):
            split_parts.append({"text": part.strip(), "pred": confidence})

    # Split parts go after the untouched detections.
    return kept + split_parts

In [276]:
def extract_and_replace_units(text, units_dict):
    """Find ``<number><unit>`` mentions in ``text`` and expand the unit names.

    Parameters
    ----------
    text : str
        Text to scan.
    units_dict : dict
        Maps a unit spelling to its replacement. Matching is case-insensitive
        and the lookup uses the lower-cased match, so keys are presumably
        lower-case -- verify against ``constants``.

    Returns
    -------
    dict
        ``{'text': <text with units replaced>, 'extracted_units': [...]}``,
        where each extracted item is ``{'metric': <number as str>, 'unit':
        <replacement>}``, in order of appearance.
    """
    # With no units the alternation would be empty and match every bare number.
    if not units_dict:
        return {'text': text, 'extracted_units': []}

    # Longest spellings first, so a short key cannot win over a longer one
    # that matches at the same position.
    alternatives = '|'.join(
        re.escape(key) for key in sorted(units_dict, key=len, reverse=True)
    )
    pattern = re.compile(r'(\d+\.?\d*)\s*(' + alternatives + r')\b', flags=re.IGNORECASE)

    def expand(raw_unit):
        # Fall back to the matched spelling when the lower-cased form is not a key.
        return units_dict.get(raw_unit.lower(), raw_unit)

    extracted_units = [
        {'metric': found.group(1), 'unit': expand(found.group(2))}
        for found in pattern.finditer(text)
    ]
    updated_text = pattern.sub(
        lambda found: f"{found.group(1)} {expand(found.group(2))}", text
    )

    return {'text': updated_text, 'extracted_units': extracted_units}


In [277]:
# Translate a unit spelling through the per-entity abbreviation table
def normalize_unit(unit, entity_name):
    """Look up ``unit`` (lower-cased) in ``unit_abbreviation_map[entity_name]``.

    Returns the mapped value when the lower-cased unit is a key of that table,
    otherwise ``unit`` exactly as it was passed in. An ``entity_name`` missing
    from ``unit_abbreviation_map`` raises ``KeyError``.
    """
    abbreviations = unit_abbreviation_map[entity_name]
    lookup_key = unit.lower()
    return abbreviations.get(lookup_key, unit)

# Function to convert input to standardized form
def convert_to_standard_form(input_str, entity_name=None):
    """Rewrite a leading ``<number><unit>`` as ``"<number> <normalized unit>"``.

    Parameters
    ----------
    input_str : str
        Text to standardise. Only a number/unit pair at the very start of the
        string is recognised (``re.match``).
    entity_name : str, optional
        Entity whose abbreviation table ``normalize_unit`` should use. When
        omitted, the notebook-level ``entity_name`` variable is used if it
        exists (the behaviour of the old one-argument signature). If neither
        is available the unit is left as written.

    Returns
    -------
    str
        The standardised string, or ``input_str`` unchanged when it does not
        start with a number followed by letters.
    """
    if entity_name is None:
        # Legacy one-argument calls relied on the global; keep them working.
        entity_name = globals().get("entity_name")

    # Number and unit, with or without whitespace between them.
    match = re.match(r"([0-9.]+)\s*([a-zA-Z]+)", input_str)
    if not match:
        return input_str

    value, unit = match.groups()
    if entity_name is not None:
        unit = normalize_unit(unit, entity_name)

    return f"{value} {unit}"

In [278]:
def is_unit_in_list(input_str, unit_list, entity_name=None):
    """Tell whether the unit of a leading ``<number> <unit>`` is in ``unit_list``.

    Parameters
    ----------
    input_str : str
        Text expected to start with a number followed by a unit. The unit may
        span several words (letters and whitespace).
    unit_list : container of str
        Accepted unit names.
    entity_name : str, optional
        When given, the unit is first passed through
        ``normalize_unit(unit, entity_name)``. When omitted, no abbreviation
        table is consulted and the lower-cased unit is compared directly.

    Returns
    -------
    bool
        ``False`` when ``input_str`` does not start with a number and unit.
    """
    # The number is captured only to anchor the match; its value is not used.
    match = re.match(r"([0-9.]+)\s*([a-zA-Z\s]+)", input_str)
    if not match:
        return False

    unit = match.group(2).strip()
    if entity_name is None:
        # normalize_unit needs an entity; without one, compare case-insensitively.
        normalized_unit = unit.lower()
    else:
        normalized_unit = normalize_unit(unit, entity_name)

    return normalized_unit in unit_list

In [279]:
def isin(string, words):
    """Return ``True`` when at least one of ``words`` is a substring of ``string``.

    The check is case-sensitive and purely textual (no word boundaries), so
    ``"kilowatt"`` contains ``"watt"``. An empty ``words`` collection yields
    ``False``; previously it produced an empty regex that matched everything.
    """
    return any(word in string for word in words)

In [280]:
def extract_and_format(text):
    """Pull the first ``<number><letters>`` pair out of ``text``.

    Returns
    -------
    tuple
        ``(number as float, unit string)`` for the first occurrence anywhere
        in ``text``, or ``(None, None)`` when there is none.
    """
    found = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', text)
    if found is None:
        return None, None

    amount, unit = found.groups()
    return float(amount), unit

In [281]:
# Run OCR on the selected image and preview the first five detections.
texts = ocr(img_link)
texts[:5]

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


[([[107, 183], [2336, 183], [2336, 393], [107, 393]],
  'KOMFORT-PAKET 4a',
  0.35431388707142747),
 ([[2140, 1676], [2160, 1676], [2160, 1755], [2140, 1755]],
  '1',
  0.7760143631954044),
 ([[842, 1756], [1742, 1756], [1742, 1878], [842, 1878]],
  'DAS HERZSTUCK :',
  0.9928206878891216),
 ([[115, 1897], [423, 1897], [423, 1963], [115, 1963]],
  'STARKER',
  0.9999500315210897),
 ([[442, 1888], [2454, 1888], [2454, 1964], [442, 1964]],
  'UND EFFIZIENTER MOTOR FiR ANGENEHMEN FAHRKOMFORT',
  0.7560478897506805)]

In [282]:
# Convert the detections to {'text', 'pred'} records (splitting on '/') and preview five.
texts_dict = clean1(texts)
texts_dict[:5]

[{'text': 'KOMFORT-PAKET 4a', 'pred': 0.35431388707142747},
 {'text': '1', 'pred': 0.7760143631954044},
 {'text': 'DAS HERZSTUCK :', 'pred': 0.9928206878891216},
 {'text': 'STARKER', 'pred': 0.9999500315210897},
 {'text': 'UND EFFIZIENTER MOTOR FiR ANGENEHMEN FAHRKOMFORT',
  'pred': 0.7560478897506805}]

In [283]:
# Units that constants.entity_unit_map lists for the current entity.
cons = list(constants.entity_unit_map[entity_name])
cons

['kilowatt', 'watt']

In [284]:
texts_dict

[{'text': 'KOMFORT-PAKET 4a', 'pred': 0.35431388707142747},
 {'text': '1', 'pred': 0.7760143631954044},
 {'text': 'DAS HERZSTUCK :', 'pred': 0.9928206878891216},
 {'text': 'STARKER', 'pred': 0.9999500315210897},
 {'text': 'UND EFFIZIENTER MOTOR FiR ANGENEHMEN FAHRKOMFORT',
  'pred': 0.7560478897506805},
 {'text': 'KRAFTVOLLE UND EFFIZIENTE', 'pred': 0.8767702961318427},
 {'text': 'EXTREM LEISER UND', 'pred': 0.9145094799818458},
 {'text': 'UNGLAUBLICHES DREHMOMENT', 'pred': 0.999008231371281},
 {'text': 'UNTERSTUTZUNG BIS', 'pred': 0.9994943754635573},
 {'text': 'LEISTUNGSSTARKER MOTOR MIT', 'pred': 0.9499376126418606},
 {'text': 'VON', 'pred': 0.9997288323954827},
 {'text': '250 W', 'pred': 0.9917589681491749},
 {'text': '45 NM', 'pred': 0.9984150717870629},
 {'text': '0', 'pred': 0.26312357600830083},
 {'text': '25 KM', 'pred': 0.669536166297791},
 {'text': 'H', 'pred': 0.669536166297791}]

In [285]:
# Print each record on its own line.
for sub_text in texts_dict:
    print(sub_text)

{'text': 'KOMFORT-PAKET 4a', 'pred': 0.35431388707142747}
{'text': '1', 'pred': 0.7760143631954044}
{'text': 'DAS HERZSTUCK :', 'pred': 0.9928206878891216}
{'text': 'STARKER', 'pred': 0.9999500315210897}
{'text': 'UND EFFIZIENTER MOTOR FiR ANGENEHMEN FAHRKOMFORT', 'pred': 0.7560478897506805}
{'text': 'KRAFTVOLLE UND EFFIZIENTE', 'pred': 0.8767702961318427}
{'text': 'EXTREM LEISER UND', 'pred': 0.9145094799818458}
{'text': 'UNGLAUBLICHES DREHMOMENT', 'pred': 0.999008231371281}
{'text': 'UNTERSTUTZUNG BIS', 'pred': 0.9994943754635573}
{'text': 'LEISTUNGSSTARKER MOTOR MIT', 'pred': 0.9499376126418606}
{'text': 'VON', 'pred': 0.9997288323954827}
{'text': '250 W', 'pred': 0.9917589681491749}
{'text': '45 NM', 'pred': 0.9984150717870629}
{'text': '0', 'pred': 0.26312357600830083}
{'text': '25 KM', 'pred': 0.669536166297791}
{'text': 'H', 'pred': 0.669536166297791}


In [286]:
# For every detection, pull out "<number> <unit>" mentions using the current
# entity's abbreviation table and add them to texts_dict as extra candidates.
units_for_entity = constants.unit_abbreviation_map[entity_name]
new = []

for sub_text in texts_dict:
    result = extract_and_replace_units(sub_text['text'], units_for_entity)
    for stuff in result['extracted_units']:
        # str(float(...)) formats integers and decimals alike ("250" -> "250.0",
        # "2.50" -> "2.5"), which is the style of the ground-truth values.
        value_text = str(float(stuff['metric']))
        new.append({'text': f"{value_text} {stuff['unit']}", 'pred': sub_text['pred']})

# Skip candidates that are already present so that re-running this cell does
# not keep appending the same records to texts_dict.
for n in new:
    if n not in texts_dict:
        texts_dict.append(n)

In [287]:
texts_dict

[{'text': 'KOMFORT-PAKET 4a', 'pred': 0.35431388707142747},
 {'text': '1', 'pred': 0.7760143631954044},
 {'text': 'DAS HERZSTUCK :', 'pred': 0.9928206878891216},
 {'text': 'STARKER', 'pred': 0.9999500315210897},
 {'text': 'UND EFFIZIENTER MOTOR FiR ANGENEHMEN FAHRKOMFORT',
  'pred': 0.7560478897506805},
 {'text': 'KRAFTVOLLE UND EFFIZIENTE', 'pred': 0.8767702961318427},
 {'text': 'EXTREM LEISER UND', 'pred': 0.9145094799818458},
 {'text': 'UNGLAUBLICHES DREHMOMENT', 'pred': 0.999008231371281},
 {'text': 'UNTERSTUTZUNG BIS', 'pred': 0.9994943754635573},
 {'text': 'LEISTUNGSSTARKER MOTOR MIT', 'pred': 0.9499376126418606},
 {'text': 'VON', 'pred': 0.9997288323954827},
 {'text': '250 W', 'pred': 0.9917589681491749},
 {'text': '45 NM', 'pred': 0.9984150717870629},
 {'text': '0', 'pred': 0.26312357600830083},
 {'text': '25 KM', 'pred': 0.669536166297791},
 {'text': 'H', 'pred': 0.669536166297791},
 {'text': '250.0 watt', 'pred': 0.9917589681491749}]

In [288]:
# Keep the candidates whose standardised text mentions one of the entity's
# units, then pick the one the OCR was most confident about.
cons = list(constants.entity_unit_map[entity_name])
prob = []
for row in texts_dict:
    # Empty strings (e.g. from splitting a trailing '/') cannot be candidates.
    if not row['text']:
        continue
    normalised = convert_to_standard_form(row['text'], entity_name)
    if isin(normalised, cons):
        print(normalised)
        prob.append([normalised, row['pred']])

# max() returns the first candidate on ties; no candidates means an empty prediction.
prediction = max(prob, key=lambda item: item[1])[0] if prob else ""
print(prediction)


250 watt
250.0 watt
250 watt


In [289]:
prediction

'250 watt'

In [290]:
# Re-format the winning candidate as "<float> <unit>".
numeric_part, unit = extract_and_format(prediction)

# extract_and_format returns (None, None) for an empty or unparsable
# prediction; emit an empty string then instead of the literal "None None".
formatted_text = f"{numeric_part} {unit}" if numeric_part is not None else ""

# Print results
print(f"Numeric Part: {numeric_part}")
print(f"Unit: {unit}")
print(f"Formatted Text: {formatted_text}")

print(f"\nFinal: {formatted_text}")

Numeric Part: 250.0
Unit: watt
Formatted Text: 250.0 watt

Final: 250.0 watt


In [291]:
formatted_text, actual_pred

('250.0 watt', '250.0 watt')

In [292]:
formatted_text == actual_pred

True