# **LIBRARIES**

In [None]:
pip install pytesseract

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [None]:
import re
import os
import cv2
import numpy as np
import pytesseract
from PIL import Image
import csv
import pandas as pd

# **DATASET**

In [None]:
# Load the test-set metadata into a pandas DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('Downloads/amazon_ml/student_resource 3/dataset/test.csv')  # later cells read the 'index' and 'entity_name' columns

In [None]:
# Inspect column names, dtypes and non-null counts of the loaded metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131187 entries, 0 to 131186
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        131187 non-null  int64 
 1   image_link   131187 non-null  object
 2   group_id     131187 non-null  int64 
 3   entity_name  131187 non-null  object
dtypes: int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
# Values of the 'index' column in row order; used further down to re-key the
# parsed results (which are keyed by image file name) onto dataset indices.
num_list = df['index'].tolist()

# **FOLDERS**

In [None]:
# Directory (relative path) containing the preprocessed images to OCR.
# Only files ending in ".jpg" are read, and they are expected to be named "{index}.jpg".
preprocessed_image_folder = "Downloads/amazon_test_preprocessed_images/amazon_test_preprocessed_images"

In [None]:
# Point pytesseract at the Tesseract executable.
# NOTE: this is a relative, Windows-style path, so it resolves against the
# current working directory — adjust it for your OS / install location.
pytesseract.pytesseract.tesseract_cmd = r'Downloads\Tesseract-OCR\tesseract.exe'

# **UNITS AND ENTITIES**

In [None]:
# Provided entity_unit_map: entity name -> set of canonical unit names that are
# valid for that entity. Every unit is a full, lowercase name because
# parse_entity_value matches these against lowercased text; abbreviations
# (e.g. "w"/"W") are expanded separately and do not belong here.
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon',
                    'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

In [None]:
# Dictionary to map common abbreviations to full unit names.
# Consumed by standardize_units(), which substitutes each key (as a whole word)
# with its value. Keys are case-sensitive, hence both 'w' and 'W'.
unit_abbreviation_map = {
    'w': 'watt',
    'W' : 'watt',
    'kw': 'kilowatt',
    'v': 'volt',
    'kv': 'kilovolt',
    'cm': 'centimetre',
    'mm': 'millimetre',
    'm': 'metre',
    'inch': 'inch',  # identity mapping: substituting it leaves the text unchanged
    'g': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    'Ib' : 'pound',  # capital "I" + "b" — presumably an OCR misreading of "lb"; TODO confirm
    'ft': 'foot',
    'yd': 'yard',
    # Add more abbreviations as needed
}

In [None]:
# Define the entity preference order (highest to lowest).
# NOTE(review): this list is not referenced by any other code visible in this
# notebook — confirm whether it is still needed.
preference_order = [
    'wattage',
    'voltage',
    'item_volume',
    'maximum_weight_recommendation',
    'item_weight',
    'height',
    'width',
    'depth'
]

In [None]:
# Flatten the per-entity unit sets into a single set holding every unit name
# that appears anywhere in entity_unit_map.
allowed_units = set().union(*entity_unit_map.values())

# **DICTIONARIES**

In [None]:
# Dictionary where key is the dataset `index` value and value is the
# `entity_name` expected for that row.
# Built column-wise: DataFrame.iterrows() materialises a Series for every row,
# which is needlessly slow for a frame of this size. If an index value were
# repeated, the last row wins (same as the row-by-row assignment it replaces).
index_entity_map = dict(zip(df['index'], df['entity_name']))

In [None]:
# List of the keys of index_entity_map (the dataset index values), in
# insertion order.
index_entity_map_keys = list(index_entity_map)

In [None]:
# Accumulator filled by the OCR loop below:
# image index (file-name stem, stored as a str) -> raw text returned by Tesseract.
extracted_data = {}

In [None]:
# Accumulator filled by the parsing loop below:
# index -> {entity_name: parsed "value unit" string, or "" when nothing matched}.
parsed_results = {}

In [None]:
# Rows of the final prediction CSV; each entry is appended as [index, prediction].
output_data_rows = []

In [None]:
# Collect the ".jpg" files of the image folder, ordered by the integer encoded
# in the file name (the part before the first dot, i.e. "{index}.jpg").
image_files = sorted(
    (entry for entry in os.listdir(preprocessed_image_folder) if entry.endswith(".jpg")),
    key=lambda name: int(name.split('.')[0]),
)

In [None]:
# Destination CSV for the raw OCR text (written via save_extracted_data_to_csv);
# relative to the current working directory.
extracted_data_file = 'extracted_data_file.csv'

# **FUNCTIONS**

In [None]:
def save_extracted_data_to_csv(extracted_data, output_file):
    """Dump the OCR results to a two-column, UTF-8 encoded CSV file.

    Args:
        extracted_data: mapping of index -> extracted text.
        output_file: path of the CSV to create; an existing file is overwritten.

    The file starts with the header row ``index,extracted_text`` followed by one
    row per mapping entry, in the mapping's iteration order.
    """
    header = ['index', 'extracted_text']
    with open(output_file, mode='w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header)
        csv_writer.writerows([key, value] for key, value in extracted_data.items())

In [None]:
# (pattern, replacement) pairs applied in order by clean_text to expand unit
# abbreviations into full unit names. Replacements carry a leading space; the
# surplus whitespace is collapsed at the end of clean_text.
_UNIT_ABBREVIATION_PATTERNS = (
    (r'\bkg\b', ' kilogram'),
    (r'\bg\b', ' gram'),
    (r'\bm\b', ' metre'),
    (r'\bcm\b', ' centimetre'),
    (r'\bmm\b', ' millimetre'),
    (r'\bin\b', ' inch'),
    (r'\bft\b', ' foot'),
    (r'\byard\b', ' yard'),
    (r'\bw\b', ' watt'),
    (r'\bkw\b', ' kilowatt'),
    (r'\bv\b', ' volt'),
    (r'\bmg\b', ' milligram'),
    (r'\blbs\b', ' pound'),
    (r'\b0m\b', ' centimetre'),  # presumably an OCR misreading of "cm" — TODO confirm
    (r'\bhz\b', ' hertz'),
    (r'\boz\b', ' ounce'),
)


def clean_text(text):
    """Normalise raw OCR text so that "<number> <unit>" pairs can be parsed.

    Processing order:
      1. lowercase the text;
      2. map typographic dashes to "-" and convert inch / foot marks that
         directly follow a digit into the words "inch" / "foot" (this has to
         happen before punctuation is stripped, otherwise the marks are lost);
      3. turn newlines and "/" into spaces;
      4. separate digits from directly attached letters ("66watt" -> "66 watt")
         and turn a comma between two digits into a decimal point;
      5. drop every character except letters, digits, ".", "-" and whitespace;
      6. rewrite ranges ("208-240 v", "5 to 10 kg") as "[208.0, 240.0] v";
      7. expand unit abbreviations and collapse repeated whitespace.

    Args:
        text: raw text as returned by the OCR engine.

    Returns:
        The cleaned, lowercased, single-spaced text.
    """
    # Step 1: Convert text to lowercase for uniformity
    text = text.lower()

    # Step 2a: Typographic dashes (hyphen, en/em dash, minus sign) would be
    # stripped in Step 5, silently merging "208–240" into "208240".
    text = re.sub(r'[\u2010-\u2015\u2212]', '-', text)

    # Step 2b: Inch marks (", '', curly/double-prime variants) after a digit.
    text = re.sub(r"""(?<=\d)(?:''|["\u201d\u2033])""", ' inch ', text)

    # Step 2c: A foot mark between two numbers (5'4") — without this the
    # apostrophe is stripped and the two numbers fuse into one ("54").
    text = re.sub(r"(?<=\d)['\u2019\u2032](?=\s*\d)", ' foot ', text)

    # Step 3: Replace common separators (/, \n, etc.) with spaces
    text = re.sub(r'[\n\r/]', ' ', text)

    # Step 4: Normalize spacing around numbers and units (e.g., '66watt' -> '66 watt')
    text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)

    # Decimal comma between digits -> decimal point
    text = re.sub(r'(?<=\d),(?=\d)', '.', text)

    # Step 5: Remove non-alphanumeric characters except dots, hyphens and whitespace
    text = re.sub(r'[^a-zA-Z0-9.\s-]', '', text)

    # Step 6: Detect ranges and standardize (e.g., '208-240 v' -> '[208.0, 240.0] v').
    # The separator is exactly "-" or "to" (not any mix of the characters
    # "-", "t", "o"), and both bounds may carry a decimal part so that
    # "1.5-2 kg" is treated as one range instead of being split mid-number.
    range_pattern = re.compile(
        r'(\d+(?:\.\d+)?)\s*(?:-|to)\s*(\d+(?:\.\d+)?)\s*([a-zA-Z]+)'
    )
    text = range_pattern.sub(
        lambda m: f"[{float(m.group(1))}, {float(m.group(2))}] {m.group(3)}", text
    )

    # Step 7: Handle abbreviations and units (e.g., "w" -> "watt", "g" -> "gram")
    for pattern, replacement in _UNIT_ABBREVIATION_PATTERNS:
        text = re.sub(pattern, replacement, text)

    # Step 8: Replace multiple spaces with a single space (standardize spacing)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
def standardize_units(text):
    """Replace unit abbreviations in ``text`` with their full unit names.

    Every key of ``unit_abbreviation_map`` is substituted, in the map's
    iteration order, by its value. An abbreviation is only replaced when it
    stands on its own: it must end at a word boundary and must not be preceded
    by a letter or underscore. A preceding digit is allowed, so compact forms
    such as "5kg" are expanded too (to "5kilogram").

    Args:
        text: text that may contain unit abbreviations (matching is case-sensitive).

    Returns:
        The text with all recognised abbreviations expanded.
    """
    for abbreviation, full_unit in unit_abbreviation_map.items():
        # (?<![^\W\d]) == "not preceded by a word character other than a digit".
        # re.escape keeps abbreviations containing regex metacharacters literal.
        pattern = rf'(?<![^\W\d]){re.escape(abbreviation)}\b'
        # A callable replacement inserts full_unit verbatim (no backslash processing).
        text = re.sub(pattern, lambda _match, unit=full_unit: unit, text)

    return text

In [None]:
def parse_entity_value(text, entity_name):
    """Find the first "<value> <unit>" occurrence valid for ``entity_name``.

    The text is lowercased and searched for either a single number
    ("220 volt", "9.5kilogram") or a bracketed range as produced by
    clean_text ("[208.0, 240.0] volt"), followed by at most one whitespace
    character and one of the units that ``entity_unit_map`` allows for the
    entity. The unit must be a complete word; a plural "s"/"es" suffix is
    tolerated but not included in the result, so "12 inches" yields "12 inch"
    while "0 tonight" is not mistaken for "0 ton".

    Args:
        text: cleaned text to search.
        entity_name: key of ``entity_unit_map`` selecting the allowed units.

    Returns:
        The matched "<value> <unit>" / "[a, b] <unit>" substring, or "" when
        the entity is unknown, has no units, or nothing matches.
    """
    allowed_units = entity_unit_map.get(entity_name, set())
    if not allowed_units:
        # An empty alternation would match any bare number; report "no value" instead.
        return ""

    # Sets have no stable iteration order: sort (longest first) so the
    # alternation — and therefore the result — is deterministic across runs.
    unit_alternation = '|'.join(
        re.escape(unit) for unit in sorted(allowed_units, key=lambda u: (-len(u), u))
    )
    number = r'\d+(?:\.\d+)?'
    pattern = (
        rf'((?:\[{number},\s?{number}\]|{number})\s?(?:{unit_alternation}))'
        r'(?:e?s)?\b'
    )

    match = re.search(pattern, text.lower())
    return match.group(1) if match else ""

In [None]:
# Parsing function for all entities
def parse_entities_from_text(text, entity_unit_map):
    """Run parse_entity_value on ``text`` once for every entity in the map.

    Args:
        text: cleaned text to search.
        entity_unit_map: mapping whose keys are the entity names to look for.

    Returns:
        dict of entity name -> whatever parse_entity_value returned for it,
        in the key order of ``entity_unit_map``.
    """
    return {name: parse_entity_value(text, name) for name in entity_unit_map}

# **LOOPS**

# **EXTRACT STRINGS**

In [None]:
# Iterate through each preprocessed image and extract its text with Tesseract.
# Results are stored in extracted_data under the file-name stem (a str).
for filename in image_files:
    index = filename.split(".")[0]  # Assuming the file naming format is index.jpg

    image_path = os.path.join(preprocessed_image_folder, filename)

    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes; otherwise one handle per image stays open until
    # garbage collection, which adds up over the whole image folder.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)

    extracted_data[index] = text
    print(f"Extracted text for {index}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracted text for 126187
Extracted text for 126188
Extracted text for 126189
Extracted text for 126190
Extracted text for 126191
Extracted text for 126192
Extracted text for 126193
Extracted text for 126194
Extracted text for 126195
Extracted text for 126196
Extracted text for 126197
Extracted text for 126198
Extracted text for 126199
Extracted text for 126200
Extracted text for 126201
Extracted text for 126202
Extracted text for 126203
Extracted text for 126204
Extracted text for 126205
Extracted text for 126206
Extracted text for 126207
Extracted text for 126208
Extracted text for 126209
Extracted text for 126210
Extracted text for 126211
Extracted text for 126212
Extracted text for 126213
Extracted text for 126214
Extracted text for 126215
Extracted text for 126216
Extracted text for 126217
Extracted text for 126218
Extracted text for 126219
Extracted text for 126220
Extracted text for 126221
Extracted text for 126222

In [None]:
# Sanity check: number of images for which text was extracted.
len(extracted_data)

131187

In [None]:
# Persist the raw OCR text (index, extracted_text) to extracted_data_file.
save_extracted_data_to_csv(extracted_data, extracted_data_file)

# **PARSE STRINGS**

In [None]:
# Requires extracted_data to be populated (OCR loop above).
# For every OCR'd text: clean it, parse a candidate value for each entity and
# store the resulting {entity_name: value} dict under the same index.
parsed_results.update(
    (index, parse_entities_from_text(clean_text(extracted_text), entity_unit_map))
    for index, extracted_text in extracted_data.items()
)

In [None]:
# Preview only the first few entries — displaying the whole dict (one entry per
# image) floods the notebook output and bloats the saved file.
dict(list(parsed_results.items())[:5])

{'0': {'width': '91.44 centimetre',
  'depth': '91.44 centimetre',
  'height': '91.44 centimetre',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'voltage': '',
  'wattage': '',
  'item_volume': ''},
 '1': {'width': '',
  'depth': '',
  'height': '',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'voltage': '',
  'wattage': '',
  'item_volume': ''},
 '2': {'width': '',
  'depth': '',
  'height': '',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'voltage': '',
  'wattage': '',
  'item_volume': ''},
 '3': {'width': '',
  'depth': '',
  'height': '',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'voltage': '',
  'wattage': '',
  'item_volume': ''},
 '4': {'width': '',
  'depth': '',
  'height': '',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'voltage': '',
  'wattage': '',
  'item_volume': ''},
 '5': {'width': '',
  'depth': '',
  'height': '',
  'item_weight': '',
  'maximum_weight_recommendation': '',
  'vol

In [None]:
# Sanity check: one parsed entry per extracted text is expected.
len(parsed_results)

131187

# **ADJUST INDICES**

In [None]:
# Re-key the parsed results by the dataset's `index` column.
# The pairing is purely positional: the i-th parsed entry (images sorted by the
# number in their file name) is assigned to the i-th value of num_list.
values = list(parsed_results.values())

# A missing or extra image would shift every later prediction onto the wrong
# index, so fail loudly instead of producing a silently misaligned mapping.
if len(values) != len(num_list):
    raise ValueError(
        f"Cannot align results with the dataset: {len(values)} parsed entries "
        f"but {len(num_list)} dataset rows."
    )

a_parsed_results = dict(zip(num_list, values))

# **CSV**

In [None]:
# Build the final [index, prediction] rows from a_parsed_results.
# The list is rebuilt from scratch so that re-running this cell does not append
# a second copy of every row.
output_data_rows = []

for index, entities in a_parsed_results.items():
    # Entity requested for this index (None when the index is unknown). A dict
    # lookup replaces the membership test against a list of all keys, which
    # scanned the whole list on every iteration.
    expected_entity = index_entity_map.get(int(index))

    # Use the parsed value for the expected entity; fall back to an empty
    # prediction when the entity is unknown, absent or parsed as "".
    final_prediction = entities.get(expected_entity) or ""

    # Add the index and final prediction to the output data
    output_data_rows.append([int(index), final_prediction])

In [None]:
# Preview the first rows only; the full list has one row per dataset entry.
output_data_rows[:10]

[[0, '91.44 centimetre'],
 [1, '42 centimetre'],
 [2, '42 centimetre'],
 [3, '42 centimetre'],
 [4, '10.50 centimetre'],
 [5, '10.50 centimetre'],
 [6, '10.50 centimetre'],
 [7, '3.56 centimetre'],
 [8, '40 centimetre'],
 [9, '40 centimetre'],
 [10, ''],
 [11, ''],
 [12, ''],
 [13, ''],
 [14, ''],
 [15, ''],
 [16, ''],
 [17, ''],
 [18, ''],
 [19, '208240 volt'],
 [20, ''],
 [21, ''],
 [22, ''],
 [23, '185 centimetre'],
 [24, '185 centimetre'],
 [25, ''],
 [26, ''],
 [27, ''],
 [28, '9.5 kilogram'],
 [29, ''],
 [30, '0 ton'],
 [31, ''],
 [32, ''],
 [33, ''],
 [34, ''],
 [35, ''],
 [36, '194.3 centimetre'],
 [37, ''],
 [38, ''],
 [39, '4 inch'],
 [40, '4 inch'],
 [41, ''],
 [42, '48 inch'],
 [43, '48 inch'],
 [44, '60 centimetre'],
 [45, '5.8 inch'],
 [46, '5.8 inch'],
 [47, '5.8 inch'],
 [48, '35 inch'],
 [49, '35 inch'],
 [50, '35 inch'],
 [51, ''],
 [52, ''],
 [53, '7.56 inch'],
 [54, '7.56 inch'],
 [55, '1276 metre'],
 [56, ''],
 [57, '5.4 inch'],
 [58, '5.4 inch'],
 [59, ''],
 [60, 

In [None]:
# Persist the predictions as a two-column CSV: a header row followed by one
# [index, prediction] row per entry of output_data_rows.
output_csv_file = 'output_pred.csv'

header_row = ['index', 'prediction']
with open(output_csv_file, mode='w', newline='') as csv_handle:
    csv.writer(csv_handle).writerows([header_row] + output_data_rows)

print(f"Output CSV file saved as {output_csv_file}.")