In [1]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Define the dataset folder path
DATASET_FOLDER = './dataset/'

# Load the datasets
train_df = pd.read_csv(f'{DATASET_FOLDER}/train.csv')
test_df = pd.read_csv(f'{DATASET_FOLDER}/test.csv')
sample_test_df = pd.read_csv(f'{DATASET_FOLDER}/sample_test.csv')

# Display the first few rows of each dataset
print("Train Dataset:")
print(train_df.head())
print("\nTest Dataset:")
print(test_df.head())
print("\nSample Test Dataset:")
print(sample_test_df.head())

Train Dataset:
                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value  
0      500.0 gram  
1         1.0 cup  
2      0.709 gram  
3      0.709 gram  
4  1400 milligram  

Test Dataset:
   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  

## Function to extract text from images

In [3]:
def extract_text_from_image(image_url):
    try:
        # Fetch the image from the URL
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # Raise an error if the request failed
        
        # Open the image using PIL
        im = Image.open(BytesIO(response.content))
        
        # Convert the image to grayscale (optional, helps OCR in some cases)
        im = im.convert('L')
        
        # Extract text from the image using pytesseract
        text = pytesseract.image_to_string(im, config='--psm 6')  # Adjust config to improve text extraction
#         if text.strip() == "":
#             print(f"No text extracted from {image_url}")
        return text
    except requests.exceptions.RequestException as req_err:
        print(f"Failed to fetch image from URL {image_url}: {req_err}")
        return ""  # Return empty string on request error
    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return ""  # Return empty string on other errors

In [4]:
# Function to process dataset in parallel
def extract_text_from_dataset_parallel(df, image_column, num_threads=8):
    # Create a new column 'extracted_text' to store the extracted text
    df['extracted_text'] = None
    
    # Function to process a row and extract the text
    def process_row(row):
        return extract_text_from_image(getattr(row, image_column))  # Access attribute by name
    
    # Using ThreadPoolExecutor for parallel execution
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Apply the processing function to each row in parallel
        extracted_texts = list(tqdm(executor.map(process_row, df.itertuples(), chunksize=100), total=len(df)))

    # Update the DataFrame with the extracted text
    df['extracted_text'] = extracted_texts
    return df

In [5]:
# Apply tqdm for progress tracking
tqdm.pandas()

In [None]:
test_df = extract_text_from_dataset_parallel(test_df, 'image_link', num_threads=8)


In [None]:
# Save the updated DataFrames to new CSV files
test_df.to_csv('test_with_extracted_text.csv', index=False)


# Extract numeric values and units 

In [5]:
test_with_extracted_text_df = pd.read_csv('test_with_extracted_text.csv')

  test_with_extracted_text_df = pd.read_csv('test_with_extracted_text.csv')


In [6]:
test_with_extracted_text_df

Unnamed: 0,index,image_link,group_id,entity_name,extracted_text
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,oo _ | 2\n
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,"Size Width Length\nOne Size 42cm/16.54"" 200cm/..."
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,"Size Width Length\nOne Size 42cm/16.54"" 200cm/..."
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,"Size Width Length\nOne Size 42cm/16.54"" 200cm/..."
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,"Size Width Length\nOne Size 10.50em/4.13"" 90cm..."
...,...,...,...,...,...
131182,131283,https://m.media-amazon.com/images/I/A1rVsIzEtk...,721522,maximum_weight_recommendation,
131183,131284,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,item_weight,
131184,131285,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,maximum_weight_recommendation,
131185,131286,https://m.media-amazon.com/images/I/A1tnTUPyr7...,853009,item_weight,


In [9]:
import re
from src.constants import entity_unit_map

def extract_entity_value_and_unit(entity_name, text):
    if not isinstance(text, str):
        return ""
    
    text = text.lower()

    # Patterns for different entity types
    patterns = {
        'width': r'(?:width|w|wide)?\s*(\d+(?:\.\d+)?)\s*(cm|centimetre|foot|ft|inch|in|metre|m|millimetre|mm|yard|yd)',
        'depth': r'(?:depth|d|deep)?\s*(\d+(?:\.\d+)?)\s*(cm|centimetre|foot|ft|inch|in|metre|m|millimetre|mm|yard|yd)',
        'height': r'(?:height|h|tall)?\s*(\d+(?:\.\d+)?)\s*(cm|centimetre|foot|ft|inch|in|metre|m|millimetre|mm|yard|yd)',
        'item_weight': r'(?:weight|wt)?\s*(\d+(?:\.\d+)?)\s*(g|gram|kg|kilogram|mcg|microgram|mg|milligram|oz|ounce|lb|pound|ton)',
        'maximum_weight_recommendation': r'(?:max|maximum|weight)?\s*(\d+(?:\.\d+)?)\s*(g|gram|kg|kilogram|mcg|microgram|mg|milligram|oz|ounce|lb|pound|ton)',
        'voltage': r'(\d+(?:\.\d+)?)\s*(kv|kilovolt|mv|millivolt|v|volt)',
        'wattage': r'(\d+(?:\.\d+)?)\s*(kw|kilowatt|w|watt)',
        'item_volume': r'(\d+(?:\.\d+)?)\s*(cl|centilitre|cu ft|cubic foot|cu in|cubic inch|dl|decilitre|fl oz|fluid ounce|gal|gallon|imp gal|imperial gallon|l|litre|ml|millilitre|pt|pint|qt|quart)'
    }
    
    # General fallback pattern
    pattern = patterns.get(entity_name, r'(\d+(?:\.\d+)?)\s*(\w+)')
    matches = re.findall(pattern, text)

    if matches:
        # Sort matches by value to get the largest one
        matches.sort(key=lambda x: float(x[0]), reverse=True)
        value, unit = matches[0]
        
        # Map abbreviated units to full names
        unit_mapping = {
            'cm': 'centimetre', 'ft': 'foot', 'in': 'inch', 'm': 'metre', 'mm': 'millimetre', 'yd': 'yard',
            'g': 'gram', 'kg': 'kilogram', 'mcg': 'microgram', 'mg': 'milligram', 'oz': 'ounce', 'lb': 'pound',
            'kv': 'kilovolt', 'mv': 'millivolt', 'v': 'volt',
            'kw': 'kilowatt', 'w': 'watt',
            'cl': 'centilitre', 'cu ft': 'cubic foot', 'cu in': 'cubic inch', 'dl': 'decilitre', 
            'fl oz': 'fluid ounce', 'gal': 'gallon', 'imp gal': 'imperial gallon', 'l': 'litre', 
            'ml': 'millilitre', 'pt': 'pint', 'qt': 'quart'
        }

        full_unit = unit_mapping.get(unit, unit)

        # Check if unit matches allowed entity units
        if full_unit in entity_unit_map[entity_name]:
            return f"{float(value):.2f} {full_unit}"

    # Handling positional cases (e.g., width first, height second)
    if entity_name in ['width', 'height', 'depth']:
        dimensions = re.findall(r'(\d+(?:\.\d+)?)\s*(cm|inch|mm|metre|foot|yard)', text)
        if dimensions:
            if entity_name == 'width' and len(dimensions) > 0:
                return f"{float(dimensions[0][0]):.2f} {unit_mapping.get(dimensions[0][1], dimensions[0][1])}"
            elif entity_name == 'height' and len(dimensions) > 1:
                return f"{float(dimensions[1][0]):.2f} {unit_mapping.get(dimensions[1][1], dimensions[1][1])}"
            elif entity_name == 'depth' and len(dimensions) > 2:
                return f"{float(dimensions[2][0]):.2f} {unit_mapping.get(dimensions[2][1], dimensions[2][1])}"

    return ""


In [10]:
# Apply this function to the DataFrame
test_with_extracted_text_df['entity_value'] = test_with_extracted_text_df.apply(
    lambda row: extract_entity_value_and_unit(row['entity_name'], row['extracted_text']),
    axis=1
)

In [12]:
# Save the updated DataFrame to a new CSV file
test_with_extracted_text_df.to_csv('test_with_entity_values.csv', index=False)

In [13]:
# Load the processed DataFrame with extracted entity values
processed_df = pd.read_csv('test_with_entity_values.csv')

#  Ensure that the output follows the format "x unit" and matches the test dataset indices
output_df = processed_df[['index', 'entity_value']].rename(columns={'entity_value': 'prediction'})


  processed_df = pd.read_csv('test_with_entity_values.csv')


In [14]:
#  Handle any missing values in the predictions column (fill empty predictions with "")
output_df['prediction'].fillna("", inplace=True)


In [15]:
#  Save the output file in the correct format for submission
output_filename = 'test_out.csv'
output_df.to_csv(output_filename, index=False)


In [16]:
import os
import subprocess


test_filename = r'C:\Users\DELL\Downloads\66e31d6ee96cd_student_resource_3\student_resource 3\dataset\test.csv'
output_filename = r'C:\Users\DELL\Downloads\66e31d6ee96cd_student_resource_3\student_resource 3\test_out.csv'

# Check if the test.csv file exists
if os.path.exists(test_filename):
    # Run the sanity checker using subprocess to capture output
    command = f'python src/sanity.py --test_filename "{test_filename}" --output_filename "{output_filename}"'
    process = subprocess.run(command, shell=True, capture_output=True, text=True)

    # Print the output and any error messages
    print("Sanity Checker Output:", process.stdout)
    print("Sanity Checker Errors:", process.stderr)
    print("Return Code:", process.returncode)
else:
    print(f"Error: File {test_filename} not found.")


Sanity Checker Output: Parsing successfull for file: C:\Users\DELL\Downloads\66e31d6ee96cd_student_resource_3\student_resource 3\test_out.csv

Sanity Checker Errors: 
Return Code: 0
