In [5]:
import re
import pandas as pd

# Units accepted for each entity_name. The position of a unit inside its list
# is also its priority: extract_and_normalize_units prefers the unit that
# appears earliest in the list when a text mentions several.
_LENGTH_UNITS = ("centimetre", "foot", "inch", "metre", "millimetre", "yard")
_WEIGHT_UNITS = ("kilogram", "gram", "milligram", "microgram", "ounce", "pound", "ton")

entity_unit_map = {
    # Every dimension entity gets its own copy of the length units.
    **{dimension: list(_LENGTH_UNITS) for dimension in ("width", "depth", "height")},
    # Both weight entities get their own copy of the weight units.
    **{
        weight_entity: list(_WEIGHT_UNITS)
        for weight_entity in ("item_weight", "maximum_weight_recommendation")
    },
    "voltage": ["volt", "millivolt", "kilovolt"],
    "wattage": ["watt", "kilowatt"],
    "item_volume": [
        "litre",
        "millilitre",
        "gallon",
        "cubic foot",
        "fluid ounce",
        "cup",
        "centilitre",
        "pint",
        "quart",
        "cubic inch",
    ],
}

# Spellings recognised directly after a number, grouped by the canonical unit
# name they normalise to (the canonical name itself is always recognised too).
# Built once at import time instead of on every call.
_UNIT_SPELLINGS = {
    # length
    'centimetre': ('cm', 'cms', 'centimetres', 'centimeter', 'centimeters'),
    'millimetre': ('mm', 'mms', 'millimetres', 'millimeter', 'millimeters'),
    'metre': ('m', 'metres', 'meter', 'meters'),
    'inch': ('in', '"', 'inches'),
    'foot': ('ft', 'feet'),
    'yard': ('yd', 'yards'),
    # weight
    'kilogram': ('kg', 'kgs', 'kilograms'),
    'gram': ('g', 'gm', 'gms', 'grams'),
    'milligram': ('mg', 'milligrams'),
    'microgram': ('mcg', 'micrograms'),
    'ounce': ('oz', 'ounces'),
    'pound': ('lb', 'lbs', 'pounds'),
    'ton': ('tons',),
    # electrical
    'volt': ('v', 'vac', 'vdc', 'volts'),
    'millivolt': ('mv', 'millivolts'),
    'kilovolt': ('kv', 'kilovolts'),
    'watt': ('w', 'watts'),
    'kilowatt': ('kw', 'kilowatts'),
    # volume
    'litre': ('l', 'ltr', 'litres', 'liter', 'liters'),
    'millilitre': ('ml', 'millilitres', 'milliliter', 'milliliters'),
    'centilitre': ('cl', 'centilitres', 'centiliter', 'centiliters'),
    'gallon': ('gal', 'gallons'),
    'fluid ounce': ('fl oz', 'fluid ounces'),
    'cup': ('cups',),
    'pint': ('pt', 'pints'),
    'quart': ('qt', 'quarts'),
    'cubic foot': ('cu ft', 'cubic feet'),
    'cubic inch': ('cu in', 'cubic inches'),
    # Recognised so the token is consumed as a unit; these only reach the
    # output if an entity in entity_unit_map lists them.
    'horsepower': ('hp',),
    'milliampere hour': ('mah',),
}

# Flat lookup: lower-cased spelling -> canonical unit name.
_UNIT_ALIASES = {
    spelling: canonical
    for canonical, spellings in _UNIT_SPELLINGS.items()
    for spelling in (canonical, *spellings)
}


def _alias_regex(alias):
    """Escape one alias, letting any run of whitespace separate its words."""
    return r'\s+'.join(re.escape(word) for word in alias.split())


# A number (optionally with a leading or trailing decimal part), optional
# whitespace, then a known unit spelling. Longer spellings are tried first and
# the trailing (?![a-z]) rejects tokens that merely *start* with a unit, so
# "5 gallons" is no longer read as "5 g" and "3 volumes" is not "3 v".
_QUANTITY_PATTERN = re.compile(
    r'(\d+\.?\d*|\.\d+)\s*('
    + '|'.join(_alias_regex(alias) for alias in sorted(_UNIT_ALIASES, key=len, reverse=True))
    + r')(?![a-z])',
    re.IGNORECASE,
)


# Function to normalize and prioritize units
def extract_and_normalize_units(entity_name, text):
    """Pick the best "<value> <unit>" pair for an entity out of free text.

    Every "<number><unit>" occurrence in ``text`` is normalised to a canonical
    unit name. Occurrences whose unit is not listed for ``entity_name`` in
    ``entity_unit_map`` are ignored. Among the rest, the one whose unit appears
    earliest in that list wins; on a tie the first occurrence in the text wins.

    Parameters
    ----------
    entity_name : str
        Key into ``entity_unit_map``. Unknown names have no allowed units.
    text : str or None
        Text to scan. ``None`` is treated as containing no quantities; any
        other non-string value is converted with ``str``.

    Returns
    -------
    str
        ``"<value> <unit>"`` with the value exactly as it appeared in the text
        and the canonical unit name, or the literal string ``"nan"`` when no
        allowed quantity is found.
    """
    allowed_units = entity_unit_map.get(entity_name, [])
    if text is None or not allowed_units:
        return "nan"

    # Lower rank == higher priority. setdefault keeps the first position if a
    # unit were ever listed twice, matching list.index semantics.
    rank = {}
    for position, unit in enumerate(allowed_units):
        rank.setdefault(unit, position)

    best = None  # (rank, value, canonical unit) of the preferred match so far
    for value, raw_unit in _QUANTITY_PATTERN.findall(str(text)):
        # Collapse internal whitespace so "cu   ft" looks up as "cu ft".
        spelling = ' '.join(raw_unit.lower().split())
        canonical = _UNIT_ALIASES.get(spelling, spelling)
        position = rank.get(canonical)
        if position is None:
            continue
        # Strict '<' keeps the earliest occurrence when priorities tie.
        if best is None or position < best[0]:
            best = (position, value, canonical)

    if best is None:
        return "nan"
    _, selected_value, selected_unit = best
    return f"{selected_value} {selected_unit}"

# Function to process CSV files
def process_csv_file(csv_file):
    """Read a predictions CSV and normalise its 'prediction' column.

    Parameters
    ----------
    csv_file : str, path-like or file-like
        Passed straight to ``pandas.read_csv``. The data must contain the
        columns 'index', 'entity_name' and 'prediction'.

    Returns
    -------
    pandas.DataFrame
        The 'index' column together with 'prediction', where each prediction
        has been replaced by the result of ``extract_and_normalize_units``
        applied to that row's entity name and the prediction converted with
        ``str``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Walk the two columns in lockstep instead of df.apply(axis=1). On a file
    # with a header but no rows, apply returns an empty DataFrame that cannot be
    # assigned to a single column; the comprehension simply yields an empty
    # list. It also avoids building a Series object for every row.
    df['prediction'] = [
        extract_and_normalize_units(entity_name, str(raw_prediction))
        for entity_name, raw_prediction in zip(df['entity_name'], df['prediction'])
    ]

    return df[['index', 'prediction']]




In [6]:
# Example usage: normalise the predictions stored in the CSV. process_csv_file
# returns only the 'index' and 'prediction' columns, so that is all `df` holds.
df = process_csv_file('final_final_output_with_predictions.csv')


In [7]:
# Write the normalised predictions to disk; index=False keeps the DataFrame's
# own row labels out of the file.
df.to_csv('last_submission.csv', index=False)

In [8]:
# Number of rows in the processed frame.
len(df)

131187