In [None]:
!pip install paddlepaddle-gpu==2.6.2 -f https://www.paddlepaddle.org.cn/whl/cu118.html
!pip install paddleocr
!pip install tqdm

In [None]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from paddleocr import PaddleOCR
from tqdm import tqdm
import csv

# Build the OCR engine once for the whole notebook: English recognition model
# with the text-angle classifier enabled.
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# Make sure the working folders exist before anything is written to them.
for _folder in ('/content/download', '/content/processed'):
    os.makedirs(_folder, exist_ok=True)

def download_images(image_links, batch_number, timeout=30):
    """Download images and save them as JPEG files under /content/download.

    Args:
        image_links: Iterable of image URLs.
        batch_number: Zero-based batch index; together with the position of the
            link it determines the file name
            (``image_{batch_number*40 + position + 1}.jpg``).
        timeout: Seconds to wait for the server before giving up on a link.
            Without a timeout a single stalled connection blocks the whole run.

    Returns:
        List of paths of the images that were saved successfully. Links that
        fail are reported on stdout and skipped, so the list may be shorter
        than ``image_links``.
    """
    image_paths = []
    for i, link in enumerate(image_links):
        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # JPEG cannot store palette or alpha modes (P, RGBA, LA, ...);
            # saving them raises OSError and the image would be reported as a
            # failed download. Convert those to RGB first.
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')
            img_path = f'/content/download/image_{batch_number*40 + i + 1}.jpg'
            img.save(img_path)
            image_paths.append(img_path)
        except Exception as e:
            # Best effort: one bad link must not abort the batch.
            print(f"Failed to download {link}: {e}")
    return image_paths

def process_images():
    """Run OCR over every image listed in /content/test.csv and write a CSV.

    Reads ``image_link``, ``group_id`` and ``entity_name`` from the input file
    and writes one output row per input row to /content/processed_image.csv
    with the recognised text in ``text_gainedfromocr``. Rows whose image cannot
    be downloaded or whose OCR fails get an empty text, so the output stays
    aligned with the input.
    """
    # Read the CSV file
    train_df = pd.read_csv('/content/test.csv')
    num_batches = (len(train_df) + 39) // 40

    with open('/content/processed_image.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['image_link', 'group_id', 'entity_name', 'text_gainedfromocr'])

        for batch_number in tqdm(range(num_batches), desc="Processing Batches"):
            # Rows belonging to the current batch
            start_idx = batch_number * 40
            end_idx = min(start_idx + 40, len(train_df))
            batch_df = train_df.iloc[start_idx:end_idx]

            rows = zip(batch_df['image_link'], batch_df['group_id'], batch_df['entity_name'])
            for image_link, group_id, entity_name in rows:
                text = ""

                # Download one link at a time. download_images() drops failed
                # links from its result, so pairing a whole batch of returned
                # paths with batch rows by position attributes OCR text to the
                # wrong row as soon as one download fails.
                for img_path in download_images([image_link], batch_number):
                    try:
                        result = ocr.ocr(img_path, cls=True)
                        # The first element is None/empty when no text is found.
                        if result and result[0]:
                            text = ' '.join(line[1][0] for line in result[0])
                    except Exception as e:
                        print(f"Error during OCR processing for image {img_path}: {e}")
                        text = ""
                    finally:
                        # Remove the image after processing, even on failure
                        os.remove(img_path)

                writer.writerow([image_link, group_id, entity_name, text])

# Entry point: run the download + OCR pipeline when this code is executed as
# the main module rather than imported.
if __name__ == '__main__':
    process_images()


In [None]:
import re
import pandas as pd

# Unit conversion table. Lengths are expressed in centimetres, weights in
# grams, and wattage is left as-is. Keys are lower-case; matching against OCR
# text is case-insensitive.
unit_conversion = {
    'cm': 1,
    'centimetre': 1,
    'centimeter': 1,
    'mm': 0.1,
    'millimeter': 0.1,
    'millimetre': 0.1,
    'inch': 2.54,
    'in': 2.54,
    'foot': 30.48,
    'ft': 30.48,
    'yard': 91.44,
    "'": 2.54,  # sometimes inches are written as single quotes
    'm': 100,
    'metre': 100,
    'meter': 100,
    'mg': 0.001,
    'milligram': 0.001,
    'g': 1,
    'gram': 1,
    'kg': 1000,
    'kilogram': 1000,
    'lb': 453.59237,
    'lbs': 453.59237,
    'pound': 453.59237,
    'oz': 28.3495,
    'ounce': 28.3495,
    'w': 1,  # Wattage doesn't need conversion
    'watt': 1
}

# Regular expression for "<number> <unit>". The alternation is generated from
# the conversion table so the two cannot drift apart, and it is ordered
# longest-first: regex alternation takes the first alternative that matches, so
# a short unit listed early ('m') would otherwise swallow the start of a longer
# one ('millimeter', 'mg') and the value would be converted with the wrong
# factor.
# NOTE(review): there is still no word boundary after the unit, so text such as
# "5 minutes" or "12 ml" is read as metres; tightening that would also reject
# run-together OCR text like "5cmx10cm", so it is left as is.
_unit_alternation = '|'.join(
    re.escape(unit) for unit in sorted(unit_conversion, key=len, reverse=True)
)
value_unit_pattern = re.compile(r'(\d*\.?\d+)\s*(' + _unit_alternation + r')', re.IGNORECASE)

# Units allowed in the output for each entity type
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint",
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Convert a (value, unit) pair to the base unit of the conversion table
def convert_to_cm_or_gram(value, unit):
    """Return ``value`` scaled by the table factor for ``unit``.

    The unit is matched case-insensitively after trimming whitespace. Returns
    None when the unit is not in ``unit_conversion`` or when ``value`` is not a
    parseable number (e.g. a composite string like '240 V 16 A').
    """
    factor = unit_conversion.get(unit.lower().strip())
    if factor is None:
        return None
    try:
        return float(value) * factor
    except ValueError:
        return None

# Derive wattage from a voltage and a current reading
def convert_wattage(volts, amps):
    """Return volts * amps as a float, or None if either is not numeric text."""
    try:
        voltage, current = float(volts), float(amps)
    except ValueError:
        return None
    return voltage * current

# Function to extract and rank measurements while keeping original values and units
def extract_dimensions(text):
    """Pick height, width, depth and weight candidates from OCR text.

    Every "<number> <unit>" occurrence matched by ``value_unit_pattern`` is
    converted with ``convert_to_cm_or_gram`` and the occurrences are ranked by
    converted value, largest first. The four slots are filled in that order;
    when fewer than four measurements are found, the smallest one is repeated
    for the remaining slots.

    Args:
        text: OCR text; anything that is not a non-empty string yields no result.

    Returns:
        A 4-tuple ``(height, width, depth, weight)``. Each element is the
        original ``(value_string, unit_string)`` pair as it appeared in the
        text, or all four are None when nothing usable was found.

    Notes:
        A wattage derived from a volt/amp pair is deliberately not ranked here.
        It has no original (value, unit) pair in the text, so it could only
        occupy a slot and blank it out; process_csv derives wattage from the
        raw text on its own.
    """
    if not text or not isinstance(text, str):
        return None, None, None, None  # Handle null or non-string values

    # Keep each converted value next to the original pair it came from, so no
    # float-equality lookup is needed afterwards.
    ranked = []
    for value, unit in value_unit_pattern.findall(text):
        converted_value = convert_to_cm_or_gram(value, unit)
        if converted_value is not None:
            ranked.append((converted_value, (value, unit)))

    if not ranked:
        return None, None, None, None

    # Largest first. The sort is stable, so equal values keep their text order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # Take the top four and pad with the smallest measurement. The 3-value case
    # used to fill the weight slot with the largest value, unlike the 2- and
    # 4-value cases; padding makes all cases consistent.
    originals = [original for _, original in ranked[:4]]
    originals += [originals[-1]] * (4 - len(originals))

    height_original, width_original, depth_original, weight_original = originals
    return height_original, width_original, depth_original, weight_original

# Spellings and abbreviations found in OCR text -> canonical unit names used in
# entity_unit_map. The single quote follows unit_conversion, which treats it as
# inches.
_UNIT_ALIASES = {
    'cm': 'centimetre', 'centimeter': 'centimetre', 'centimetre': 'centimetre',
    'mm': 'millimetre', 'millimeter': 'millimetre', 'millimetre': 'millimetre',
    'm': 'metre', 'meter': 'metre', 'metre': 'metre',
    'in': 'inch', 'inch': 'inch', "'": 'inch',
    'ft': 'foot', 'foot': 'foot',
    'yard': 'yard',
    'mg': 'milligram', 'milligram': 'milligram',
    'g': 'gram', 'gram': 'gram',
    'kg': 'kilogram', 'kilogram': 'kilogram',
    'lb': 'pound', 'lbs': 'pound', 'pound': 'pound',
    'oz': 'ounce', 'ounce': 'ounce',
    'w': 'watt', 'watt': 'watt',
}

# Function to map the extracted unit to a unit allowed for the entity
def map_to_entity_unit(entity_name, original_unit):
    """Return the output unit name for ``original_unit`` under ``entity_name``.

    The unit is returned unchanged if it is already allowed for the entity.
    Otherwise it is normalised (trimmed, lower-cased, abbreviation expanded)
    and returned in canonical form if that is allowed. Without this step an
    abbreviation such as 'cm' or 'KG' never matches the full names in
    ``entity_unit_map`` and the value is relabelled with an arbitrary unit.

    If the unit still does not fit the entity, the alphabetically first allowed
    unit is returned (deterministic, unlike iterating a set of strings). For an
    entity with no allowed units, ``original_unit`` is returned as is.
    """
    allowed_units = entity_unit_map.get(entity_name, set())
    if original_unit in allowed_units:
        return original_unit

    normalized = str(original_unit).strip().lower()
    canonical = _UNIT_ALIASES.get(normalized, normalized)
    if canonical in allowed_units:
        return canonical

    if allowed_units:
        # NOTE(review): this fallback relabels the value without converting it;
        # kept for compatibility with existing callers.
        return sorted(allowed_units)[0]
    return original_unit

# Function to process the CSV and extract the required information
def process_csv(file_path):
    """Extract a value and unit for each row's entity and save the result.

    Reads the CSV at ``file_path`` (expects ``entity_name`` and
    ``text_gainedfromocr`` columns), adds ``newcol`` (numeric value) and
    ``unit_col`` (unit name) and writes everything to 'processed_output.csv'
    in the current working directory.

    * height / width / depth / item_weight take the matching slot returned by
      ``extract_dimensions``.
    * wattage uses the first explicit "<number> w|watt" in the text, otherwise
      volts * amps when a volt/amp pair is present.
    * Rows for other entities, or without a usable match, keep None in both
      columns.

    Returns:
        The output file name.
    """
    # Load the CSV into a pandas dataframe
    df = pd.read_csv(file_path)

    # Create empty columns for the new results
    df['newcol'] = None
    df['unit_col'] = None

    # Position of each entity in the tuple returned by extract_dimensions
    slot_by_entity = {'height': 0, 'width': 1, 'depth': 2, 'item_weight': 3}

    for index, row in df.iterrows():
        entity_name = row['entity_name']
        ocr_text = row.get('text_gainedfromocr')

        if entity_name in slot_by_entity:
            original = extract_dimensions(ocr_text)[slot_by_entity[entity_name]]
            if original:
                df.at[index, 'newcol'] = float(original[0])
                df.at[index, 'unit_col'] = map_to_entity_unit(entity_name, original[1])

        elif entity_name == 'wattage':
            text = str(ocr_text)
            # Handle wattage directly from the regex for wattage
            wattage_matches = re.findall(r'(\d*\.?\d+)\s*(w|watt)', text, re.IGNORECASE)
            if wattage_matches:
                # Use the first match. The pattern only matches watts, so pass
                # the canonical name, as the volt/amp branch below does; the
                # raw token ('w', 'W') is not an allowed unit name and would be
                # replaced by an arbitrary one.
                df.at[index, 'newcol'] = float(wattage_matches[0][0])
                df.at[index, 'unit_col'] = map_to_entity_unit(entity_name, 'watt')
            else:
                # Handle volt and amp conversion
                volt_amp_matches = re.findall(r'(\d*\.?\d+)\s*(V|volt|volts|v)\s*[\D]*\s*(\d*\.?\d+)\s*(A|amp|amps|a)', text, re.IGNORECASE)
                if volt_amp_matches:
                    volts, _, amps, _ = volt_amp_matches[0]
                    wattage_value = convert_wattage(volts, amps)
                    if wattage_value is not None:
                        df.at[index, 'newcol'] = float(wattage_value)
                        df.at[index, 'unit_col'] = map_to_entity_unit(entity_name, 'watt')

    # Save the dataframe with the new columns to a new CSV file
    output_file = 'processed_output.csv'
    df.to_csv(output_file, index=False)
    return output_file

# Usage: run the extraction over the OCR results written by process_images().
csv_file = '/content/processed_image.csv'  # OCR output CSV
# process_csv() writes 'processed_output.csv' relative to the current working
# directory and returns that name.
output_file = process_csv(csv_file)
print(f'Processed data saved to: {output_file}')


In [None]:
# Quick sanity check of the extraction output (column dtypes and non-null counts).
# NOTE(review): process_csv() writes 'processed_output.csv' relative to the
# working directory, so this absolute path only resolves when the notebook runs
# from /content - confirm.
df=pd.read_csv('/content/processed_output.csv')
df.info()

In [1]:
import pandas as pd

def process_predictions(input_file, output_file):
    """Build "<value> <unit>" prediction strings and write them to a CSV.

    Args:
        input_file: CSV with ``newcol`` (numeric value) and ``unit_col`` (unit
            name) columns and, optionally, an ``index`` column of row ids.
        output_file: Destination CSV with columns ``index`` and ``prediction``.

    A row gets an empty prediction when its value or unit is missing or the
    value is not numeric. Values are written in plain decimal notation with up
    to six decimal places and no trailing zeros (``5``, ``12.5``, ``1234567``),
    never in exponent form. If the input has no ``index`` column, the
    dataframe's own row index is used instead.
    """
    # Load the input CSV file
    df = pd.read_csv(input_file)

    predictions = []
    for _, row in df.iterrows():
        value = row.get('newcol')
        unit = row.get('unit_col')

        prediction = ""
        if pd.notnull(value) and pd.notnull(unit):
            try:
                # '.6g' switches to scientific notation for large or small
                # magnitudes (1234567 -> '1.23457e+06') and rounds to six
                # significant digits; fixed-point with the padding zeros
                # removed keeps standard notation.
                formatted_value = f"{float(value):.6f}".rstrip('0').rstrip('.')
                # str() guards against a unit column that was parsed as numeric
                prediction = f"{formatted_value} {str(unit).strip()}"
            except (TypeError, ValueError):
                # If conversion to float fails, leave prediction as empty
                prediction = ""

        predictions.append(prediction)

    # Row identifiers: prefer the explicit 'index' column, fall back to the
    # positional index so inputs without that column do not raise KeyError.
    row_ids = df['index'].tolist() if 'index' in df.columns else df.index.tolist()

    output_df = pd.DataFrame({
        'index': row_ids,
        'prediction': predictions
    })

    # Save the output dataframe to a new CSV file
    output_df.to_csv(output_file, index=False)

# File paths
# NOTE(review): no cell shown here writes 'test_with_val_final.csv' (process_csv
# produces 'processed_output.csv'); presumably it was prepared separately and
# carries the 'index', 'newcol' and 'unit_col' columns - confirm before a
# fresh run.
input_file = 'test_with_val_final.csv'  # Input CSV file path
output_file = 'predictions_output.csv'  # Output CSV file path

# Process the input file and save the predictions
process_predictions(input_file, output_file)

print(f"Processed predictions saved to: {output_file}")


Processed predictions saved to: predictions_output.csv
