In [None]:
## Code to extract all text from images.

import pytesseract
from PIL import Image
import pandas as pd
import requests
from io import BytesIO
import os

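# Configure the path to the Tesseract executable if it is not already on PATH.
# NOTE: tesseract_cmd must point at the executable itself (tesseract.exe), not at its install folder.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\Ethan Hunt\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'  # Update this path if different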

def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Any exception raised while running OCR is reported on stdout and an
    empty string is returned instead, so a single bad image never aborts
    the caller's loop.
    """
    ocr_text = ""
    try:
        ocr_text = pytesseract.image_to_string(image)
    except Exception as err:
        print(f"Error processing image: {err}")
    return ocr_text

def download_image(image_url, timeout=30):
    """Download ``image_url`` and open the response body as a PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single unresponsive
            host would stall the whole batch.

    Returns:
        The image opened with ``Image.open``, or ``None`` if the request or
        the decoding failed (the error is printed).
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return Image.open(BytesIO(response.content))
    except Exception as e:
        print(f"Error downloading image from {image_url}: {e}")
        return None

def process_images_from_csv(input_csv, output_csv):
    """OCR every image listed in ``input_csv`` and write the text to ``output_csv``.

    The input CSV must contain an ``image_link`` column. The output keeps
    every input row and every input column and adds ``extracted_text``.

    Rows whose image could not be downloaded are kept with an empty
    ``extracted_text`` instead of being dropped: later steps add a positional
    index and look up other input columns (e.g. ``entity_name``), so the
    output has to stay aligned with the input.

    Args:
        input_csv: Path of the CSV holding the image URLs.
        output_csv: Path the result CSV is written to.
    """
    # Read CSV file containing image URLs
    df = pd.read_csv(input_csv)

    extracted_texts = []
    for image_url in df['image_link']:
        img = download_image(image_url)
        # download_image signals failure with None; test for that explicitly
        # rather than relying on the truthiness of the image object.
        extracted_texts.append(extract_text_from_image(img) if img is not None else "")

    # Work on a copy so the frame read from disk is left untouched.
    result_df = df.copy()
    result_df['extracted_text'] = extracted_texts

    # Save the DataFrame to a CSV file (best effort: problems are reported, not raised)
    try:
        result_df.to_csv(output_csv, index=False)
    except PermissionError as pe:
        print(f"PermissionError: Unable to save the CSV file. {pe}")
    except Exception as e:
        print(f"Unexpected error: {e}")

# Entry point: OCR the images listed in test.csv and store the text in output.csv.
if __name__ == "__main__":
    input_csv, output_csv = r'test.csv', r'output.csv'
    process_images_from_csv(input_csv=input_csv, output_csv=output_csv)


In [13]:
## Extracting only numerical values from the extracted text

import pandas as pd
import re

# Define unit abbreviations (full unit name -> canonical abbreviation)
unit_abbreviations = {
    'gram': 'g', 'kilogram': 'kg', 'microgram': 'µg', 'milligram': 'mg', 'ounce': 'oz', 'pound': 'lb', 'ton': 't',
    'centimetre': 'cm', 'metre': 'm', 'millimetre': 'mm', 'foot': 'ft', 'inch': 'in', 'yard': 'yd',
    'kilovolt': 'kV', 'millivolt': 'mV', 'volt': 'V',
    'kilowatt': 'kW', 'watt': 'W',
    'centilitre': 'cL', 'decilitre': 'dL', 'fluid ounce': 'fl oz', 'gallon': 'gal', 'imperial gallon': 'imp gal',
    'litre': 'L', 'microlitre': 'µL', 'millilitre': 'mL', 'pint': 'pt', 'quart': 'qt',
    'cubic foot': 'ft³', 'cubic inch': 'in³'
}

# Flatten the unit abbreviations for easy matching
allowed_units = set(unit_abbreviations.keys()).union(set(unit_abbreviations.values()))

# Case-insensitive lookup from any accepted spelling (full name or
# abbreviation) to the canonical abbreviation. The pattern below matches with
# re.IGNORECASE, so "KG", "Kg" and "Kilogram" must all normalise to "kg".
_CANONICAL_UNIT = {
    spelling.casefold(): abbreviation
    for full_name, abbreviation in unit_abbreviations.items()
    for spelling in (full_name, abbreviation)
}

# Compiled once instead of on every call. Longest spellings come first so that
# e.g. "mm" is tried before "m"; ties are broken alphabetically so the pattern
# does not depend on set iteration order.
_VALUE_UNIT_PATTERN = re.compile(
    r'(\d+\.?\d*)\s*('
    + '|'.join(re.escape(unit) for unit in sorted(allowed_units, key=lambda u: (-len(u), u)))
    + r')\b',
    re.IGNORECASE,
)

def extract_numerical_values(text):
    """Find every "<number><optional space><unit>" reading in ``text``.

    Args:
        text: String to scan.

    Returns:
        A list of ``(value, unit)`` tuples in order of appearance. ``value``
        is the number exactly as written (a string); ``unit`` is the canonical
        abbreviation from ``unit_abbreviations`` regardless of the case or
        long/short spelling used in ``text``.
    """
    return [
        (value, _CANONICAL_UNIT.get(unit.casefold(), unit))
        for value, unit in _VALUE_UNIT_PATTERN.findall(text)
    ]

def preprocess_extracted_text(input_csv, output_csv):
    """Add a ``numerical_values`` column derived from ``extracted_text``.

    Every input row and column is kept. ``numerical_values`` holds the
    readings found by ``extract_numerical_values`` formatted as
    ``"<value> <unit>"`` and joined with ``"; "``; it is an empty string when
    the row has no text or no readings.

    Args:
        input_csv: Path of a CSV with an ``extracted_text`` column.
        output_csv: Path the augmented CSV is written to.

    Raises:
        ValueError: If ``extracted_text`` is missing from the input.
    """
    # Read the CSV file
    df = pd.read_csv(input_csv)

    # Ensure 'extracted_text' column exists
    if 'extracted_text' not in df.columns:
        raise ValueError("CSV must contain 'extracted_text' column")

    def _format_readings(text):
        # Cells that are not strings (e.g. NaN for empty cells) are treated as no text.
        if not isinstance(text, str):
            text = ""
        return '; '.join(f'{value} {unit}' for value, unit in extract_numerical_values(text))

    # Assign a column instead of rebuilding the frame from per-row dicts: this
    # keeps the header (and the column dtypes) even when the input has no rows,
    # so the next step can still load the file.
    results_df = df.copy()
    results_df['numerical_values'] = [_format_readings(text) for text in df['extracted_text']]
    results_df.to_csv(output_csv, index=False)

# Example usage
if __name__ == "__main__":
    # Relative paths, matching the rest of the notebook: the OCR step writes
    # 'output.csv' and the following step loads 'result2.csv' from the working
    # directory, so a machine-specific absolute path would break the chain.
    input_csv = r'output.csv'  # Path to your processed output CSV
    output_csv = r'result2.csv'  # Path to save the final results
    preprocess_extracted_text(input_csv, output_csv)


In [15]:
# Keep only the numerical values whose unit is valid for each row's entity
import pandas as pd
import re

# Units accepted for each entity, keyed by entity name. The dimension entities
# share one unit set and the weight entities another; every key still gets its
# own set object.
entity_unit_map = {
    **{
        dimension: {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'}
        for dimension in ('width', 'depth', 'height')
    },
    **{
        weight: {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'}
        for weight in ('item_weight', 'maximum_weight_recommendation')
    },
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {
        'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
        'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
    },
}

# Full unit name -> abbreviation.
unit_abbreviations = {
    # weight
    'gram': 'g',
    'kilogram': 'kg',
    'microgram': 'µg',
    'milligram': 'mg',
    'ounce': 'oz',
    'pound': 'lb',
    'ton': 't',
    # length
    'centimetre': 'cm',
    'metre': 'm',
    'millimetre': 'mm',
    'foot': 'ft',
    'inch': 'in',
    'yard': 'yd',
    # voltage
    'kilovolt': 'kV',
    'millivolt': 'mV',
    'volt': 'V',
    # power
    'kilowatt': 'kW',
    'watt': 'W',
    # volume
    'centilitre': 'cL',
    'decilitre': 'dL',
    'fluid ounce': 'fl oz',
    'gallon': 'gal',
    'imperial gallon': 'imp gal',
    'litre': 'L',
    'microlitre': 'µL',
    'millilitre': 'mL',
    'pint': 'pt',
    'quart': 'qt',
    'cubic foot': 'ft³',
    'cubic inch': 'in³',
}

# Read the previous step's output; empty 'extracted_text' cells become ''
# so the column holds strings only.
df = pd.read_csv('result2.csv').assign(
    extracted_text=lambda frame: frame['extracted_text'].fillna('').astype(str)
)

# Function to extract values based on the entity and unit mapping
def extract_filtered_values(text, units):
    """Return the "<number> <unit>" readings in ``text`` whose unit is in ``units``.

    Args:
        text: Text to search; coerced with ``str()``.
        units: Iterable of full unit names. Each name is matched together with
            its abbreviation from ``unit_abbreviations`` when one is defined.

    Returns:
        The matching readings joined with ``", "`` in the order they occur in
        ``text`` (the unit is reported as written in ``text``), or an empty
        string when nothing matches or ``units`` is empty.
    """
    text = str(text)  # Ensure text is a string

    spellings = set()
    for unit in units:
        spellings.add(unit)
        abbreviation = unit_abbreviations.get(unit)
        # A unit without an abbreviation (e.g. 'cup') must not add an empty
        # alternative to the pattern: that would match every bare number.
        if abbreviation:
            spellings.add(abbreviation)
    if not spellings:
        return ''

    # One pattern for all units so results come out in text order rather than
    # in set-iteration order. Longest spellings first so 'mm' beats 'm', and
    # (?!\w) so 'm' cannot match the start of 'mm' / 'mL' or 'g' the start of 'gal'.
    alternatives = '|'.join(
        re.escape(spelling) for spelling in sorted(spellings, key=lambda s: (-len(s), s))
    )
    unit_regex = re.compile(r'(\d+(?:\.\d+)?)\s*(' + alternatives + r')(?!\w)', re.IGNORECASE)

    return ', '.join(f"{match.group(1)} {match.group(2)}" for match in unit_regex.finditer(text))

# Row-level adapter used with DataFrame.apply
def process_row(row):
    """Filter one row's ``numerical_values`` down to the units allowed for its entity.

    The allowed units come from ``entity_unit_map`` keyed by the row's
    ``entity_name``; an entity that is not in the map gets an empty unit set.
    """
    allowed = entity_unit_map.get(row['entity_name'], set())
    return extract_filtered_values(str(row['numerical_values']), allowed)

# Compute the entity-appropriate readings for every row, then persist the frame.
df = df.assign(filter_values=df.apply(process_row, axis=1))
df.to_csv('new.csv', index=False)


In [16]:
import pandas as pd

# Read the filtered results; empty 'filter_values' cells become ''
# so the column holds strings only.
df = pd.read_csv('new.csv').assign(
    filter_values=lambda frame: frame['filter_values'].fillna('').astype(str)
)

# Function to extract the first value from comma-separated values
def get_first_value(value):
    """Return the text before the first comma in ``value``, stripped of surrounding whitespace.

    A string without a comma is returned whole (stripped); an empty string
    yields an empty string.
    """
    head, _, _ = value.partition(',')
    return head.strip()

# Keep the first reading of each row in 'first_value', then write the frame out.
df = df.assign(first_value=df['filter_values'].apply(get_first_value))
df.to_csv('new2.csv', index=False)


In [18]:
import pandas as pd

def add_index_column(csv_file, new_csv_file='new2_with_index.csv'):
    """Write a copy of ``csv_file`` with a leading 0-based ``Index`` column.

    Args:
        csv_file: Path of the CSV to read.
        new_csv_file: Path the indexed copy is written to. Defaults to
            ``'new2_with_index.csv'``, the name that used to be hard-coded
            here, so existing calls behave as before.
    """
    # Read the existing CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Add an index column as the leftmost column (row position in the file)
    df.insert(0, 'Index', range(len(df)))

    # Save the DataFrame with the index column to the target CSV file
    df.to_csv(new_csv_file, index=False)

    print(f"Index column added and saved to {new_csv_file}")

# Entry point: add the Index column to the first-value results in new2.csv.
if __name__ == "__main__":
    csv_file = 'new2.csv'
    add_index_column(csv_file=csv_file)


Index column added and saved to new2_with_index.csv


In [19]:
import pandas as pd

# Load the indexed results written by the previous step
df = pd.read_csv('new2_with_index.csv')

# Intermediate columns that must not appear in the final file
columns_to_drop = ['image_link', 'entity_name', 'numerical_values','filter_values','extracted_text']

# The pipeline has called the result column 'first_value' up to here, but the
# unit-expansion step below reads a 'prediction' column from updated_file.csv.
# Rename it so the file written here can actually be consumed; this is a no-op
# when the column is already named 'prediction'.
df = df.rename(columns={'first_value': 'prediction'})

# Drop columns from the DataFrame
df = df.drop(columns=columns_to_drop)

# Save the updated DataFrame back to a CSV file
df.to_csv('updated_file.csv', index=False)

In [25]:
# Reload the file written by the previous cell and preview its first rows as a sanity check.
df = pd.read_csv('updated_file.csv')
df.head()

Unnamed: 0,Index,prediction
0,0,42 cm
1,1,40 cm
2,2,
3,3,
4,4,42 cm


In [32]:
import pandas as pd

# Define the mapping from abbreviated units to full forms
unit_abbreviations = {
    'g': 'gram', 'kg': 'kilogram', 'µg': 'microgram', 'mg': 'milligram', 'oz': 'ounce', 'lb': 'pound', 't': 'ton',
    'cm': 'centimetre', 'm': 'metre', 'mm': 'millimetre', 'ft': 'foot', 'in': 'inch', 'yd': 'yard',
    'kV': 'kilovolt', 'mV': 'millivolt', 'V': 'volt',
    'kW': 'kilowatt', 'W': 'watt',
    'cL': 'centilitre', 'dL': 'decilitre', 'fl oz': 'fluid ounce', 'gal': 'gallon', 'imp gal': 'imperial gallon',
    'L': 'litre', 'µL': 'microlitre', 'mL': 'millilitre', 'pt': 'pint', 'qt': 'quart',
    'ft³': 'cubic foot', 'in³': 'cubic inch'
}

def replace_units(prediction):
    """Expand the unit abbreviation in a ``"<number> <unit>"`` prediction.

    The unit is everything after the first run of whitespace and is looked up
    as a whole token, first exactly and then case-insensitively. Replacing
    substrings instead corrupts values: 'g' is found inside "5 kg" (giving
    "5 kgram") and 'm' inside "10 mm" (giving "10 metremetre").

    Args:
        prediction: Value from the ``prediction`` column.

    Returns:
        ``"<number> <full unit name>"`` when the unit is a known abbreviation.
        Anything else (non-strings, empty strings, values without a unit,
        unknown or already expanded units) is returned unchanged.
    """
    if not isinstance(prediction, str):  # e.g. NaN for rows without a prediction
        return prediction

    parts = prediction.split(None, 1)
    if len(parts) != 2:
        return prediction
    number, unit = parts[0], parts[1].strip()

    full_form = unit_abbreviations.get(unit)
    if full_form is None:
        # Earlier steps match units case-insensitively, so "CM" or "ML" can reach this point.
        folded = unit.casefold()
        full_form = next(
            (full for abbr, full in unit_abbreviations.items() if abbr.casefold() == folded),
            None,
        )
    if full_form is None:
        return prediction
    return f"{number} {full_form}"

def update_units_in_csv(input_csv, output_csv):
    """Expand unit abbreviations in the ``prediction`` column of a CSV.

    Args:
        input_csv: Path of a CSV with a ``prediction`` column.
        output_csv: Path the updated CSV is written to.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Do not cast the column to str first: astype(str) turns missing
    # predictions (NaN) into the literal text 'nan', which then ends up in the
    # output file. replace_units returns non-string values unchanged, so
    # missing predictions stay missing and are written as empty cells.
    df['prediction'] = df['prediction'].apply(replace_units)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)

    print(f"Updated CSV file saved as {output_csv}")

# Entry point: expand the unit abbreviations of updated_file.csv into your_output_file.csv.
if __name__ == "__main__":
    input_csv, output_csv = 'updated_file.csv', 'your_output_file.csv'
    update_units_in_csv(input_csv=input_csv, output_csv=output_csv)


Updated CSV file saved as your_output_file.csv
