In [5]:
import os
import numpy as np
import pandas as pd
from PIL import Image, UnidentifiedImageError

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import re



# Preprocess and resize image
def process_image(image, size=(244, 244)):
    try:
        # Resize the image
        image_resized = image.resize(size)
        # Convert to numpy array
        image_array = np.array(image_resized)
        return image_array
    except Exception as e:
        print(f"Failed to process image: {e}")
        return None

# OCR function to extract text from the image
def extract_text_from_image(image):
    try:
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        print(f"Error extracting text from image: {e}")
        return ""

# Function to load images and apply OCR
def load_images_with_ocr(df, image_dir, max_images, batch_size=100):
    total_images = len(df)
    batches = (total_images + batch_size - 1) // batch_size  # Calculate number of batches
    images = []
    image_arrays = []
    all_images = []
    df = df.head(max_images)

    for batch_index in range(batches):
        batch_df = df.iloc[batch_index * batch_size:(batch_index + 1) * batch_size]
        successfully_loaded = 0
        skipped_files = 0
        total_mapped = 0

        for index, row in batch_df.iterrows():
            total_mapped += 1
            # Extract the filename from the image link
            image_filename = row['image_link'].split('/')[-1]
            image_path = os.path.join(image_dir, image_filename)
            image_path = image_path.replace("\\", "/")

            # Load the image if it exists, else append None
            if os.path.exists(image_path):
                try:
                    with Image.open(image_path) as img:
                        # Resize image
                        image_array = process_image(img)
                        image_arrays.append(image_array)

                        images.append(img.copy())  # Copy the image to avoid keeping the file open
                        successfully_loaded += 1
                except (UnidentifiedImageError, OSError) as e:
                    print(f"Warning: Error opening image file {image_path}: {e}")
                    images.append(None)  # Append None if the image is invalid or corrupted
                    skipped_files += 1
            else:
                print(f"Warning: Image file {image_path} does not exist. Skipping...")
                images.append(None)
                skipped_files += 1
        all_images.extend(images)
        print(f"Total images mapped: {total_mapped}")
        print(f"Successfully loaded images: {successfully_loaded}")
        print(f"Skipped images: {skipped_files}")

    return images, image_arrays

# Load training data
train_df = pd.read_csv(r'C:\Users\Hrushikesh\OneDrive\Desktop\ML Hackathon\student_resource 3\dataset\Book1.csv')
train_image_dir = r'C:\Users\Hrushikesh\OneDrive\Desktop\ML Hackathon\student_resource 3\downloaded_images_2\downloaded_images_2'
MAX_IMAGES = 100

# Load training images
train_images, train_arrays = load_images_with_ocr(train_df, train_image_dir, MAX_IMAGES, 100)

# Add images and arrays to the dataset
mapped_train_dataset = train_df.head(MAX_IMAGES).copy()
mapped_train_dataset['Image'] = train_images
mapped_train_dataset['Image_Arrays'] = train_arrays

# Split 'entity_value' into numeric value and unit
mapped_train_dataset[['entity_number', 'entity_unit']] = mapped_train_dataset['entity_value'].str.extract(r'([0-9.]+)\s*(\w+)')
mapped_train_dataset['entity_number'] = pd.to_numeric(mapped_train_dataset['entity_number'], errors='coerce')

# Drop rows with NaN values in 'entity_number' after conversion
mapped_train_dataset = mapped_train_dataset.dropna(subset=['entity_number'])

# Display the resulting dataframe
print(mapped_train_dataset[['entity_value', 'entity_number', 'entity_unit']])

# One-hot encode the 'entity_name' and 'entity_unit' columns
encoder = OneHotEncoder(sparse_output=False)

# Encode entity_name
categorical_column = mapped_train_dataset[['entity_name']]
column_encoded = encoder.fit_transform(categorical_column)
column_encoded_df = pd.DataFrame(column_encoded, columns=encoder.get_feature_names_out(['entity_name']))
mapped_train_dataset = pd.concat([mapped_train_dataset.drop(columns=['entity_name']), column_encoded_df], axis=1)

# Encode entity_unit
categorical_column = mapped_train_dataset[['entity_unit']]
column_encoded = encoder.fit_transform(categorical_column)
column_encoded_df = pd.DataFrame(column_encoded, columns=encoder.get_feature_names_out(['entity_unit']))
mapped_train_dataset = pd.concat([mapped_train_dataset.drop(columns=['entity_unit']), column_encoded_df], axis=1)


# Initialize the MinMaxScaler
scaler_entity_number = MinMaxScaler()
# Fit and transform the entity_number column
mapped_train_dataset['entity_number'] = scaler_entity_number.fit_transform(mapped_train_dataset[['entity_number']])


# Convert train_arrays (list) to a NumPy array and flatten the image arrays
train_arrays = np.array(train_arrays)
train_image_arrays_flat = train_arrays.reshape(train_arrays.shape[0], -1)

print(mapped_train_dataset.columns.tolist())

# Define target variables
target_number = mapped_train_dataset['entity_number']
target_unit = mapped_train_dataset.filter(regex='^entity_unit_')

# Drop target columns from features
col_to_drop = ['image_link', 'group_id', 'entity_value', 'Image', 'Image_Arrays', 'entity_number', 'entity_unit']
mapped_train_dataset_dropped = mapped_train_dataset.drop(columns=col_to_drop, errors='ignore')

# Combine features and apply PCA
try:
    # Dimensionality reduction using PCA
    n_components = min(1000, train_image_arrays_flat.shape[0], train_image_arrays_flat.shape[1])
    pca = PCA(n_components=n_components)
    reduced_image_features = pca.fit_transform(train_image_arrays_flat)
    train_features = np.hstack((reduced_image_features, mapped_train_dataset_dropped.to_numpy()))
    print("Shape of train_features:", train_features.shape)
except Exception as e:
    print(f"Error combining features: {e}")

# Ensure train_features is a numeric NumPy array
try:
    train_features = np.array(train_features, dtype=float)
    print("Type of train_features:", type(train_features))
    print("Shape of train_features:", train_features.shape)
except Exception as e:
    print(f"Error converting to float: {e}")

# Check for NaNs and Infs
if np.issubdtype(train_features.dtype, np.number):
    print("NaNs in train_features:", np.any(np.isnan(train_features)))
    print("Infs in train_features:", np.any(np.isinf(train_features)))
else:
    print("train_features contains non-numeric data.")

# Replace NaNs with a specific value (e.g., 0)
train_features = np.nan_to_num(train_features)

# Proceed with train/test split and model fitting
X_train, X_valid, y_train_number, y_valid_number = train_test_split(train_features, target_number, test_size=0.2, random_state=42)
X_train, X_valid, y_train_unit, y_valid_unit = train_test_split(train_features, target_unit, test_size=0.2, random_state=42)

# Train and evaluate models
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train_number)
y_pred_number = regressor.predict(X_valid)
print("Mean Squared Error for entity_number:", mean_squared_error(y_valid_number, y_pred_number))

classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train_unit)
y_pred_unit = classifier.predict(X_valid)
print("Accuracy for entity_unit:", accuracy_score(y_valid_unit, y_pred_unit))

Total images mapped: 100
Successfully loaded images: 100
Skipped images: 0
      entity_value  entity_number entity_unit
0       500.0 gram        500.000        gram
1          1.0 cup          1.000         cup
2       0.709 gram          0.709        gram
3       0.709 gram          0.709        gram
4   1400 milligram       1400.000   milligram
..             ...            ...         ...
95       49.0 watt         49.000        watt
96   500 milligram        500.000   milligram
97   500 milligram        500.000   milligram
98       16.0 gram         16.000        gram
99      8 kilogram          8.000    kilogram

[100 rows x 3 columns]
['image_link', 'group_id', 'entity_value', 'Image', 'Image_Arrays', 'entity_number', 'entity_name_item_volume', 'entity_name_item_weight', 'entity_name_voltage', 'entity_name_wattage', 'entity_unit_cubic', 'entity_unit_cup', 'entity_unit_gallon', 'entity_unit_gram', 'entity_unit_kilogram', 'entity_unit_milligram', 'entity_unit_millilitre', 'entity

In [6]:
mapped_train_dataset

Unnamed: 0,image_link,group_id,entity_value,Image,Image_Arrays,entity_number,entity_name_item_volume,entity_name_item_weight,entity_name_voltage,entity_name_wattage,...,entity_unit_cup,entity_unit_gallon,entity_unit_gram,entity_unit_kilogram,entity_unit_milligram,entity_unit_millilitre,entity_unit_ounce,entity_unit_pound,entity_unit_volt,entity_unit_watt
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,500.0 gram,<PIL.Image.Image image mode=RGB size=1600x1600...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.099937,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,1.0 cup,<PIL.Image.Image image mode=RGB size=1200x1200...,"[[[155, 190, 209], [153, 188, 207], [153, 188,...",0.000130,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,0.709 gram,<PIL.Image.Image image mode=RGB size=1081x1081...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.000072,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,0.709 gram,<PIL.Image.Image image mode=RGB size=1081x1081...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.000072,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,1400 milligram,<PIL.Image.Image image mode=RGB size=1500x1500...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.279950,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,https://m.media-amazon.com/images/I/71XK5d3Oh9...,416664,49.0 watt,<PIL.Image.Image image mode=RGB size=2560x2560...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.009731,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
96,https://m.media-amazon.com/images/I/61kyBEJYDe...,459516,500 milligram,<PIL.Image.Image image mode=RGB size=1500x1500...,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0.099937,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
97,https://m.media-amazon.com/images/I/71uQmsTESv...,459516,500 milligram,<PIL.Image.Image image mode=RGB size=1500x1500...,"[[[253, 253, 253], [254, 254, 254], [254, 254,...",0.099937,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
98,https://m.media-amazon.com/images/I/71jG8BOi4W...,241608,16.0 gram,<PIL.Image.Image image mode=RGB size=1600x1600...,"[[[227, 228, 230], [228, 229, 231], [228, 229,...",0.003130,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
