## Importing Section

In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image,UnidentifiedImageError,ImageFile

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from  sklearn.linear_model import LinearRegression
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler



ModuleNotFoundError: No module named 'tensorflow'

## Resize Array Function

In [177]:
def process_image(image, size=(244, 244)):
    try:
        # Resize the image
        image_resized = image.resize(size)
        # Convert to numpy array
        image_array = np.array(image_resized)
        return image_array
    except Exception as e:
        print(f"Failed to process image: {e}")
        return None

## Image Mapping and loading

In [178]:
ImageFile.LOAD_TRUNCATED_IMAGES = True

train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

train_image_dir = '../downloaded_images_train'
test_image_dir = 'downloaded_images_test'
MAX_IMAGES = 100


def load_images(df, image_dir,max_images,batch_size=100 ):
    total_images = len(df)
    batches = (total_images + batch_size - 1) // batch_size  # Calculate number of batches
    all_images = []
    df = df.head(max_images)
    images = []
    image_arrays = []


    for batch_index in range(batches):
        batch_df = df.iloc[batch_index * batch_size:(batch_index + 1) * batch_size]
        successfully_loaded = 0
        total_mapped = 0
        skipped_files = 0
        
        for index, row in batch_df.iterrows():

            total_mapped += 1
            # Extract the filename from the image link
            image_filename = row['image_link'].split('/')[-1]
            image_path = os.path.join(image_dir, image_filename)
            image_path = image_path.replace("\\", "/")
            
            # Load the image if it exists, else append None
            if os.path.exists(image_path):
                try:
                    with Image.open(image_path) as img:
                        image_array = image_resized = process_image(img)
                        image_arrays.append(image_array)
                        images.append(img.copy())  # Copy the image to avoid keeping the file open
                        successfully_loaded += 1
                except (UnidentifiedImageError, OSError) as e:
                    print(f"Warning: Error opening image file {image_path}: {e}")
                    images.append(None)  # Append None if the image is invalid or corrupted
                    skipped_files += 1
            else:
                print(f"Warning: Image file {image_path} does not exist. Skipping...")
                images.append(None)  # Handle cases where the image file is missing
                skipped_files += 1
        
        all_images.extend(images)
        print(f"Total images mapped: {total_mapped}")
        print(f"Successfully loaded images: {successfully_loaded}")
        print(f"Skipped images: {skipped_files}")

    return images,image_arrays

# Load training and test images
train_images,train_arrays = load_images(train_df, train_image_dir, MAX_IMAGES, 100)
test_images,test_arrays= load_images(test_df, test_image_dir, MAX_IMAGES, 100)

Total images mapped: 100
Successfully loaded images: 100
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
Total images mapped: 0
Successfully loaded images: 0
Skipped images: 0
To

## Adding image arrays into df

In [179]:
mapped_train_dataset = train_df.head(100)
mapped_test_dataset = test_df.head(100)
mapped_train_dataset['Image'] = train_images
mapped_train_dataset['Image_Arrays'] = train_arrays
mapped_test_dataset['Image'] = test_images
mapped_test_dataset['Image_Arrays'] = test_arrays

# mapped_train_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_train_dataset['Image'] = train_images
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_train_dataset['Image_Arrays'] = train_arrays
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_test_dataset['Image'] = test_images
A value is trying to be set on a copy of a slice from a DataFrame

## Spliting entity values to numeric and units

In [180]:


# Step 1: Split 'entity_value' into numeric value and unit
mapped_train_dataset[['entity_number', 'entity_unit']] = mapped_train_dataset['entity_value'].str.extract(r'([0-9.]+)\s*(\w+)')

# Step 2: Convert the numeric part to float
mapped_train_dataset['entity_number'] = mapped_train_dataset['entity_number'].astype(float)

# Display the resulting dataframe
print(mapped_train_dataset[['entity_value', 'entity_number', 'entity_unit']])
print(mapped_train_dataset['entity_unit'].unique())


      entity_value  entity_number entity_unit
0       500.0 gram        500.000        gram
1          1.0 cup          1.000         cup
2       0.709 gram          0.709        gram
3       0.709 gram          0.709        gram
4   1400 milligram       1400.000   milligram
..             ...            ...         ...
95       49.0 watt         49.000        watt
96   500 milligram        500.000   milligram
97   500 milligram        500.000   milligram
98       16.0 gram         16.000        gram
99      8 kilogram          8.000    kilogram

[100 rows x 3 columns]
['gram' 'cup' 'milligram' 'kilogram' 'ounce' 'gallon' 'volt' 'watt'
 'pound' 'millilitre' 'cubic']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_train_dataset[['entity_number', 'entity_unit']] = mapped_train_dataset['entity_value'].str.extract(r'([0-9.]+)\s*(\w+)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_train_dataset[['entity_number', 'entity_unit']] = mapped_train_dataset['entity_value'].str.extract(r'([0-9.]+)\s*(\w+)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

## Encodeing categorical values

In [181]:
encoder = OneHotEncoder()
categorical_column = mapped_train_dataset[['entity_name']]
column_encoded = encoder.fit_transform(categorical_column)
column_encoded_df = pd.DataFrame(column_encoded.toarray(), columns=encoder.get_feature_names_out(['entity_name']))
mapped_train_dataset = pd.concat([mapped_train_dataset.drop(columns=['entity_name']),column_encoded_df], axis=1)

categorical_column = mapped_test_dataset[['entity_name']]
column_encoded = encoder.fit_transform(categorical_column)
column_encoded_df = pd.DataFrame(column_encoded.toarray(), columns=encoder.get_feature_names_out(['entity_name']))
mapped_test_dataset = pd.concat([mapped_test_dataset.drop(columns=['entity_name']),column_encoded_df], axis=1)

categorical_column = mapped_train_dataset[['entity_unit']]
column_encoded = encoder.fit_transform(categorical_column)
column_encoded_df = pd.DataFrame(column_encoded.toarray(), columns=encoder.get_feature_names_out(['entity_unit']))
mapped_train_dataset = pd.concat([mapped_train_dataset.drop(columns=['entity_unit']),column_encoded_df], axis=1)

# mapped_train_dataset
print(mapped_train_dataset.columns)

print(mapped_train_dataset.shape)


Index(['image_link', 'group_id', 'entity_value', 'Image', 'Image_Arrays',
       'entity_number', 'entity_name_item_volume', 'entity_name_item_weight',
       'entity_name_voltage', 'entity_name_wattage', 'entity_unit_cubic',
       'entity_unit_cup', 'entity_unit_gallon', 'entity_unit_gram',
       'entity_unit_kilogram', 'entity_unit_milligram',
       'entity_unit_millilitre', 'entity_unit_ounce', 'entity_unit_pound',
       'entity_unit_volt', 'entity_unit_watt'],
      dtype='object')
(100, 21)


## Normalizing Image vector arrays

In [182]:
# Assuming the image vectors are in a column called 'image_vector' and are in a numpy array format

# Assuming 'image_vector' is a 4D array for each image
# Reshape each image to be a flat vector
image_vectors = np.array(mapped_train_dataset['Image_Arrays'].tolist())  # Shape (num_samples, 244, 244, 3)

# Reshape into (num_samples, 244*244*3)
image_vectors_flattened = image_vectors.reshape(image_vectors.shape[0], -1)  # Now shape (num_samples, 178608)

scaler = StandardScaler()
image_vectors_scaled = scaler.fit_transform(image_vectors_flattened)
n_samples = image_vectors_scaled.shape[0]
image_vectors_scaled = image_vectors_scaled.reshape(n_samples, 244, 244, 3)
# Replace the old image_vector with the normalized one
mapped_train_dataset['Image_Arrays'] = list(image_vectors_scaled)
mapped_train_dataset['Image_Arrays'][0].shape

(244, 244, 3)

## Normalizing group id 

In [183]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
mapped_train_dataset['group_id_normalized'] = scaler.fit_transform(mapped_train_dataset[['group_id']])

mapped_train_dataset['group_id_normalized'].unique()

array([0.71044776, 0.90927387, 0.36763445, 0.68973348, 0.        ,
       0.18916963, 0.15697584, 0.12251362, 0.17843165, 0.87112177,
       0.03544302, 0.84974769, 0.44477138, 0.68854537, 0.86795072,
       0.53611348, 0.40086117, 0.76403933, 0.58034944, 0.71441246,
       0.10405236, 0.82363776, 0.66390429, 0.44301113, 0.57004383,
       0.78311656, 1.        , 0.4506598 , 0.6305212 , 0.25642976,
       0.73210495, 0.26130538, 0.57061123, 0.48473703, 0.12472163,
       0.31687396, 0.10951078])

## Normalizing entity number 

In [184]:

# Initialize the MinMaxScaler
scaler_entity_number = MinMaxScaler()
# Fit and transform the entity_number column
mapped_train_dataset['entity_number'] = scaler_entity_number.fit_transform(mapped_train_dataset[['entity_number']])
mapped_train_dataset['entity_number']



0     0.099937
1     0.000130
2     0.000072
3     0.000072
4     0.279950
        ...   
95    0.009731
96    0.099937
97    0.099937
98    0.003130
99    0.001530
Name: entity_number, Length: 100, dtype: float64

## Creating model 

In [185]:
def convert_image_to_array(image):
    """Helper function to convert PIL Image or other image formats into NumPy arrays."""
    if isinstance(image, np.ndarray):
        return image  # Already a NumPy array
    else:
        return np.array(image)  # Convert PIL Image to NumPy array

# Assuming 'mapped_train_dataset' is a DataFrame with necessary columns
mapped_train_dataset['Image_Arrays'] = mapped_train_dataset['Image_Arrays'].apply(convert_image_to_array)

# Ensure image data is in the shape (244, 244, 3)
mapped_train_dataset['Image_Arrays'] = mapped_train_dataset['Image_Arrays'].apply(
    lambda x: np.resize(x, (244, 244, 3)) if x.shape != (244, 244, 3) else x
)

# Prepare image data and group ID as inputs
image_data = np.array(mapped_train_dataset['Image_Arrays'].tolist())  # Converting the Image_Arrays column to a NumPy array
group_id_data = np.array(mapped_train_dataset['group_id']).reshape(-1, 1)  # Ensure correct shape
entity_value_data = np.array(mapped_train_dataset['entity_number']).reshape(-1, 1)  # Ensure correct shape

# Split the data into training and testing sets
X_img_train, X_img_test, X_group_train, X_group_test, y_train, y_test = train_test_split(
    image_data, group_id_data, entity_value_data, test_size=0.2, random_state=42
)

# Define CNN model for image feature extraction
def create_model():
    # Image input branch
    img_input = Input(shape=(244, 244, 3))  # Adjust the shape according to the resized image
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = Flatten()(x)

    # Group ID input branch
    group_input = Input(shape=(1,))  # Input shape for group_id
    y = Dense(32, activation='relu')(group_input)

    # Concatenate image features and group ID
    concatenated = Concatenate()([x, y])

    # Fully connected layers
    z = Dense(128, activation='relu')(concatenated)
    z = Dense(64, activation='relu')(z)
    
    # Final output layer (entity number prediction)
    output = Dense(1)(z)

    # Create the model
    model = Model(inputs=[img_input, group_input], outputs=output)
    return model

# Create and compile the model
model = create_model()
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit([X_img_train, X_group_train], y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
test_loss = model.evaluate([X_img_test, X_group_test], y_test)

# Make predictions
y_pred = model.predict([X_img_test, X_group_test])

print(f"Mean Squared Error on Test Set: {test_loss}")

# If needed, convert y_pred back to original scale
# y_pred_original = scaler_entity_number.inverse_transform(y_pred)

ValueError: object __array__ method not producing an array

In [186]:
print(tf.__version__)
print(np.__version__)

2.17.0
2.1.1
