In [3]:
import functools
import logging
import pickle
from typing import Optional

import pandas as pd
import pandas_gbq
from vertexai.vision_models import Image, MultiModalEmbeddingModel

# Module-level logger shared by every function in this cell.
# NOTE(review): no handler or level is configured in this cell, so with
# Python's default logging setup the logger.info(...) progress messages are
# not displayed (only WARNING and above are) -- confirm whether logging is
# configured elsewhere before relying on them.
logger = logging.getLogger(__name__)

# Function to initialize the multimodal model (created once, then reused)
@functools.lru_cache(maxsize=1)
def get_multimodal_model():
    """Return the pretrained ``multimodalembedding`` model, creating it once.

    The result is memoized: ``fetch_image_embeddings`` asks for the model once
    per image, and without the cache every image would trigger another
    ``from_pretrained`` call. If ``from_pretrained`` raises, nothing is cached
    and the next call retries. Use ``get_multimodal_model.cache_clear()`` to
    force a fresh instance.

    Returns:
        The object returned by
        ``MultiModalEmbeddingModel.from_pretrained("multimodalembedding")``.
    """
    return MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

# Function to generate embeddings for an image URI
def fetch_image_embeddings(image_uri: str, dimension: int = 1408) -> Optional[list]:
    """Return the multimodal embedding for one image, or ``None`` on failure.

    Args:
        image_uri: Location handed to ``Image.load_from_file``; callers in this
            notebook pass Google Cloud Storage URIs.
        dimension: Embedding size forwarded to ``model.get_embeddings``.
            Defaults to 1408, the value this notebook previously hard-coded.

    Returns:
        The ``image_embedding`` attribute of the model response, or ``None``
        when loading the image or calling the model raised an exception.
    """
    try:
        # Load the image from the given URI (from Google Cloud Storage)
        image = Image.load_from_file(image_uri)

        # Get embeddings using the multimodal model
        model = get_multimodal_model()
        embeddings = model.get_embeddings(image=image, dimension=dimension)

        return embeddings.image_embedding
    except Exception as e:
        # Deliberate best-effort: one bad image must not abort a whole batch.
        # logger.exception keeps the original message and adds the traceback,
        # which logger.error silently dropped.
        logger.exception("Error generating embeddings for %s: %s", image_uri, e)
        return None

# Function to create image embeddings for the entire dataframe and save to a .pkl file
def create_image_embeddings(dataframe, output_file):
    """Embed every image referenced in ``dataframe`` and pickle the results.

    Args:
        dataframe: Frame with a ``sku_config`` column and a ``uri`` column
            (the image location passed to ``fetch_image_embeddings``).
        output_file: Path handed to ``DataFrame.to_pickle``.

    Returns:
        The DataFrame that was written, with columns ``sku_config``, ``uri``
        and ``image_embeddings``. Rows for which no embedding was produced are
        omitted; if nothing succeeded the frame is empty but still has those
        three columns.
    """
    columns = ['sku_config', 'uri', 'image_embeddings']
    embeddings_list = []

    # Only two columns are needed, so zip them instead of building a Series
    # per row with iterrows().
    for sku_config, image_uri in zip(dataframe['sku_config'], dataframe['uri']):
        try:
            # Generate the embeddings for the image URI
            image_embeddings = fetch_image_embeddings(image_uri)
        except Exception as e:
            # Best-effort batch: log with traceback and move on to the next row.
            logger.exception("Error processing %s: %s", sku_config, e)
            continue

        # fetch_image_embeddings returns None on failure; previously such rows
        # were still logged as "Processed ... successfully".
        if image_embeddings is None or len(image_embeddings) == 0:
            logger.warning("No embeddings generated for %s; skipping", sku_config)
            continue

        embeddings_list.append({
            'sku_config': sku_config,
            'uri': image_uri,
            'image_embeddings': image_embeddings
        })
        logger.info("Processed %s successfully", sku_config)

    # Explicit columns keep the schema stable even when every row failed.
    embeddings_df = pd.DataFrame(embeddings_list, columns=columns)

    # Save the embeddings dataframe to a .pkl file
    embeddings_df.to_pickle(output_file)
    logger.info("Embeddings saved to %s", output_file)
    return embeddings_df

# Defaults used by main(); override them via its keyword arguments.
DEFAULT_SKU_QUERY = """
    SELECT sku_config, image_url, uri FROM `noonbigmerchsandbox.vamsi.sku_uri_mob_access` 
    WHERE rn <= 10
    """
DEFAULT_BQ_PROJECT = 'noon-chatbot'
DEFAULT_OUTPUT_FILE = 'image_embeddings.pkl'


# Main function to read data from BigQuery, process images, and save embeddings
def main(query=DEFAULT_SKU_QUERY, bq_project=DEFAULT_BQ_PROJECT,
         output_file=DEFAULT_OUTPUT_FILE):
    """Read SKU image rows from BigQuery and store their embeddings.

    Called with no arguments this behaves exactly as before: it runs the
    sample query (rows with ``rn <= 10``) and writes ``image_embeddings.pkl``.

    Args:
        query: SQL passed to ``pandas_gbq.read_gbq``. The result must contain
            the ``sku_config`` and ``uri`` columns that
            ``create_image_embeddings`` reads.
        bq_project: Value passed as ``project_id`` to ``pandas_gbq.read_gbq``.
        output_file: Path of the pickle file written by
            ``create_image_embeddings``.
    """
    # Read data from BigQuery
    df = pandas_gbq.read_gbq(query, project_id=bq_project)

    # Generate embeddings and store them
    create_image_embeddings(df, output_file)

# Execute the main function when this code runs as the top-level module.
# The captured output below this cell (download progress bar and gRPC log
# lines) shows that the guard was satisfied when the cell was executed.
if __name__ == "__main__":
    main()


Downloading: 100%|[32m██████████[0m|


I0000 00:00:1726600711.867634 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600713.425546 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600713.425836 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600715.518287 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600716.928739 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600716.929873 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600719.071005 3480124 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1726600719.577595 3480124 check_gcp_

In [4]:
# Reload the embeddings written to 'image_embeddings.pkl' (the output path used
# by main() in the previous cell) into `df` for inspection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles produced by a trusted run of this notebook.
# `pickle` is already imported in the first cell, so this import is redundant.
import pickle
with open('image_embeddings.pkl', 'rb') as f:
    df = pickle.load(f)

In [5]:
# Preview the first rows of the reloaded frame (sku_config, uri, image_embeddings).
df.head()

Unnamed: 0,sku_config,uri,image_embeddings
0,N11083426A,gs://noon-cdn-images/original/v1502960023/N110...,"[0.0354851522, -0.0136427293, -0.0432445146, -..."
1,N11083450A,gs://noon-cdn-images/original/v1502960033/N110...,"[0.0254504904, -0.00014870899, -0.030649798, -..."
2,N11083404A,gs://noon-cdn-images/original/v1502960019/N110...,"[0.0198820084, -0.0099657746, -0.0321889669, -..."
3,N11092925A,gs://noon-cdn-images/original/v1663854058/N110...,"[0.0570469797, -0.013553923, 0.00194181805, -0..."
4,N11083446A,gs://noon-cdn-images/original/v1502960094/N110...,"[0.018271124, -0.0135514438, -0.0215694942, -0..."


In [10]:
# Length of the first row's embedding vector; the recorded output is 1408,
# the same value requested via `dimension` in fetch_image_embeddings.
len(df['image_embeddings'].iloc[0])

1408