In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
import torch
from tqdm import tqdm
import os
from dotenv import load_dotenv
import ast # Import the ast module to safely evaluate string literals
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import time

# --- Configuration ---
# Credentials are kept in the backend's .env file; load it into the process
# environment before reading any of the settings below.
load_dotenv(dotenv_path='../backend/.env')

PINECONE_API_KEY, INDEX_NAME, GOOGLE_API_KEY = (
    os.getenv(setting_name)
    for setting_name in ("PINECONE_API_KEY", "INDEX_NAME", "GOOGLE_API_KEY")
)

# Fail fast: every later cell needs all three settings to be non-empty.
if not (PINECONE_API_KEY and INDEX_NAME and GOOGLE_API_KEY):
    raise ValueError("Please make sure you have set up your .env file in the 'backend' directory with your Pinecone and Google API credentials.")

# --- File Paths ---
# Raw catalogue export, and the copy that additionally carries the
# LLM-written `enriched_description` column.
enriched_csv_path = "furniture_data_enriched.csv"
original_csv_path = "furniture_data.csv"


In [9]:
# --- Load Dataset ---
# Read the raw catalogue from the path declared in the configuration cell so
# the file location is defined in exactly one place.
df = pd.read_csv(original_csv_path)
# `uniq_id` is used as the merge key and as the Pinecone vector id further
# down, both of which expect strings.
df['uniq_id'] = df['uniq_id'].astype(str)

In [12]:
# --- 1. LLM for Data Enrichment (with Save and Resume) ---
# Rewrites missing or very short product descriptions with Gemini and stores
# the result in df['enriched_description']. Progress is written to
# `enriched_csv_path` so an interrupted run can be resumed.

# Tunables for this cell.
MIN_DESCRIPTION_WORDS = 5      # descriptions with fewer words are regenerated
SAVE_EVERY_N_ROWS = 5          # checkpoint frequency, counted in processed rows
SECONDS_BETWEEN_REQUESTS = 5   # pause between LLM calls to adhere to rate limits

# Prompt variable -> DataFrame column that feeds it.
PROMPT_FIELD_TO_COLUMN = {
    'title': 'title',
    'categories': 'categories',
    'brand': 'brand',
    'material': 'material',
    'color': 'color',
    'dimensions': 'package_dimensions',
}

print("Setting up LLM for data enrichment using Gemini...")
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=GOOGLE_API_KEY, temperature=0, convert_system_message_to_human=True)

prompt_template = """
You are a product catalog manager. Your task is to write a clean, consistent product description based on the structured data provided.
Standardize the information and present it in a natural, descriptive paragraph.
Use ONLY the information provided. If a piece of information (like material, color, or dimensions) is missing, do not mention it and do not invent it.

Product Data:
- Title: {title}
- Categories: {categories}
- Brand: {brand}
- Material: {material}
- Color: {color}
- Dimensions: {dimensions}

Generated Description:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["title", "categories", "brand", "material", "color", "dimensions"])
enrichment_chain = LLMChain(llm=llm, prompt=prompt)

# Re-running this cell in the same kernel must not stack a second
# `enriched_description` column on top of the first one.
df = df.drop(columns=['enriched_description'], errors='ignore')

# Check for existing enriched data to resume progress
if os.path.exists(enriched_csv_path):
    print(f"Found existing enriched data at '{enriched_csv_path}'. Resuming...")
    # Force the key to str so it matches df['uniq_id'] regardless of how the
    # ids look when pandas infers dtypes from the CSV.
    enriched_df = pd.read_csv(enriched_csv_path, dtype={'uniq_id': str})
    # Bring back ONLY the enriched text. Merging the whole saved frame would
    # duplicate every shared column under an `_old` suffix, and on the next
    # resume those suffixed names collide with each other.
    saved_descriptions = (
        enriched_df[['uniq_id', 'enriched_description']]
        .drop_duplicates(subset='uniq_id', keep='last')  # keep the left join one-to-one
    )
    df = df.merge(saved_descriptions, on='uniq_id', how='left')
else:
    print("No existing enriched data found. Starting from scratch.")
    df['enriched_description'] = df['description'] # Start with original description

# Identify rows that still need enrichment: missing text counts as zero words.
# fillna/astype keeps the `.str` accessor usable even if the column is all-NaN.
description_word_counts = df['enriched_description'].fillna('').astype(str).str.split().str.len()
rows_to_enrich = df[description_word_counts < MIN_DESCRIPTION_WORDS]
print(f"Found {len(rows_to_enrich)} rows to enrich.")

if not rows_to_enrich.empty:
    print("Enriching product descriptions where needed...")
    total_to_enrich = len(rows_to_enrich)
    try:
        progress = tqdm(rows_to_enrich.iterrows(), total=total_to_enrich)
        for processed_count, (index, row) in enumerate(progress, start=1):
            try:
                # Missing attributes are passed as empty strings; the prompt
                # tells the model not to mention or invent them.
                inputs = {
                    field: (row[column] if pd.notna(row[column]) else '')
                    for field, column in PROMPT_FIELD_TO_COLUMN.items()
                }
                response = enrichment_chain.run(inputs)
                df.loc[index, 'enriched_description'] = response.strip()
            except Exception as e:
                print(f"\nAn error occurred while enriching data for {row['uniq_id']}: {e}")
                print("Stopping enrichment process. Saving current progress.")
                break # Exit the loop on error (like a rate limit)

            # Checkpoint on the number of rows processed. The DataFrame index
            # label is not a usable counter here because `rows_to_enrich` is a
            # filtered subset with gaps in its index.
            if processed_count % SAVE_EVERY_N_ROWS == 0:
                df.to_csv(enriched_csv_path, index=False)

            # No need to wait after the last request.
            if processed_count < total_to_enrich:
                time.sleep(SECONDS_BETWEEN_REQUESTS)
    finally:
        # Final save; `finally` also covers a manual kernel interrupt, which
        # `except Exception` above does not catch.
        df.to_csv(enriched_csv_path, index=False)
    print("Enrichment process finished. Final results saved.")

# --- Qualitative Validation: Show some examples ---
print("\n--- Final Data Examples (with enriched descriptions) ---")
print(df[['description', 'enriched_description']].head())
print("-------------------------------------------------------\n")

E0000 00:00:1760717598.440868 3034157 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1760717598.441687 3034157 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Setting up LLM for data enrichment using Gemini...
No existing enriched data found. Starting from scratch.
Found 156 rows to enrich.
Enriching product descriptions where needed...


100%|██████████| 156/156 [17:20<00:00,  6.67s/it]

Enrichment process finished. Final results saved.

--- Final Data Examples (with enriched descriptions) ---
                                         description  \
0  multiple shoes, coats, hats, and other items E...   
1                     subrtex Dining chairs Set of 2   
2                                                NaN   
3  The decorative doormat features a subtle textu...   
4  Set of Four Folding Trays With Matching Storag...   

                                enriched_description  
0  multiple shoes, coats, hats, and other items E...  
1                     subrtex Dining chairs Set of 2  
2  The MUYETOL Plant Repotting Mat is a portable ...  
3  The decorative doormat features a subtle textu...  
4  Set of Four Folding Trays With Matching Storag...  
-------------------------------------------------------






In [13]:
# --- 2. Text Embedding Model ---
# Sentence-level encoder used for the "title. description" text of each product.
print("Loading text embedding model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- 3. Image Embedding Model ---
# The ViT preprocessor and backbone are loaded from the same checkpoint.
print("Loading image embedding model...")
vit_checkpoint = 'google/vit-base-patch16-224-in21k'
image_processor = ViTImageProcessor.from_pretrained(vit_checkpoint)
image_model = ViTModel.from_pretrained(vit_checkpoint)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
image_model.to(device)


# --- Function to get Image Embeddings ---
def get_image_embedding(image_url_str, timeout=10):
    """Embed the first image referenced by a stringified list of URLs.

    Args:
        image_url_str: String holding a Python-literal list of image URLs,
            e.g. "['https://.../a.jpg', 'https://.../b.jpg']". Only the first
            entry is downloaded.
        timeout: Seconds passed to `requests.get` as its timeout, so a stalled
            server cannot block the whole embedding run.

    Returns:
        A NumPy array obtained by mean-pooling the ViT `last_hidden_state`
        over the token axis, or None when the input is missing or malformed or
        the image cannot be downloaded or decoded. Failures are printed, not
        raised, so one bad product does not abort a batch.
    """
    # Strings are never NaN, so this single check also rejects NaN/None, and
    # it avoids calling pd.isna() on non-scalar inputs (which returns an array
    # that cannot be used as a condition).
    if not isinstance(image_url_str, str):
        return None
    try:
        image_urls = ast.literal_eval(image_url_str)
        if not isinstance(image_urls, list) or not image_urls:
            return None
        first_image_url = image_urls[0].strip()
        if not first_image_url:
            # Blank entry: nothing to download.
            return None

        # The context manager releases the connection; raise_for_status()
        # keeps HTTP error pages from being handed to PIL as image data.
        with requests.get(first_image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            # Ask urllib3 to undo any Content-Encoding (e.g. gzip) on the raw stream.
            response.raw.decode_content = True
            # convert() forces the pixel data to be read while the stream is open.
            image = Image.open(response.raw).convert('RGB')

        inputs = image_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = image_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    except Exception as e:
        # Deliberately best-effort: report and let the caller skip this product.
        print(f"Could not process image URL {image_url_str}: {e}")
        return None


Loading text embedding model...




Loading image embedding model...


In [16]:
# --- 4. Pinecone Setup ---
# Connect to Pinecone and make sure the target index exists before handing out
# a handle to it.
print("Initializing Pinecone...")
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

existing_index_names = pc.list_indexes().names()
if INDEX_NAME in existing_index_names:
    print(f"Index '{INDEX_NAME}' already exists.")
else:
    print(f"Creating index '{INDEX_NAME}'...")
    # ServerlessSpec is only needed on this branch, so it is imported here.
    from pinecone import ServerlessSpec

    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # Stored vectors are the text embedding (384 dims) followed by the image
    # embedding (768 dims).
    pc.create_index(
        name=INDEX_NAME,
        dimension=384 + 768,
        metric='cosine',
        spec=serverless_spec,
    )
    print("Index created successfully.")

index = pc.Index(INDEX_NAME)
print("Pinecone setup complete.")

Initializing Pinecone...
Index 'furniture-recommender' already exists.
Pinecone setup complete.


In [17]:
# --- 5. Generate and Upsert Embeddings ---
# For every product: embed "title. enriched_description" with the text model,
# embed the first product image with the ViT model, concatenate the two
# vectors (text first) and upsert them to Pinecone under the product's
# uniq_id. Products for which no image embedding could be produced are not
# indexed; they are counted and reported at the end.
# NOTE(review): the two embeddings are concatenated as-is, without rescaling;
# confirm the query side builds its vector the same way.

def _metadata_text(value):
    """Return `value` as a string, or '' when it is missing (NaN/None)."""
    return str(value) if pd.notna(value) else ''


def _first_image_url(images_str):
    """Return the first URL of a stringified URL list, or '' if it cannot be parsed."""
    try:
        return ast.literal_eval(images_str)[0].strip()
    except Exception:
        # Not a bare `except:`, so a kernel interrupt is not swallowed here.
        return ''


print("Generating and upserting embeddings to Pinecone...")
batch_size = 32
skipped_ids = []  # uniq_ids of products left out because their image could not be embedded

for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i+batch_size]

    # Create text for embeddings from the title and the new ENRICHED description
    text_for_embeddings = (batch['title'].fillna('') + ". " + batch['enriched_description'].fillna('')).tolist()
    text_embeddings = text_model.encode(text_for_embeddings).tolist()

    image_embeddings = [get_image_embedding(url) for url in batch['images']]

    vectors_to_upsert = []
    for row, text_emb, img_emb in zip(batch.itertuples(), text_embeddings, image_embeddings):
        if img_emb is None:
            skipped_ids.append(row.uniq_id)
            continue

        # Use the enriched description in the metadata
        metadata = {
            'title': _metadata_text(row.title),
            'description': _metadata_text(row.enriched_description),
            'image': _first_image_url(row.images),
            'categories': _metadata_text(row.categories),
            'brand': _metadata_text(row.brand),
            'material': _metadata_text(row.material),
            'color': _metadata_text(row.color),
            'package_dimensions': _metadata_text(row.package_dimensions)
        }
        combined_embedding = text_emb + img_emb.tolist()
        vectors_to_upsert.append((row.uniq_id, combined_embedding, metadata))

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

print("Finished upserting embeddings.")
if skipped_ids:
    print(f"Skipped {len(skipped_ids)} of {len(df)} products with no usable image embedding.")
print(index.describe_index_stats())




Generating and upserting embeddings to Pinecone...


100%|██████████| 10/10 [04:33<00:00, 27.33s/it]


Finished upserting embeddings.
{'dimension': 1152,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305}
