In [40]:
import ast
import io
import os

import pandas as pd
import requests
import torch
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import ViTImageProcessor, ViTModel


# Read the backend folder's .env file so the Pinecone settings become visible via os.getenv
load_dotenv(dotenv_path='../backend/.env')

PINECONE_API_KEY, INDEX_NAME = (
    os.getenv(setting) for setting in ("PINECONE_API_KEY", "INDEX_NAME")
)

# Both settings are mandatory: stop immediately if either one is unset or empty.
if not (PINECONE_API_KEY and INDEX_NAME):
    raise ValueError("Please make sure you have set up your .env file in the 'backend' directory with your Pinecone credentials.")



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.llms.ollama import OllamaEndpointNotFoundError, _OllamaCommon


In [41]:
# --- Load Dataset ---
# Read the catalog CSV and cast the uniq_id column to str in one expression.
df = pd.read_csv("furniture_data.csv").assign(
    uniq_id=lambda frame: frame['uniq_id'].astype(str)
)

In [45]:
# --- 1. LLM for Data Enrichment (Using Local Llama Model) ---
print("Setting up LLM for data enrichment using local Llama model...")

# Requires a local Ollama install with the model available
# (run 'ollama run llama3' in a terminal beforehand).
llm = ChatOllama(temperature=0, model="llama3")

# Prompt text sent to the model; each {placeholder} is filled from one product row.
prompt_template = """
You are a product catalog manager. Your task is to write a clean, consistent product description based on the structured data provided.
Standardize the information and present it in a natural, descriptive paragraph.
Use ONLY the information provided. If a piece of information (like material, color, or dimensions) is missing, do not mention it and do not invent it.

Product Data:
- Title: {title}
- Categories: {categories}
- Brand: {brand}
- Material: {material}
- Color: {color}
- Dimensions: {dimensions}

Generated Description:
"""

prompt = PromptTemplate(
    input_variables=["title", "categories", "brand", "material", "color", "dimensions"],
    template=prompt_template,
)

# Chain that formats the prompt above and sends it to the local model.
enrichment_chain = LLMChain(prompt=prompt, llm=llm)

def enrich_description(row):
    """Return a description for ``row``, generating one with the LLM chain when needed.

    A description that is present and has at least 5 whitespace-separated
    words is returned unchanged. Otherwise the row's title, categories,
    brand, material, color and package_dimensions (missing values become
    '') are passed to ``enrichment_chain`` and the stripped response is
    returned. If anything in that step raises, the error is printed and the
    original description is returned instead.
    """
    current = row['description']

    # Guard clause: descriptions that are present and long enough are kept as-is.
    if not pd.isna(current) and len(str(current).split()) >= 5:
        return current

    try:
        # Prompt variable -> DataFrame column; "dimensions" is fed from package_dimensions.
        prompt_sources = {
            'title': 'title',
            'categories': 'categories',
            'brand': 'brand',
            'material': 'material',
            'color': 'color',
            'dimensions': 'package_dimensions',
        }
        inputs = {
            variable: (row[column] if pd.notna(row[column]) else '')
            for variable, column in prompt_sources.items()
        }
        # Run the LLM chain
        generated = enrichment_chain.run(inputs)
        return generated.strip()
    except Exception as e:
        print(f"Error enriching data for {row['uniq_id']}: {e}")
        return row['description']  # Fall back to the original value on error

# Apply the enrichment
print("Enriching product descriptions where needed...")
df['enriched_description'] = df.apply(enrich_description, axis=1)

# --- Qualitative Validation: Show some examples ---
# Print before/after pairs for the first 5 rows whose description actually changed.
print("\n--- Enrichment Examples ---")
for _, row in df.head(5).iterrows():
    original, enriched = row['description'], row['enriched_description']
    # NaN != NaN evaluates to True, so a missing description whose enrichment
    # failed (and therefore stayed NaN) must be skipped explicitly; otherwise it
    # would be reported as a change from "nan" to "nan".
    if pd.isna(original) and pd.isna(enriched):
        continue
    if original != enriched:
        print(f"Original Description: {original}")
        print(f"Enriched Description: {enriched}\n")
print("-------------------------\n")


Setting up LLM for data enrichment using local Llama model...
Enriching product descriptions where needed...
Error enriching data for b2ede786-3f51-5a45-9a5b-bcf856958cd8: Ollama call failed with status code 500. Details: {"error":"model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details"}
Error enriching data for aba4138e-6401-52ca-a099-02e30b638db4: Ollama call failed with status code 500. Details: {"error":"model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details"}
Error enriching data for b2ede786-3f51-5a45-9a5b-bcf856958cd8: Ollama call failed with status code 500. Details: {"error":"model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details"}
Error enriching data for aba4138e-6401-52ca-a099-02e30b638db4: Ollama call failed with status code 500

KeyboardInterrupt: 

In [35]:
def _first_image_url(image_url_str):
    """Return the first URL from a stringified Python list, or None if there is none.

    Prints a message and returns None when the string is not a valid literal.
    Returns None silently when the literal is not a non-empty list or its
    first element is not a string.
    """
    try:
        # literal_eval only evaluates literals, so it is safe on untrusted text.
        image_urls = ast.literal_eval(image_url_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # All of these are documented failure modes of literal_eval on malformed input.
        print(f"Could not parse image URL string: {image_url_str}")
        return None

    if not isinstance(image_urls, list) or not image_urls:
        return None

    first = image_urls[0]
    if not isinstance(first, str):
        return None
    return first.strip()


def get_image_embedding(image_url_str, timeout=10):
    """
    Parses a string that looks like a list of URLs, fetches the first one,
    and returns its embedding. Handles missing or invalid data gracefully.

    Args:
        image_url_str: String holding a Python list literal of image URLs.
            Any non-string value (NaN, None, an actual list, ...) yields None.
        timeout: Seconds to wait for the image server before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        The mean of the model's ``last_hidden_state`` over dim 1, squeezed and
        converted to a NumPy array, or None if the URL string is unusable or
        downloading/processing the image fails.

    Note:
        Relies on ``image_processor``, ``image_model`` and ``device`` being
        defined at module level before this function is called.
    """
    # Checking the type first also covers NaN/None and avoids calling pd.isna
    # on list-like input, whose element-wise result cannot be used in a boolean test.
    if not isinstance(image_url_str, str):
        return None

    first_image_url = _first_image_url(image_url_str)
    if first_image_url is None:
        return None

    try:
        # Download the whole body with a timeout and fail on HTTP error codes
        # rather than handing an error page to the image decoder.
        response = requests.get(first_image_url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert('RGB')

        inputs = image_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = image_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    except Exception as e:
        # Best-effort: one bad image must not abort the batch.
        print(f"Could not process image URL {image_url_str}: {e}")
        return None



In [36]:
# --- 3. Pinecone Setup ---
print("Initializing Pinecone...")
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Reuse the index when it is already there; otherwise create it first.
index_already_exists = INDEX_NAME in pc.list_indexes().names()
if index_already_exists:
    print(f"Index '{INDEX_NAME}' already exists.")
else:
    print(f"Creating index '{INDEX_NAME}'...")
    # ServerlessSpec is only needed on the creation path, so it is imported here.
    from pinecone import ServerlessSpec

    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # Vector size = text embedding dim (384) + image embedding dim (768)
    pc.create_index(
        name=INDEX_NAME,
        dimension=384 + 768,
        metric='cosine',
        spec=serverless_spec,
    )
    print("Index created successfully.")

index = pc.Index(INDEX_NAME)
print("Pinecone setup complete.")


Initializing Pinecone...
Index 'furniture-recommender' already exists.
Pinecone setup complete.


In [39]:
# --- 4. Generate and Upsert Embeddings ---
print("Generating and upserting embeddings to Pinecone...")
# Number of DataFrame rows processed per iteration of the upsert loop below
# (used both as the range step and as the slice width).
batch_size = 32

# Function to create a rich text representation for embedding
def create_text_for_embedding(row):
    """Build the single text string that is embedded for one product row.

    The result joins, with ". ", the title, the description and the labelled
    categories, brand, material, color and package dimensions. Missing (NaN /
    None / absent) fields are skipped.

    The LLM-enriched description is preferred when the row has a non-empty
    ``enriched_description``; otherwise the raw ``description`` is used, so
    the function still works when the enrichment step was skipped or did not
    finish.

    Args:
        row: A mapping-like row (e.g. a ``pd.Series``) supporting ``.get``.

    Returns:
        The combined text, or "" when every field is missing.
    """
    def value_of(column):
        # .get tolerates absent columns; NaN/None are normalised to None.
        value = row.get(column)
        return None if pd.isna(value) else value

    description = value_of('enriched_description')
    if description is None or not str(description).strip():
        description = value_of('description')

    labelled_values = (
        ("{}", value_of('title')),
        ("{}", description),
        ("Categories: {}", value_of('categories')),
        ("Brand: {}", value_of('brand')),
        ("Material: {}", value_of('material')),
        ("Color: {}", value_of('color')),
        ("Dimensions: {}", value_of('package_dimensions')),
    )
    parts = [
        template.format(str(value))
        for template, value in labelled_values
        if value is not None
    ]
    # Drop empty strings so blank fields do not produce stray separators.
    return ". ".join(part for part in parts if part)

# Embed the catalog batch by batch and upsert the combined vectors to Pinecone.
# Relies on `text_model`, `index`, `get_image_embedding` and
# `create_text_for_embedding` having been defined by earlier cells.
for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i+batch_size]

    # Create rich text for embeddings
    text_for_embeddings = batch.apply(create_text_for_embedding, axis=1).tolist()
    text_embeddings = text_model.encode(text_for_embeddings).tolist()

    # Image embeddings (None for rows whose image could not be fetched or processed)
    image_embeddings = [get_image_embedding(url) for url in batch['images']]

    # Combine and upsert
    vectors_to_upsert = []
    for row, text_emb, img_emb in zip(batch.itertuples(), text_embeddings, image_embeddings):
        # Only upsert if we successfully generated an image embedding
        if img_emb is None:
            continue

        # The stored vector is the text embedding followed by the image embedding.
        combined_embedding = text_emb + img_emb.tolist()

        # Safely get the first image for metadata
        try:
            first_image = ast.literal_eval(row.images)[0].strip()
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop a long-running upload.
            first_image = ''

        # Create richer metadata dictionary (missing values are stored as '')
        metadata = {
            'title': str(row.title) if pd.notna(row.title) else '',
            'description': str(row.description) if pd.notna(row.description) else '',
            'image': first_image,
            'categories': str(row.categories) if pd.notna(row.categories) else '',
            'brand': str(row.brand) if pd.notna(row.brand) else '',
            'material': str(row.material) if pd.notna(row.material) else '',
            'color': str(row.color) if pd.notna(row.color) else '',
            'package_dimensions': str(row.package_dimensions) if pd.notna(row.package_dimensions) else ''
        }
        vectors_to_upsert.append((row.uniq_id, combined_embedding, metadata))

    # Skip the API call when no row in this batch produced an image embedding.
    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

print("Finished upserting embeddings.")
print(index.describe_index_stats())



Generating and upserting embeddings to Pinecone...


100%|██████████| 10/10 [01:42<00:00, 10.26s/it]


Finished upserting embeddings.
{'dimension': 1152,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305,
 'vector_type': 'dense'}
