In [18]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
import torch
from tqdm import tqdm
import os
from dotenv import load_dotenv
import ast
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# --- File Paths ---
# Source catalogue and the checkpoint file that stores enriched descriptions.
original_csv_path = "furniture_data.csv"
enriched_csv_path = "furniture_data_enriched.csv"

# --- Configuration ---
# Environment variables are loaded from the backend's .env file before being read.
load_dotenv(dotenv_path='../backend/.env')

# Both values are None when the corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = os.environ.get("INDEX_NAME")

In [19]:
# --- Load Dataset ---
# Read from the configured path instead of repeating the file name, and parse
# uniq_id as text up front so type inference cannot rewrite the IDs (for
# example by dropping leading zeros) before they are used as merge keys and
# as vector IDs.
df = pd.read_csv(original_csv_path, dtype={'uniq_id': str})
# Keep the original guarantee that every uniq_id is a str (missing IDs become 'nan').
df['uniq_id'] = df['uniq_id'].astype(str)

In [20]:
# Strip the currency symbol and thousands separators from the raw price text,
# then convert to a number; anything that still fails to parse becomes NaN.
price_text = df['price'].str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
df['price_cleaned'] = pd.to_numeric(price_text, errors='coerce')


In [21]:
# --- 1. LLM for Data Enrichment (Using Ollama) ---
print("Setting up LLM for data enrichment using Ollama (phi3:mini)...")
# Using a local model to avoid rate limits
llm = ChatOllama(model="phi3:mini", temperature=0)

prompt_template = """
You are a product catalog manager. Your task is to write a clean, consistent product description based on the structured data provided.
Standardize the information and present it in a natural, descriptive paragraph.
Use ONLY the information provided. If a piece of information (like material, color, or dimensions) is missing, do not mention it and do not invent it.

Product Data:
- Title: {title}
- Categories: {categories}
- Brand: {brand}
- Material: {material}
- Color: {color}
- Dimensions: {dimensions}

Generated Description:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["title", "categories", "brand", "material", "color", "dimensions"])
enrichment_chain = LLMChain(llm=llm, prompt=prompt)

# Prompt variable -> DataFrame column that supplies its value.
prompt_field_columns = {
    'title': 'title',
    'categories': 'categories',
    'brand': 'brand',
    'material': 'material',
    'color': 'color',
    'dimensions': 'package_dimensions',
}

# --- Save and Resume Logic ---
if os.path.exists(enriched_csv_path):
    print(f"Found existing enriched data at '{enriched_csv_path}'. Resuming...")
    # Read the key as text so it has the same dtype as df['uniq_id'] (cast to str
    # at load time); a numeric-looking ID column would otherwise fail to merge.
    enriched_df = pd.read_csv(enriched_csv_path, dtype={'uniq_id': str})
    # One description per ID: duplicate keys in the checkpoint would multiply rows.
    previous_descriptions = (
        enriched_df[['uniq_id', 'enriched_description']]
        .drop_duplicates(subset='uniq_id', keep='last')
    )
    # Drop any 'enriched_description' left over from an earlier run of this cell;
    # otherwise the merge emits '_x'/'_y' suffixed columns and the lookups below
    # raise KeyError.
    df = pd.merge(
        df.drop(columns=['enriched_description'], errors='ignore'),
        previous_descriptions,
        on='uniq_id',
        how='left',
    )
else:
    print("No existing enriched data found. Starting from scratch.")
    df['enriched_description'] = df['description']

# A row needs enrichment when its description is missing or shorter than 5 words.
# Missing values are counted as zero words so the .str accessor also works when
# the whole column is NaN.
current_descriptions = df['enriched_description']
word_counts = current_descriptions.fillna('').astype(str).str.split().str.len()
rows_to_enrich = df[current_descriptions.isna() | (word_counts < 5)]
print(f"Found {len(rows_to_enrich)} rows to enrich.")

if not rows_to_enrich.empty:
    print("Enriching product descriptions where needed...")
    row_iterator = tqdm(rows_to_enrich.iterrows(), total=len(rows_to_enrich))
    for processed, (index, row) in enumerate(row_iterator, start=1):
        try:
            # Missing fields are sent as empty strings, never as NaN.
            inputs = {
                field: row[column] if pd.notna(row[column]) else ''
                for field, column in prompt_field_columns.items()
            }
            response = enrichment_chain.run(inputs)
            df.loc[index, 'enriched_description'] = response.strip()
        except Exception as e:
            print(f"\nAn error occurred while enriching data for {row['uniq_id']}: {e}")
            print("Stopping enrichment process. Saving current progress.")
            break

        # Checkpoint every 10 processed rows. Counting processed rows (not the
        # DataFrame label) keeps the cadence regular even though the rows being
        # enriched are a sparse subset of df.
        if processed % 10 == 0:
            df.to_csv(enriched_csv_path, index=False)

    df.to_csv(enriched_csv_path, index=False)
    print("Enrichment process finished. Final results saved.")


Setting up LLM for data enrichment using Ollama (phi3:mini)...
Found existing enriched data at 'furniture_data_enriched.csv'. Resuming...
Found 0 rows to enrich.


In [22]:
# --- Model Loading ---
print("Loading text and image models...")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence encoder used for the text half of each product vector.
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# ViT preprocessor and backbone used for the image half; the backbone is
# moved to the selected device as soon as it is loaded.
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device)

def get_image_embedding(image_url_str, timeout=10):
    """Embed the first image listed in a product's image-URL field.

    Args:
        image_url_str: String holding a Python-literal list of image URLs
            (parsed with ``ast.literal_eval``), or a missing value.
        timeout: Seconds to wait for the image download before giving up.
            Without a timeout a single unresponsive host stalls the whole run.

    Returns:
        A NumPy array: the ViT ``last_hidden_state`` averaged over dim 1 and
        squeezed. ``None`` when the field is missing, the list is empty, or
        anything fails while downloading or processing the image (the error
        is printed; this function is deliberately best-effort).
    """
    from io import BytesIO  # local import keeps this cell self-contained

    if pd.isna(image_url_str):
        return None
    try:
        image_urls = ast.literal_eval(image_url_str)
        if not image_urls:
            return None
        first_image_url = image_urls[0].strip()

        # Download the full body instead of reading the raw stream: this applies
        # content-encoding decoding, releases the connection, and lets HTTP
        # errors surface as such rather than as image-decoding failures.
        response = requests.get(first_image_url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')

        inputs = image_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = image_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    except Exception as e:
        print(f"Could not process image URL {image_url_str}: {e}")
        return None

Loading text and image models...




In [23]:
# --- 3. Pinecone Setup ---
from pinecone import Pinecone, ServerlessSpec

print("Initializing Pinecone...")

# Fail fast with a clear message: a missing key or index name (os.getenv
# returns None) would otherwise surface later as an opaque client/API error.
if not PINECONE_API_KEY or not INDEX_NAME:
    raise ValueError(
        "PINECONE_API_KEY and INDEX_NAME must both be set "
        "(they are loaded from '../backend/.env')."
    )

# Each stored vector is the text embedding followed by the image embedding.
TEXT_EMBEDDING_DIM = 384
IMAGE_EMBEDDING_DIM = 768

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if it doesn't exist
if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating index '{INDEX_NAME}'...")
    pc.create_index(
        name=INDEX_NAME,
        dimension=TEXT_EMBEDDING_DIM + IMAGE_EMBEDDING_DIM,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    print("Index created successfully.")
else:
    print(f"Index '{INDEX_NAME}' already exists.")

index = pc.Index(INDEX_NAME)
print("Pinecone setup complete.")

Initializing Pinecone...
Index 'furniture-recommender' already exists.
Pinecone setup complete.


In [None]:
# --- Generate and Upsert Embeddings ---
print("Generating and upserting embeddings to Pinecone...")
batch_size = 32
# Rows left out of the index because no image embedding could be computed.
skipped_rows = 0


def _text_or_empty(value):
    """Return ``str(value)``, or an empty string when the value is missing."""
    return str(value) if pd.notna(value) else ''


for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i+batch_size]

    # Create text for embeddings from the title and the new ENRICHED description
    text_for_embeddings = (batch['title'].fillna('') + ". " + batch['enriched_description'].fillna('')).tolist()
    text_embeddings = text_model.encode(text_for_embeddings).tolist()

    image_embeddings = [get_image_embedding(url) for url in batch['images']]

    vectors_to_upsert = []
    for idx, row in enumerate(batch.itertuples()):
        img_emb = image_embeddings[idx]
        if img_emb is None:
            # Only rows with both a text and an image embedding are indexed.
            skipped_rows += 1
            continue

        # Stored vector = text embedding followed by image embedding.
        combined_embedding = text_embeddings[idx] + img_emb.tolist()
        try:
            first_image = ast.literal_eval(row.images)[0].strip()
        except Exception:
            # Fall back to an empty URL on any parsing problem. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
            first_image = ''
        price_str = f"${row.price_cleaned:.2f}" if pd.notna(row.price_cleaned) else "Price not available"
        # Use the enriched description in the metadata
        metadata = {
            'title': _text_or_empty(row.title),
            'description': _text_or_empty(row.enriched_description),
            'image': first_image,
            'price': price_str,
            'categories': _text_or_empty(row.categories),
            'brand': _text_or_empty(row.brand),
            'material': _text_or_empty(row.material),
            'color': _text_or_empty(row.color),
            'package_dimensions': _text_or_empty(row.package_dimensions)
        }
        vectors_to_upsert.append((row.uniq_id, combined_embedding, metadata))

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

print("Finished upserting embeddings.")
if skipped_rows:
    # Surface the gap between the number of rows in df and the index's vector count.
    print(f"Skipped {skipped_rows} of {len(df)} rows that had no usable image embedding.")
print(index.describe_index_stats())




Generating and upserting embeddings to Pinecone...


100%|██████████| 11/11 [01:47<00:00,  9.78s/it]


Finished upserting embeddings.
{'dimension': 1152,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305}
