In [25]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
import torch
from tqdm import tqdm
import os
from dotenv import load_dotenv
import ast
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import numpy as np
from sklearn.neighbors import NearestNeighbors

# --- Configuration & Initialization ---
# Populate the process environment from the backend's .env file, then read
# the two Pinecone settings from it (each is None when the variable is unset).
load_dotenv(dotenv_path='../backend/.env')
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = os.environ.get("INDEX_NAME")


In [26]:
# --- 1. Data Loading and Initial Cleaning ---
print("Loading data from furniture_data.csv...")
# Read the catalogue and force the product id to a string in one chain.
df = pd.read_csv("furniture_data.csv").astype({'uniq_id': str})

# Clean price column: strip every '$' and ',' in a single pass, then parse.
# Anything that still is not a number becomes NaN (errors='coerce').
price_text = df['price'].str.replace(r'[$,]', '', regex=True)
df['price_cleaned'] = pd.to_numeric(price_text, errors='coerce')


Loading data from furniture_data.csv...


In [27]:
# --- 2. Load AI Models ---
print("Loading text and image embedding models...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# The image processor and the ViT backbone are loaded from the same checkpoint.
_vit_checkpoint = 'google/vit-base-patch16-224-in21k'
image_processor = ViTImageProcessor.from_pretrained(_vit_checkpoint)
image_model = ViTModel.from_pretrained(_vit_checkpoint)

# Run the image model on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
image_model.to(device)

def get_image_embedding(image_url_str):
    """Embed the first image listed in a product's ``images`` field.

    Args:
        image_url_str: String holding a Python list literal of image URLs
            (e.g. ``"['https://... ', 'https://...']"``), or a missing value.

    Returns:
        A NumPy array with the mean-pooled ``last_hidden_state`` of
        ``image_model`` for the first URL, or ``None`` when the value is
        missing, is not a non-empty list, the first URL is blank, or the
        download / decoding / inference fails for any reason.
    """
    import io  # local import: only this function needs it

    if pd.isna(image_url_str):
        return None
    try:
        image_urls = ast.literal_eval(image_url_str)
        if not isinstance(image_urls, list) or not image_urls:
            return None
        first_image_url = image_urls[0].strip()
        if not first_image_url:
            return None

        # Fetch the whole body instead of reading `response.raw`: the raw
        # stream is not content-decoded and, with stream=True, keeps the
        # connection open until it is explicitly closed.
        response = requests.get(first_image_url, timeout=5)
        response.raise_for_status()  # do not try to decode an HTTP error page
        image = Image.open(io.BytesIO(response.content)).convert('RGB')

        inputs = image_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = image_model(**inputs)
        # Average over the token axis, drop the batch axis, move to NumPy.
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    except Exception:
        # Deliberately best-effort: a product whose image cannot be embedded
        # is kept in the dataset with a text-only embedding.
        return None


Loading text and image embedding models...




In [28]:
# --- 3. Generate Base Embeddings for Imputation ---
print("Generating base text and image embeddings for all products...")
df['text_embedding'] = text_model.encode(
    (df['title'].fillna('') + ". " + df['description'].fillna('')).tolist(),
    show_progress_bar=True
).tolist()

df['image_embedding'] = [get_image_embedding(url) for url in tqdm(df['images'], desc="Generating Image Embeddings")]

if df.empty:
    raise ValueError("No valid embeddings could be generated. Check data and model loading.")

# get_image_embedding() returns None on any failure, so surface how many rows
# ended up without an image vector instead of letting that pass unnoticed.
n_missing_images = sum(emb is None for emb in df['image_embedding'])
print(f"Image embedding unavailable for {n_missing_images} of {len(df)} products; using text-only vectors for those.")

# Row layout is [text | image]; rows without an image keep only the text part.
df['combined_embedding'] = df.apply(
    lambda row: np.concatenate([row['text_embedding'], row['image_embedding']]).tolist() if row['image_embedding'] is not None else row['text_embedding'],
    axis=1
)

# Take the vector width from the models rather than from the longest row seen:
# if every image download failed, a data-derived width would silently shrink
# to the text width and the Pinecone index would be created with it.
text_dim = len(df['text_embedding'].iloc[0])
image_dim = image_model.config.hidden_size  # last axis of last_hidden_state
max_dim = text_dim + image_dim

# Pre-allocate zeros and copy each vector into the leading columns, which
# right-pads text-only rows with zeros up to max_dim.
embedding_matrix = np.zeros((len(df), max_dim))
for row_pos, emb in enumerate(df['combined_embedding']):
    embedding_matrix[row_pos, :len(emb)] = emb


Generating base text and image embeddings for all products...


Batches: 100%|██████████| 10/10 [00:02<00:00,  3.62it/s]
Generating Image Embeddings: 100%|██████████| 312/312 [01:33<00:00,  3.34it/s]


In [29]:
# --- 4. Predictive Imputation for Missing Values ---
print("Predicting missing values using KNN based on embeddings...")
# Expanded list of columns to impute, now including price
impute_cols = ['brand', 'material', 'color', 'manufacturer', 'package_dimensions', 'country_of_origin', 'price_cleaned']
df_imputed = df.copy()

n_neighbors = 5
# Every query row is also part of the fitted matrix, so it comes back as its
# own nearest neighbour (and its value is missing by definition). Ask for one
# extra neighbour and drop the row itself so that n_neighbors real donors
# remain. The count is capped by the number of fitted rows.
n_query = min(n_neighbors + 1, len(embedding_matrix))
knn = NearestNeighbors(n_neighbors=n_query, metric='cosine')
knn.fit(embedding_matrix)

for col in impute_cols:
    # Work with row positions throughout: embedding_matrix is positional, so
    # index labels must not be used to address it.
    missing_positions = np.flatnonzero(df_imputed[col].isna().to_numpy())
    print(f"Found {len(missing_positions)} missing values in '{col}'. Imputing...")

    if len(missing_positions) == 0:
        continue

    _, neighbor_positions = knn.kneighbors(embedding_matrix[missing_positions])
    col_pos = df_imputed.columns.get_loc(col)

    for row_pos, candidates in zip(missing_positions, neighbor_positions):
        donors = candidates[candidates != row_pos][:n_neighbors]
        # Values are read from df_imputed, so a value filled in earlier in
        # this loop can act as a donor for a later row of the same column.
        neighbor_values = df_imputed.iloc[donors][col].dropna()

        if neighbor_values.empty:
            continue  # no donor has a value; leave the cell missing

        # Use median for numerical columns like price, and mode for categorical columns
        if col == 'price_cleaned':
            imputed_value = neighbor_values.median()
        else:
            imputed_value = neighbor_values.mode().iloc[0]
        df_imputed.iloc[row_pos, col_pos] = imputed_value

df = df_imputed

Predicting missing values using KNN based on embeddings...
Found 0 missing values in 'brand'. Imputing...
Found 94 missing values in 'material'. Imputing...
Found 47 missing values in 'color'. Imputing...
Found 107 missing values in 'manufacturer'. Imputing...
Found 6 missing values in 'package_dimensions'. Imputing...
Found 187 missing values in 'country_of_origin'. Imputing...
Found 97 missing values in 'price_cleaned'. Imputing...


In [30]:
# --- 5. LLM for Data Enrichment ---
print("Enriching descriptions with LLM (Ollama phi3:mini)...")
llm = ChatOllama(model="phi3:mini", temperature=0)
prompt_template = """
You are a product catalog manager. Your task is to write a clean, consistent product description based on the structured data provided.
Standardize the information and present it in a natural, descriptive paragraph.
Use ONLY the information provided. If a piece of information is missing, do not mention it.

Product Data:
- Title: {title}
- Price: {price}
- Categories: {categories}
- Brand: {brand}
- Material: {material}
- Color: {color}
- Dimensions: {dimensions}

Generated Description:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["title", "price", "categories", "brand", "material", "color", "dimensions"])
enrichment_chain = LLMChain(llm=llm, prompt=prompt)

# Start from the original description; only rows whose description is missing
# or shorter than five words are sent to the LLM.
df['enriched_description'] = df['description']
rows_to_enrich = df[df['enriched_description'].isna() | (df['enriched_description'].str.split().str.len() < 5)]

if not rows_to_enrich.empty:
    for index, row in tqdm(rows_to_enrich.iterrows(), total=len(rows_to_enrich), desc="Enriching Descriptions"):
        price_str = f"${row['price_cleaned']:.2f}" if pd.notna(row['price_cleaned']) else "Not specified"
        raw_inputs = {
            'title': row['title'],
            'price': price_str,
            'categories': row['categories'],
            'brand': row['brand'],
            'material': row['material'],
            'color': row['color'],
            'dimensions': row['package_dimensions']
        }
        # The prompt declares all seven variables, so every key must be
        # present: removing a key for a missing value makes the chain raise
        # "Missing some input keys". Substitute the same placeholder already
        # used for a missing price instead.
        inputs = {k: (v if pd.notna(v) else "Not specified") for k, v in raw_inputs.items()}
        response = enrichment_chain.run(inputs)
        df.loc[index, 'enriched_description'] = response.strip()


Enriching descriptions with LLM (Ollama phi3:mini)...


Enriching Descriptions:  19%|█▉        | 30/156 [10:22<43:34, 20.75s/it]


KeyboardInterrupt: 

In [None]:
# --- 6. Pinecone Setup & Upsert ---
print("Initializing Pinecone and upserting data...")
from pinecone import Pinecone, ServerlessSpec

# Fail fast with a clear message instead of a client error further down.
if not PINECONE_API_KEY or not INDEX_NAME:
    raise ValueError("PINECONE_API_KEY and INDEX_NAME must be set (expected in ../backend/.env).")


def _first_image_url(raw_images):
    """Return the first URL from a stringified list of URLs, or '' if unavailable."""
    try:
        if pd.isna(raw_images):
            return ''
        return ast.literal_eval(raw_images)[0].strip()
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError, AttributeError):
        # Malformed literal, empty list, or a non-string first element.
        return ''


def _text_or_empty(value):
    """Return str(value), or '' when the value is missing."""
    return str(value) if pd.notna(value) else ''


pc = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(name=INDEX_NAME, dimension=max_dim, metric='cosine', spec=ServerlessSpec(cloud='aws', region='us-east-1'))
index = pc.Index(INDEX_NAME)

batch_size = 32
for start in tqdm(range(0, len(df), batch_size), desc="Upserting to Pinecone"):
    batch = df.iloc[start:start+batch_size]
    vectors_to_upsert = []

    for offset, (_, row) in enumerate(batch.iterrows()):
        # embedding_matrix is ordered by row position, so address it by
        # position rather than by the DataFrame index label.
        final_embedding = embedding_matrix[start + offset].tolist()

        price_str = f"${row['price_cleaned']:.2f}" if pd.notna(row['price_cleaned']) else "Price not available"

        metadata = {
            'title': _text_or_empty(row['title']),
            'description': _text_or_empty(row['enriched_description']),
            'image': _first_image_url(row['images']),
            'price': price_str,
            'categories': _text_or_empty(row['categories']),
            'brand': _text_or_empty(row['brand']),
            'material': _text_or_empty(row['material']),
            'color': _text_or_empty(row['color']),
            'package_dimensions': _text_or_empty(row['package_dimensions'])
        }

        # Vectors are keyed by uniq_id.
        vectors_to_upsert.append((row['uniq_id'], final_embedding, metadata))

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

print("Finished upserting all data to Pinecone.")
print(index.describe_index_stats())



Generating and upserting embeddings to Pinecone...


100%|██████████| 11/11 [01:47<00:00,  9.78s/it]


Finished upserting embeddings.
{'dimension': 1152,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305}
