In [1]:
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

GPU Available: True
GPU Device: Tesla T4
GPU Memory: 15.83 GB


In [2]:
!pip install -q --upgrade requests>=2.32.5

# Install main packages
!pip install -q sentence-transformers transformers pillow pandas python-dotenv scikit-learn tqdm

# Install langchain packages
!pip install -q langchain langchain-community

# Install Pinecone
!pip install -q pinecone

# Torch is usually pre-installed in Colab, but upgrade if needed
!pip install -q --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

print("✅ All packages installed successfully!")

✅ All packages installed successfully!


In [3]:
from google.colab import files
print("Upload your furniture_data.csv file:")
uploaded = files.upload()


Upload your furniture_data.csv file:


Saving furniture_data.csv to furniture_data (1).csv


In [4]:
import os
# Uncomment and set your API keys
os.environ['PINECONE_API_KEY'] = 'pcsk_4NbMZW_7HXpJj6zrH8A2YzJF2cbyKm3sWkbU2z8ZSZFBAiKbqQnBmK8hHL1KuzfHpADrHv'
os.environ['INDEX_NAME'] = 'furniture-recommender'

In [5]:
os.environ['GOOGLE_API_KEY'] = 'AIzaSyA9AurCVoeWz3xcJKzXmj7J9K-bPfzdQVo'

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
import torch
import torch.cuda
from tqdm.auto import tqdm
import os
from dotenv import load_dotenv
import ast
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import gc

# --- Configuration & GPU Optimization ---
print("=" * 60)
print("CONFIGURATION & GPU SETUP")
print("=" * 60)

# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if device == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Available GPU Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")

    # Enable TF32 for better performance on Ampere GPUs
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Clear GPU cache
    torch.cuda.empty_cache()
    gc.collect()

# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")

if not PINECONE_API_KEY or not INDEX_NAME:
    print("\n⚠️  WARNING: Environment variables not set!")
    print("Please set PINECONE_API_KEY and INDEX_NAME")


CONFIGURATION & GPU SETUP
Using device: cuda
GPU Name: Tesla T4
Total GPU Memory: 15.83 GB
Available GPU Memory: 0.00 GB allocated


In [7]:
# --- 1. Data Loading and Initial Cleaning ---
print("\n" + "=" * 60)
print("STEP 1: DATA LOADING")
print("=" * 60)

df = pd.read_csv("furniture_data.csv")
print(f"Loaded {len(df)} products")
df['uniq_id'] = df['uniq_id'].astype(str)

# Clean price column
df['price_cleaned'] = df['price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
df['price_cleaned'] = pd.to_numeric(df['price_cleaned'], errors='coerce')
print(f"Price cleaned: {df['price_cleaned'].notna().sum()} valid prices")



STEP 1: DATA LOADING
Loaded 312 products
Price cleaned: 215 valid prices


In [8]:
print("STEP 2: LOADING AI MODELS")
print("=" * 60)

print("Loading text embedding model (all-MiniLM-L6-v2)...")
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("Loading image embedding model (ViT)...")
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_model.to(device)
image_model.eval()  # Set to evaluation mode

if device == "cuda":
    print(f"GPU Memory after loading models: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

def get_image_embedding(image_url_str):
    """Generate image embedding with GPU acceleration and error handling"""
    if pd.isna(image_url_str):
        return None
    try:
        image_urls = ast.literal_eval(image_url_str)
        if not isinstance(image_urls, list) or not image_urls:
            return None

        first_image_url = image_urls[0].strip()
        response = requests.get(first_image_url, stream=True, timeout=10)
        image = Image.open(response.raw).convert('RGB')

        inputs = image_processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = image_model(**inputs)

        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

        # Clear GPU cache periodically
        if device == "cuda" and torch.cuda.memory_allocated(0) / 1e9 > 10:
            torch.cuda.empty_cache()

        return embedding

    except Exception as e:
        return None


STEP 2: LOADING AI MODELS
Loading text embedding model (all-MiniLM-L6-v2)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading image embedding model (ViT)...
GPU Memory after loading models: 0.44 GB


In [9]:
print("\n" + "=" * 60)
print("STEP 3: GENERATING EMBEDDINGS")
print("=" * 60)

print("Generating text embeddings...")
text_descriptions = (df['title'].fillna('') + ". " + df['description'].fillna('')).tolist()

# Process in batches for better GPU utilization
batch_size = 32
text_embeddings = []
for i in tqdm(range(0, len(text_descriptions), batch_size), desc="Text Embeddings"):
    batch = text_descriptions[i:i+batch_size]
    batch_embeddings = text_model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
    text_embeddings.extend(batch_embeddings.tolist())

    # Clear GPU cache
    if device == "cuda" and i % 320 == 0:
        torch.cuda.empty_cache()

df['text_embedding'] = text_embeddings

if device == "cuda":
    print(f"GPU Memory after text embeddings: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

print("Generating image embeddings...")
image_embeddings = []
for url in tqdm(df['images'], desc="Image Embeddings"):
    emb = get_image_embedding(url)
    image_embeddings.append(emb)

df['image_embedding'] = image_embeddings

if device == "cuda":
    print(f"GPU Memory after image embeddings: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    torch.cuda.empty_cache()

# Combine embeddings
print("Combining text and image embeddings...")
df['combined_embedding'] = df.apply(
    lambda row: np.concatenate([row['text_embedding'], row['image_embedding']]).tolist()
    if row['image_embedding'] is not None
    else row['text_embedding'],
    axis=1
)

# Prepare embedding matrix with padding
valid_embeddings = [emb for emb in df['combined_embedding'] if emb is not None]
if not valid_embeddings:
    raise ValueError("No valid embeddings generated!")

max_dim = max(len(emb) for emb in valid_embeddings)
print(f"Embedding dimension: {max_dim}")

embedding_matrix = []
for emb in df['combined_embedding']:
    if emb is None:
        embedding_matrix.append(np.zeros(max_dim))
        continue
    if len(emb) < max_dim:
        padding = np.zeros(max_dim - len(emb))
        emb = np.concatenate([emb, padding])
    embedding_matrix.append(emb)

embedding_matrix = np.array(embedding_matrix, dtype=np.float32)
print(f"Embedding matrix shape: {embedding_matrix.shape}")



STEP 3: GENERATING EMBEDDINGS
Generating text embeddings...


Text Embeddings:   0%|          | 0/10 [00:00<?, ?it/s]

GPU Memory after text embeddings: 0.44 GB
Generating image embeddings...


Image Embeddings:   0%|          | 0/312 [00:00<?, ?it/s]

GPU Memory after image embeddings: 0.44 GB
Combining text and image embeddings...
Embedding dimension: 1152
Embedding matrix shape: (312, 1152)


In [10]:
# --- 4. Predictive Imputation for Missing Values ---
print("\n" + "=" * 60)
print("STEP 4: IMPUTING MISSING VALUES")
print("=" * 60)

df_imputed = df.copy()

# Separate categorical and numerical imputation
categorical_cols = ['brand', 'material', 'color', 'manufacturer', 'package_dimensions', 'country_of_origin']
n_neighbors = 5
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
knn.fit(embedding_matrix)



STEP 4: IMPUTING MISSING VALUES


In [11]:
# 4.1: Impute categorical columns using KNN mode
print("\n--- Categorical Imputation ---")
for col in categorical_cols:
    missing_indices = df_imputed[df_imputed[col].isna()].index
    if len(missing_indices) == 0:
        print(f"✓ {col}: No missing values")
        continue

    print(f"Imputing {len(missing_indices)} missing values in '{col}'...")

    distances, indices = knn.kneighbors(embedding_matrix[missing_indices])

    for i, missing_idx in enumerate(missing_indices):
        neighbor_indices = indices[i]
        neighbor_values = df_imputed.iloc[neighbor_indices][col].dropna()

        if not neighbor_values.empty:
            imputed_value = neighbor_values.mode()[0]
            df_imputed.loc[missing_idx, col] = imputed_value



--- Categorical Imputation ---
✓ brand: No missing values
Imputing 94 missing values in 'material'...
Imputing 47 missing values in 'color'...
Imputing 107 missing values in 'manufacturer'...
Imputing 6 missing values in 'package_dimensions'...
Imputing 187 missing values in 'country_of_origin'...


In [12]:
# 4.2: Advanced Price Prediction using Machine Learning
print("\n--- Advanced Price Prediction ---")

# Identify rows with and without prices
price_available = df_imputed['price_cleaned'].notna()
price_missing = df_imputed['price_cleaned'].isna()

n_missing = price_missing.sum()
n_available = price_available.sum()

print(f"Products with prices: {n_available}")
print(f"Products needing price prediction: {n_missing}")

if n_missing > 0 and n_available >= 20:  # Need at least 20 samples to train
    print("\nTraining price prediction model...")

    # Prepare features and target
    X_train = embedding_matrix[price_available]
    y_train = df_imputed.loc[price_available, 'price_cleaned'].values
    X_predict = embedding_matrix[price_missing]

    # Add category-based features for better prediction
    def extract_category_features(df_subset):
        """Extract numerical features from categories"""
        features = []
        for idx in df_subset.index:
            row_features = []

            # Category indicators (encoded as 0/1)
            categories_str = str(df_subset.loc[idx, 'categories']).lower()
            category_keywords = ['furniture', 'bedroom', 'living', 'outdoor', 'office',
                               'dining', 'kitchen', 'storage', 'lighting', 'decor']
            for keyword in category_keywords:
                row_features.append(1 if keyword in categories_str else 0)

            # Brand presence
            row_features.append(1 if pd.notna(df_subset.loc[idx, 'brand']) else 0)

            # Material indicators
            material_str = str(df_subset.loc[idx, 'material']).lower()
            material_keywords = ['wood', 'metal', 'fabric', 'leather', 'glass', 'plastic']
            for keyword in material_keywords:
                row_features.append(1 if keyword in material_str else 0)

            features.append(row_features)

        return np.array(features)

    # Extract additional features
    category_features_train = extract_category_features(df_imputed[price_available])
    category_features_predict = extract_category_features(df_imputed[price_missing])

    # Combine embeddings with category features
    X_train_combined = np.hstack([X_train, category_features_train])
    X_predict_combined = np.hstack([X_predict, category_features_predict])

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_combined)
    X_predict_scaled = scaler.transform(X_predict_combined)

    # Log-transform prices for better distribution (many prices are right-skewed)
    y_train_log = np.log1p(y_train)

    # Try multiple models and use the best one
    models = {
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42, learning_rate=0.1),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
        'Ridge Regression': Ridge(alpha=1.0)
    }

    best_model = None
    best_score = -float('inf')
    best_model_name = None

    # Evaluate models using cross-validation
    if n_available >= 50:  # Only do train-test split if we have enough data
        X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y_train_log, test_size=0.2, random_state=42)

        for name, model in models.items():
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_val)

            # Convert back from log scale
            y_pred_original = np.expm1(y_pred)
            y_val_original = np.expm1(y_val)

            mae = mean_absolute_error(y_val_original, y_pred_original)
            r2 = r2_score(y_val_original, y_pred_original)

            print(f"  {name}: MAE=${mae:.2f}, R²={r2:.3f}")

            if r2 > best_score:
                best_score = r2
                best_model = model
                best_model_name = name
    else:
        # If not enough data for validation, just use Gradient Boosting
        best_model = models['Gradient Boosting']
        best_model_name = 'Gradient Boosting'

    # Train final model on all available data
    print(f"\nUsing {best_model_name} for final predictions...")
    best_model.fit(X_train_scaled, y_train_log)

    # Predict missing prices
    y_predicted_log = best_model.predict(X_predict_scaled)
    y_predicted = np.expm1(y_predicted_log)

    # Apply business logic constraints
    # Use KNN to get price range from similar products
    distances, indices = knn.kneighbors(X_predict)

    for i, missing_idx in enumerate(df_imputed[price_missing].index):
        predicted_price = y_predicted[i]

        # Get neighbor prices for sanity check
        neighbor_indices = indices[i]
        neighbor_prices = df_imputed.iloc[neighbor_indices]['price_cleaned'].dropna()

        if not neighbor_prices.empty:
            # Ensure predicted price is within reasonable range of neighbors
            min_neighbor = neighbor_prices.min()
            max_neighbor = neighbor_prices.max()
            median_neighbor = neighbor_prices.median()

            # If prediction is too far from neighbors, adjust it
            if predicted_price < min_neighbor * 0.5:
                predicted_price = min_neighbor * 0.7
            elif predicted_price > max_neighbor * 2:
                predicted_price = max_neighbor * 1.5

            # Blend with neighbor median for stability (70% model, 30% neighbors)
            predicted_price = 0.7 * predicted_price + 0.3 * median_neighbor

        # Ensure minimum reasonable price
        predicted_price = max(predicted_price, 5.0)

        df_imputed.loc[missing_idx, 'price_cleaned'] = predicted_price

    print(f"✓ Predicted prices for {n_missing} products")
    print(f"  Price range: ${y_predicted.min():.2f} - ${y_predicted.max():.2f}")
    print(f"  Mean predicted price: ${y_predicted.mean():.2f}")

elif n_missing > 0:
    # Fallback to KNN median if not enough training data
    print("\n⚠️  Not enough price data for ML model. Using KNN median fallback...")
    distances, indices = knn.kneighbors(embedding_matrix[price_missing])

    for i, missing_idx in enumerate(df_imputed[price_missing].index):
        neighbor_indices = indices[i]
        neighbor_prices = df_imputed.iloc[neighbor_indices]['price_cleaned'].dropna()

        if not neighbor_prices.empty:
            imputed_value = neighbor_prices.median()
            df_imputed.loc[missing_idx, 'price_cleaned'] = imputed_value

    print(f"✓ Imputed prices using KNN median")

df = df_imputed



--- Advanced Price Prediction ---
Products with prices: 215
Products needing price prediction: 97

Training price prediction model...
  Gradient Boosting: MAE=$31.77, R²=0.156
  Random Forest: MAE=$32.97, R²=0.143
  Ridge Regression: MAE=$30.13, R²=0.394

Using Ridge Regression for final predictions...
✓ Predicted prices for 97 products
  Price range: $9.32 - $241.70
  Mean predicted price: $76.47


In [13]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [14]:
!pip install -q langchain langchain-community langchain-google-genai

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.8.0 which is incompatible.[0m[31m
[0m

In [15]:
!pip install -q google-generativeai

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.12 requires google-ai-generativelanguage<1,>=0.7, but you have google-ai-generativelanguage 0.6.15 which is incompatible.[0m[31m
[0m

In [16]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# --- 5. LLM for Data Enrichment ---
print("\n" + "=" * 60)
print("STEP 5: LLM ENRICHMENT WITH GEMINI 2.0 FLASH")
print("=" * 60)

if not GOOGLE_API_KEY:
    print("⚠️  ERROR: GOOGLE_API_KEY not found!")
    print("Please set your Gemini API key in the environment variables (Cell 4)")
    print("Get your key from: https://makersuite.google.com/app/apikey")
    df['enriched_description'] = df['description']
else:
    try:
        from langchain_google_genai import ChatGoogleGenerativeAI
        import time

        print("✓ Initializing Gemini 2.0 Flash...")
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=GOOGLE_API_KEY,
            temperature=0.3,
            max_tokens=200
        )

        prompt_template = """You are a product catalog manager. Write a clean, concise product description (2-3 sentences max) based on the structured data provided.
Use ONLY the information provided. If information is missing, do not mention it.

Product Data:
- Title: {title}
- Price: {price}
- Categories: {categories}
- Brand: {brand}
- Material: {material}
- Color: {color}
- Dimensions: {dimensions}

Product Description:"""

        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=["title", "price", "categories", "brand", "material", "color", "dimensions"]
        )

        # Create chain using LCEL
        enrichment_chain = prompt | llm | StrOutputParser()

        df['enriched_description'] = df['description']
        rows_to_enrich = df[df['enriched_description'].isna() | (df['enriched_description'].str.split().str.len() < 5)]

        if not rows_to_enrich.empty:
            total_to_enrich = len(rows_to_enrich)
            print(f"Enriching {total_to_enrich} product descriptions...")
            print("⚠️  Free tier limit: 10 requests/minute")
            print(f"Estimated time: ~{(total_to_enrich * 7) / 60:.1f} minutes")
            print("Processing with rate limiting...\n")

            successful_enrichments = 0
            failed_enrichments = 0
            request_count = 0
            start_time = time.time()

            for index, row in tqdm(rows_to_enrich.iterrows(), total=total_to_enrich, desc="Gemini Enrichment"):
                try:
                    # Rate limiting: 10 requests per minute (free tier)
                    if request_count > 0 and request_count % 10 == 0:
                        elapsed = time.time() - start_time
                        if elapsed < 60:
                            wait_time = 62 - elapsed  # Wait 62 seconds to be safe
                            print(f"\n⏳ Rate limit: waiting {wait_time:.0f}s for next batch...")
                            time.sleep(wait_time)
                        start_time = time.time()  # Reset timer

                    price_str = f"${row['price_cleaned']:.2f}" if pd.notna(row['price_cleaned']) else "Not specified"

                    inputs = {
                        'title': str(row['title']) if pd.notna(row['title']) else "Product",
                        'price': price_str,
                        'categories': str(row['categories']) if pd.notna(row['categories']) else "Not specified",
                        'brand': str(row['brand']) if pd.notna(row['brand']) else "Not specified",
                        'material': str(row['material']) if pd.notna(row['material']) else "Not specified",
                        'color': str(row['color']) if pd.notna(row['color']) else "Not specified",
                        'dimensions': str(row['package_dimensions']) if pd.notna(row['package_dimensions']) else "Not specified"
                    }

                    response = enrichment_chain.invoke(inputs)
                    df.loc[index, 'enriched_description'] = response.strip()
                    successful_enrichments += 1
                    request_count += 1

                    # Small delay between requests for stability
                    time.sleep(0.5)

                except Exception as e:
                    failed_enrichments += 1
                    # Use title as fallback description
                    df.loc[index, 'enriched_description'] = str(row['title']) if pd.notna(row['title']) else "Product"
                    if failed_enrichments <= 3:  # Only show first few errors
                        print(f"\n⚠️  Error enriching product {index}: {str(e)[:150]}")

            print(f"\n✓ Successfully enriched: {successful_enrichments} descriptions")
            if failed_enrichments > 0:
                print(f"⚠️  Failed enrichments: {failed_enrichments} (using fallback descriptions)")
        else:
            print("✓ All descriptions are already complete")

    except Exception as e:
        print(f"\n⚠️  Failed to initialize Gemini: {e}")
        print("Using original descriptions without enrichment...")
        df['enriched_description'] = df['description']



STEP 5: LLM ENRICHMENT WITH GEMINI 2.0 FLASH
✓ Initializing Gemini 2.0 Flash...
Enriching 156 product descriptions...
⚠️  Free tier limit: 10 requests/minute
Estimated time: ~18.2 minutes
Processing with rate limiting...



Gemini Enrichment:   0%|          | 0/156 [00:00<?, ?it/s]


⏳ Rate limit: waiting 45s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 47s for next batch...

⏳ Rate limit: waiting 45s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 47s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

⏳ Rate limit: waiting 46s for next batch...

✓ Successfully enriched: 156 descriptions


In [33]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, pinecone-plugin-assi

In [17]:
# --- 6. Pinecone Setup & Upsert ---
print("\n" + "=" * 60)
print("STEP 6: UPLOADING TO PINECONE")
print("=" * 60)

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating new index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=max_dim,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
else:
    print(f"Using existing index: {INDEX_NAME}")

index = pc.Index(INDEX_NAME)

# Upsert in batches
batch_size = 100  # Increased batch size for better performance
for i in tqdm(range(0, len(df), batch_size), desc="Upserting to Pinecone"):
    batch = df.iloc[i:i+batch_size]
    vectors_to_upsert = []

    for _, row in batch.iterrows():
        final_embedding = embedding_matrix[row.name].tolist()

        try:
            first_image = ast.literal_eval(row['images'])[0].strip() if pd.notna(row['images']) else ''
        except:
            first_image = ''

        price_str = f"${row['price_cleaned']:.2f}" if pd.notna(row['price_cleaned']) else "Price not available"

        metadata = {
            'title': str(row['title']) if pd.notna(row['title']) else '',
            'description': str(row['enriched_description']) if pd.notna(row['enriched_description']) else '',
            'image': first_image,
            'price': price_str,
            'categories': str(row['categories']) if pd.notna(row['categories']) else '',
            'brand': str(row['brand']) if pd.notna(row['brand']) else '',
            'material': str(row['material']) if pd.notna(row['material']) else '',
            'color': str(row['color']) if pd.notna(row['color']) else '',
            'package_dimensions': str(row['package_dimensions']) if pd.notna(row['package_dimensions']) else ''
        }

        vectors_to_upsert.append((row['uniq_id'], final_embedding, metadata))

    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

print("\n" + "=" * 60)
print("PIPELINE COMPLETED!")
print("=" * 60)
print("\nIndex Statistics:")
print(index.describe_index_stats())

if device == "cuda":
    print(f"\nFinal GPU Memory Usage: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"Peak GPU Memory Usage: {torch.cuda.max_memory_allocated(0) / 1e9:.2f} GB")

print("\n✅ All data successfully processed and uploaded to Pinecone!")


STEP 6: UPLOADING TO PINECONE
Using existing index: furniture-recommender


Upserting to Pinecone:   0%|          | 0/4 [00:00<?, ?it/s]


PIPELINE COMPLETED!

Index Statistics:
{'dimension': 1152,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 305}},
 'total_vector_count': 305,
 'vector_type': 'dense'}

Final GPU Memory Usage: 0.44 GB
Peak GPU Memory Usage: 0.59 GB

✅ All data successfully processed and uploaded to Pinecone!
