In [None]:
!pip install pandas sentence-transformers pinecone-client tqdm

import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, PodSpec, ServerlessSpec
from tqdm.auto import tqdm
import os
import time

In [None]:
# --- Configuration ---------------------------------------------------------
# The Pinecone API key is a secret, so it is read from the environment instead
# of being stored in the notebook. Export PINECONE_API_KEY before starting the
# kernel (or set it with `%env` in a cell that is not committed).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    print("Warning: PINECONE_API_KEY is not set; connecting to Pinecone will fail.")

INDEX_NAME = "furniture-recommender"       # name of the Pinecone index to create/use
DATASET_FILE = "intern_data_ikarus.csv"    # CSV file the product data is read from
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'  # model name passed to SentenceTransformer

In [None]:
# Product columns that are copied into each vector's metadata when upserting.
METADATA_COLS = [
    'title',
    'brand',
    'price',
    'categories',
    'images',
    'material',
    'color',
    'uniq_id',
]

In [None]:
# Load the product dataset into `df` and show a quick overview of it.
print(f"Loading dataset: {DATASET_FILE}")
try:
    df = pd.read_csv(DATASET_FILE)
except FileNotFoundError:
    print(f"Error: The file {DATASET_FILE} was not found.")
    # Stop here: the code below needs `df`, so carrying on would only surface
    # the problem later as a confusing NameError.
    raise
else:
    print("Data loaded successfully.")

print("\nDataFrame Head:")
print(df.head())

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

In [None]:

# Build the text that is embedded for each product.
#
# Every column that takes part in the concatenation is null-filled first,
# 'title' included: `str + NaN` evaluates to NaN, so a single missing field
# would otherwise turn the whole combined text of that row into NaN.
text_cols = ['title', 'description', 'categories', 'material', 'color']
for col in text_cols:
    df[col] = df[col].fillna('')

df['combined_text'] = (
    df['title'] + ' ' +
    df['description'] + ' ' +
    df['categories'] + ' ' +
    df['material'] + ' ' +
    df['color']
)

# Rows without an id cannot be stored as vectors. `.copy()` makes the filtered
# frame independent, so the column assignment below acts on a real frame.
df = df.dropna(subset=['uniq_id']).copy()
df['uniq_id'] = df['uniq_id'].astype(str) # Ensure unique ID is a string for the vector ID

print(f"\nTotal products for embedding: {len(df)}")
# Guard the preview: indexing row 0 of an empty frame would raise IndexError.
if len(df) > 0:
    print(f"Example combined text: {df['combined_text'].iloc[0][:150]}...")

In [None]:
# Load the sentence-embedding model named in the configuration.
print(f"\nLoading Sentence Transformer model: {EMBEDDING_MODEL_NAME}")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# The vector size is taken from the model itself; the Pinecone index is
# created with this dimension.
EMBEDDING_DIMENSION = model.get_sentence_embedding_dimension()
print(f"Model loaded. Embedding dimension: {EMBEDDING_DIMENSION}")

# Encode every product's combined text in a single call. The upsert step
# pairs the results with the rows of `df` by position.
print("Generating embeddings for all products...")
product_texts = df['combined_text'].tolist()
corpus_embeddings = model.encode(product_texts, show_progress_bar=True)
print("Embeddings generation complete.")

In [None]:

# Connect to Pinecone, create the index if it does not exist yet, and bind
# `index` to it for the upsert step.
INDEX_READY_TIMEOUT_S = 300  # give up waiting for a new index after this many seconds

try:
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # `names` is a method of the list_indexes() result and has to be called;
    # testing membership against the bound method itself raises TypeError.
    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating Pinecone index: {INDEX_NAME}...")

        # Use ServerlessSpec for the free tier/modern deployment
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-west-2')
        )
        print("Index created. Waiting for index initialization...")

        # Poll until the index reports ready, but never wait forever.
        deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
        while not pc.describe_index(INDEX_NAME).status['ready']:
            if time.monotonic() > deadline:
                raise TimeoutError(
                    f"Index '{INDEX_NAME}' was not ready after {INDEX_READY_TIMEOUT_S} seconds."
                )
            time.sleep(1)

    index = pc.Index(INDEX_NAME)
    print(f"Connected to index '{INDEX_NAME}'. Current stats:")
    print(index.describe_index_stats())

except Exception as e:
    print(f"FATAL ERROR: Could not connect or create Pinecone index. Check API key and environment.")
    print(f"Details: {e}")
    # Re-raise: without `index` the upsert step cannot run, so execution must
    # stop here instead of failing later with a NameError.
    raise

In [None]:
# Upload the embeddings to Pinecone in fixed-size batches.
BATCH_SIZE = 100
vectors_to_upsert = []  # every (id, values, metadata) tuple prepared, kept for inspection
failed_batches = []     # batch numbers whose upsert raised

# Resolve once which of the configured metadata columns the frame really has.
metadata_cols = [col for col in METADATA_COLS if col in df.columns]

print("\nStarting batch upsert to Pinecone...")
for i in tqdm(range(0, len(df), BATCH_SIZE)):
    i_end = min(i + BATCH_SIZE, len(df))
    batch = df.iloc[i:i_end]

    # Embeddings were generated in row order, so the same positional slice
    # lines up with the rows of `batch`.
    embedding_batch = corpus_embeddings[i:i_end]

    to_upsert = []
    for (_, row), embedding in zip(batch.iterrows(), embedding_batch):
        # Pinecone does not accept null metadata values, so fields that are
        # missing for this product are left out instead of being sent as NaN.
        metadata = {col: row[col] for col in metadata_cols if pd.notna(row[col])}

        to_upsert.append((str(row['uniq_id']), embedding.tolist(), metadata))

    vectors_to_upsert.extend(to_upsert)

    try:
        index.upsert(vectors=to_upsert)
    except Exception as e:
        # Best effort: one failing batch must not abort the whole upload, but
        # it is recorded so the summary below does not claim full success.
        failed_batches.append(i // BATCH_SIZE)
        print(f"Error during upsert for batch {i//BATCH_SIZE}: {e}")

if failed_batches:
    print(f"\nUpsert finished with {len(failed_batches)} failed batch(es): {failed_batches}")
else:
    print("\nUpsert process complete.")

# NOTE(review): index stats may lag behind very recent upserts — confirm
# before treating this number as final.
final_count = index.describe_index_stats().total_vector_count
print(f"Final total vectors in index: {final_count}")

