In [1]:
!unzip model.zip

Archive:  model.zip
   creating: fine_tuned_furniture_model/
  inflating: fine_tuned_furniture_model/config.json  
  inflating: fine_tuned_furniture_model/model.safetensors  
  inflating: fine_tuned_furniture_model/modules.json  
  inflating: fine_tuned_furniture_model/tokenizer.json  
  inflating: fine_tuned_furniture_model/vocab.txt  
  inflating: fine_tuned_furniture_model/README.md  
  inflating: fine_tuned_furniture_model/sentence_bert_config.json  
  inflating: fine_tuned_furniture_model/config_sentence_transformers.json  
   creating: fine_tuned_furniture_model/1_Pooling/
  inflating: fine_tuned_furniture_model/1_Pooling/config.json  
  inflating: fine_tuned_furniture_model/tokenizer_config.json  
   creating: fine_tuned_furniture_model/2_Normalize/
  inflating: fine_tuned_furniture_model/special_tokens_map.json  


In [None]:
# ---
# Main FastAPI Application
# ---
# This script sets up the backend server for the product recommendation web app.
# It includes two main endpoints:
# 1. /recommend: Accepts a user query, finds similar products using a multi-modal embedding
#    search in Pinecone, and generates new descriptions using LangChain.
# 2. /analytics: Provides summary statistics about the product dataset.
# ---

# 1. Import necessary libraries
# Install with (note: the `pinecone-client` package has been renamed to `pinecone`):
# pip install "pinecone" "fastapi" "uvicorn" "python-dotenv" "pandas" "numpy" "torch" "sentence-transformers" "langchain" "langchain-community" "transformers" "accelerate" "bitsandbytes"
import os
import pandas as pd
import numpy as np
import torch
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from dotenv import load_dotenv

# Import Pinecone class
from pinecone import Pinecone

from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# 2. Setup and Initialization

# --- Application Setup ---
# The ASGI application object; every route below is registered on it.
app = FastAPI(
    title="Product Recommendation API",
    description="API for furniture product recommendations and analytics.",
    version="1.0.0"
)

# --- CORS (Cross-Origin Resource Sharing) ---
# Allows the frontend (running on a different port) to communicate with this backend.
# A wildcard origin must not be combined with allow_credentials=True: that would let
# any website make credentialed requests against this API. None of the endpoints in
# this file read cookies or auth headers, so credentials are switched off. If they are
# ever needed, list the frontend origin(s) explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, restrict this to your frontend's domain
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- Environment and Device Setup ---
load_dotenv()  # read variables from a local .env file, if one exists
# Choose the torch device name and the matching integer device id in one step:
# ('cuda', 0) when a CUDA device is available, ('cpu', -1) otherwise.
device_str, device = ('cuda', 0) if torch.cuda.is_available() else ('cpu', -1)
print(f"--- Using device: {device_str} ---")

# --- Load Data ---
# This dataframe acts as our main database to retrieve product details.
def load_product_data(path):
    """Read the cleaned product CSV and index it by ``uniq_id``.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame indexed by ``uniq_id``, or ``None`` when the file does not
        exist (an error line is printed in that case).
    """
    try:
        # Read uniq_id as text: the recommend endpoint looks rows up with the string
        # IDs returned by Pinecone, so dtype inference (e.g. "001" -> 1) must not
        # change the labels.
        frame = pd.read_csv(path, dtype={'uniq_id': str})
    except FileNotFoundError:
        print(f"ERROR: Data file not found at {path}. Please ensure the path is correct.")
        return None
    # Set 'uniq_id' as the index for quick lookups (returns a new frame, no in-place mutation).
    frame = frame.set_index('uniq_id')
    print("--- Cleaned dataset loaded successfully. ---")
    return frame


# The path can be overridden with the DATA_PATH environment variable; the default is
# the original Colab location.
DATA_PATH = os.environ.get("DATA_PATH", '/content/cleaned_intern_data.csv')
df = load_product_data(DATA_PATH)

# --- AI Model Loading ---
print("--- Loading AI models... This may take a moment. ---")
# Text Embedding Model (our fine-tuned version). The location can be overridden with
# the TEXT_MODEL_PATH environment variable; the default is the original Colab path.
TEXT_MODEL_PATH = os.environ.get("TEXT_MODEL_PATH", '/content/fine_tuned_furniture_model')
text_model = SentenceTransformer(TEXT_MODEL_PATH, device=device_str)
# Image Embedding Model (pre-trained CLIP)
image_model = SentenceTransformer('clip-ViT-B-32', device=device_str)
print("--- Embedding models loaded. ---")

# Generative AI Model for Descriptions (using LangChain).
# Generation settings belong in pipeline_kwargs (forwarded to the pipeline call);
# model_kwargs is used when constructing the model. do_sample=True is required for
# the temperature to have any effect -- without it decoding is greedy.
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    pipeline_kwargs={"do_sample": True, "temperature": 0.6, "max_length": 64},
    device=device, # Use -1 for CPU
)

# Prompt asking the model for a one-sentence rewrite of a product's description.
prompt_template = PromptTemplate(
    input_variables=["title", "original_description"],
    template="Generate a creative and appealing one-sentence product description for a piece of furniture titled '{title}'. The original description is: '{original_description}'"
)

# Chain that fills the prompt and runs it through the LLM; used by /recommend.
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
print("--- Generative AI model loaded. ---")


# --- Vector Database (Pinecone) Initialization ---
# The API key is read only from the environment (or the .env file loaded earlier); it
# must never be committed to source.
# SECURITY: a real key was previously hard-coded here as a fallback value. Treat that
# key as compromised and rotate it in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-east-1")
INDEX_NAME = "product-recommendations"

if not PINECONE_API_KEY:
    # Degrade the same way a missing index does: /recommend returns no products.
    print("ERROR: PINECONE_API_KEY is not set. Recommendations are disabled.")
    pc = None
    index = None
else:
    # Initialize the Pinecone client
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

    # Only open the index if it exists in this project
    if INDEX_NAME in pc.list_indexes().names():
        index = pc.Index(INDEX_NAME)
        print("--- Pinecone index loaded successfully. ---")
    else:
        print(f"ERROR: Pinecone index '{INDEX_NAME}' not found.")
        index = None

# --- Get model dimensions for query embedding ---
# Ask each embedding model for its output vector size (text first, then image).
text_embedding_dim, image_embedding_dim = (
    encoder.get_sentence_embedding_dimension()
    for encoder in (text_model, image_model)
)


# 3. Pydantic Models for Data Validation

class QueryRequest(BaseModel):
    """Request model for the /recommend endpoint.

    Carries the single piece of user input that the endpoint embeds and
    searches with.
    """
    # Free-text search query; /recommend passes it to the text embedding model.
    query: str

class Product(BaseModel):
    """Response model for a single product.

    The /recommend handler builds one instance per recommended dataset row.
    """
    # Product identifier; the handler fills it from the dataframe index (`uniq_id`).
    id: str
    # Value of the dataset's "title" column.
    title: str
    # Price rendered as text by the handler; the annotation also admits None.
    price: str | None
    # First entry of the comma-separated "images" column (presumably a URL).
    image_url: str
    # Description text produced by the LLM chain for this product.
    generated_description: str

class RecommendationResponse(BaseModel):
    """Response model for the /recommend endpoint.

    Wraps the list of recommended products returned to the client.
    """
    # Recommended products; empty when the dataset or Pinecone index is unavailable.
    products: list[Product]

class AnalyticsResponse(BaseModel):
    """Response model for the /analytics endpoint.

    All three fields are empty dicts when the dataset could not be loaded.
    """
    # Counts for the 10 most frequent values of the "brand" column.
    brand_counts: dict
    # Counts for the 10 most frequent values of the "categories" column.
    category_counts: dict
    # "mean", "median" and "max" of the "price_cleaned" column, rounded to 2 decimals.
    price_stats: dict


# 4. API Endpoints

@app.get("/")
def read_root():
    """Return a static greeting so a client can confirm the API is reachable."""
    greeting = "Welcome to the Product Recommendation API"
    return {"message": greeting}

@app.post("/recommend", response_model=RecommendationResponse)
async def recommend_products(request: QueryRequest):
    """
    Recommends products based on a user's text query.

    The query is embedded with the text model and padded with a zero image
    vector so it matches the concatenated text+image format stored in Pinecone.
    The top 5 matches are looked up in the product dataframe, and a new
    description is generated for each one with the LLM chain.

    Args:
        request: Request body carrying the user's free-text ``query``.

    Returns:
        A dict with a "products" list of at most 5 entries, in Pinecone ranking
        order. The list is empty when the dataset or the Pinecone index is
        unavailable. Matches whose ID is not present in the dataset are skipped.
    """
    if index is None or df is None:
        return {"products": []}

    # NOTE(review): encode(), index.query() and the LLM chain are blocking calls made
    # from an async handler, so they hold the event loop while they run.

    # 1. Create a multi-modal query embedding
    # We only have text from the user, so we use a zero vector for the image part.
    text_query_embedding = text_model.encode(request.query, convert_to_numpy=True)
    image_query_embedding = np.zeros(image_embedding_dim)

    # Concatenate to match the format in Pinecone
    query_embedding = np.concatenate([text_query_embedding, image_query_embedding]).tolist()

    # 2. Query Pinecone to get top 5 similar product IDs
    query_results = index.query(vector=query_embedding, top_k=5)
    result_ids = [match['id'] for match in query_results['matches']]

    # 3. Fetch product details from our dataframe.
    # df.loc raises KeyError if any label is missing, so drop IDs the dataset does
    # not know (e.g. index and CSV out of sync) while keeping the ranking order.
    known_ids = [result_id for result_id in result_ids if result_id in df.index]
    recommended_products_df = df.loc[known_ids]

    # 4. Generate creative descriptions and format the response in a single pass
    products_response = []
    for uniq_id, product_data in recommended_products_df.iterrows():
        creative_description = llm_chain.run({
            "title": product_data["title"],
            "original_description": product_data["description"]
        }).strip()

        # A missing "images" cell is NaN (a float), which has no .split(); fall back
        # to an empty URL instead of failing the whole request.
        images_field = product_data["images"]
        image_url = images_field.split(',')[0] if isinstance(images_field, str) else ""

        # Report a missing price as "N/A" rather than the literal string "nan".
        raw_price = product_data.get("price", "N/A")
        price = "N/A" if pd.isna(raw_price) else str(raw_price)

        products_response.append(Product(
            id=str(uniq_id),
            title=product_data["title"],
            price=price,
            image_url=image_url, # Use first image
            generated_description=creative_description
        ))

    return {"products": products_response}


@app.get("/analytics", response_model=AnalyticsResponse)
async def get_analytics():
    """
    Summarise the product dataset for the analytics view.

    Returns a dict holding the ten most frequent brands and categories
    (value -> count) and the mean, median and max of the cleaned price column,
    each rounded to two decimals. All three entries are empty dicts when the
    dataset is not loaded.
    """
    if df is None:
        return {"brand_counts": {}, "category_counts": {}, "price_stats": {}}

    def top_ten(column):
        # Frequency table of the column, trimmed to its 10 largest counts.
        return df[column].value_counts().nlargest(10).to_dict()

    brand_counts = top_ten('brand')
    category_counts = top_ten('categories')

    # Same three aggregates as before, computed by name on the price column.
    prices = df['price_cleaned']
    price_stats = {
        stat: round(getattr(prices, stat)(), 2)
        for stat in ('mean', 'median', 'max')
    }

    return dict(
        brand_counts=brand_counts,
        category_counts=category_counts,
        price_stats=price_stats,
    )


Defaulting to user installation because normal site-packages is not writeable
Collecting pinecone-client>=3.0.0
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastapi
  Downloading fastapi-0.119.0-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn
  Downloading uvicorn-0.37.0-py3-none-any.whl.metadata (6.6 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.3.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting torch
  Downloading torch-2.9.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: C:\Users\navon\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Exception: The official Pinecone python package has been renamed from `pinecone-client` to `pinecone`. Please remove `pinecone-client` from your project dependencies and add `pinecone` instead. See the README at https://github.com/pinecone-io/pinecone-python-client for more information on using the python SDK.