In [5]:
# Install the embedding / vector-store dependencies into the *kernel's* environment.
# `%pip` (rather than a bare `pip`) does not depend on IPython automagic being enabled.
# Restart the kernel after installing so the new packages are importable.
%pip install sentence-transformers pinecone-client "langchain[docarray]" "huggingface_hub[cli]"

Note: you may need to restart the kernel to use updated packages.




In [6]:
# Provides `langchain_community.embeddings.HuggingFaceEmbeddings`, imported further down.
# `%pip` installs into the running kernel's environment regardless of automagic settings.
%pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting SQLAlchemy<3.0.0,>=1.4.0 (from langchain-community)
  Downloading sqlalchemy-2.0.44-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.13.1-cp313-cp313-win_amd64.whl.metadata (8.4 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.10.1 (from langchain-community)
  Downloading pydantic_settings-2.11.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Down

In [8]:
# Provides `langchain_pinecone.PineconeVectorStore`, imported further down.
# NOTE(review): when this cell was last run, langchain-pinecone pulled in
# langchain-core<1.0.0, which pip reported as incompatible with the already
# installed langchain 1.0.0 family — pin a mutually compatible set before
# relying on this environment.
%pip install langchain-pinecone

Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.12-py3-none-any.whl.metadata (8.6 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain-pinecone)
  Downloading langchain_core-0.3.79-py3-none-any.whl.metadata (3.2 kB)
Collecting pinecone<8.0.0,>=6.0.0 (from pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting langchain-openai>=0.3.11 (from langchain-pinecone)
  Downloading langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
Collecting simsimd>=5.9.11 (from langchain-pinecone)
  Downloading simsimd-6.5.3-cp313-cp313-win_amd64.whl.metadata (71 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone<8.0.0,>=6.0.0->pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting aiohttp-retry<3.0.0,>=2.9.1 (from pinecone[asyncio]<8.0.0,>=6.0.0->langchain-pinecone)
  Downloading aiohttp_retry-2.

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-classic 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-community 0.4 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-text-splitters 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.


In [9]:
# In notebooks/Model_Training.ipynb

# --- 1. Imports and Data Loading ---

# First, ensure you have installed all the necessary packages in your terminal:
# pip install langchain-community langchain-pinecone sentence-transformers pinecone-client

import pandas as pd
import os
from langchain_community.embeddings import HuggingFaceEmbeddings # Corrected import
from langchain_pinecone import PineconeVectorStore              # Corrected import
from pinecone import Pinecone
from getpass import getpass # Used to securely ask for API keys

# REASONING: Rebuild the cleaned dataset from the previous notebook so that this
# notebook runs on its own after a kernel restart. In a production system the
# cleaned CSV would be saved once and loaded directly instead.

# Load the raw dataset (path is relative to the notebook's directory).
df = pd.read_csv('../data/furniture_dataset.csv')

# --- Re-apply the cleaning steps ---

# Text attributes: replace missing values with explicit placeholders.
# Indexing each column directly keeps the KeyError if a column is absent.
PLACEHOLDER_BY_COLUMN = {
    'brand': 'Unknown Brand',
    'material': 'Unknown Material',
    'color': 'Unknown Color',
    'manufacturer': 'Unknown',
    'country_of_origin': 'Unknown',
}
for column_name, placeholder in PLACEHOLDER_BY_COLUMN.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Description: synthesise a sentence from the other attributes, but only for
# rows that have none. Built vectorised on the affected subset (no row loop);
# `astype(str)` mirrors f-string formatting, so a missing title renders as 'nan'.
needs_description = df['description'].isna()
undescribed = df.loc[needs_description]
df.loc[needs_description, 'description'] = (
    "This is a " + undescribed['title'].astype(str)
    + " from " + undescribed['brand'].astype(str)
    + ". It is made of " + undescribed['material'].astype(str)
    + " and comes in a " + undescribed['color'].astype(str)
    + " color."
)

# Price: strip the currency symbol AND thousands separators before parsing.
# Removing only '$' would turn "$1,299.99" into NaN and then silently replace it
# with the median. `astype(str)` also keeps this working if pandas already
# parsed the column as numeric (where the `.str` accessor would raise).
price_text = (
    df['price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .str.strip()
)
df['price'] = pd.to_numeric(price_text, errors='coerce')
median_price = df['price'].median()
df['price'] = df['price'].fillna(median_price)

# Drop rows without package dimensions and renumber the index. Reassigning
# (instead of inplace=True) keeps the cell's data flow explicit.
df = df.dropna(subset=['package_dimensions']).reset_index(drop=True)

print("Data loaded and cleaned.")
display(df.head())


# --- 2. Prepare Text for Embedding ---

# REASONING: The quality of our semantic search depends on the richness of the text.
# We merge several key fields into one 'combined_text' column so the embedding
# model sees more context about each product.
#
# Each field is rendered as "<Label>: <value>" and the parts are joined with "; ".
# Missing values become an empty string and every value is coerced to str:
# with plain `+` concatenation a single missing field (e.g. a NaN title, which
# the cleaning step does not fill) would turn the whole combined text into NaN
# and break the embedding step later on.
COMBINED_TEXT_FIELDS = [
    ("Title", 'title'),
    ("Brand", 'brand'),
    ("Description", 'description'),
    ("Material", 'material'),
    ("Color", 'color'),
]

labelled_parts = [
    label + ": " + df[column_name].fillna('').astype(str)
    for label, column_name in COMBINED_TEXT_FIELDS
]
df['combined_text'] = labelled_parts[0].str.cat(labelled_parts[1:], sep="; ")

print("\nCreated 'combined_text' for embedding.")
display(df[['uniq_id', 'combined_text']].head())

  from .autonotebook import tqdm as notebook_tqdm


Data loaded and cleaned.



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,53.99,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",Unknown,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,This is a Plant Repotting Mat MUYETOL Waterpro...,5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",Unknown,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",Unknown,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
4,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,89.99,"['Home & Kitchen', 'Furniture', 'Game & Recrea...",['https://m.media-amazon.com/images/I/41p4d4VJ...,Unknown,"18.9""D x 14.2""W x 26""H",Unknown,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a



Created 'combined_text' for embedding.


Unnamed: 0,uniq_id,combined_text
0,02593e81-5c09-5069-8516-b0b29f439ded,"Title: GOYMFK 1pc Free Standing Shoe Rack, Mul..."
1,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"Title: subrtex Leather ding Room, Dining Chair..."
2,b2ede786-3f51-5a45-9a5b-bcf856958cd8,Title: Plant Repotting Mat MUYETOL Waterproof ...
3,8fd9377b-cfa6-5f10-835c-6b8eca2816b5,"Title: Pickleball Doormat, Welcome Doormat Abs..."
4,bdc9aa30-9439-50dc-8e89-213ea211d66a,Title: JOIN IRON Foldable TV Trays for Eating ...


In [11]:
# --- 3. Initialize Embedding Model and Pinecone ---

import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec # <-- IMPORT ServerlessSpec
from getpass import getpass

# REASONING: Securely get the user's Pinecone API key. The environment variable
# takes precedence; otherwise fall back to a non-echoing interactive prompt.
try:
    PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or getpass("Enter your Pinecone API key: ")
except (KeyboardInterrupt, EOFError):
    # The prompt was aborted (or there is no stdin): continue without a key so
    # the rest of the cell can skip initialization instead of raising.
    print("\nOperation cancelled by user.")
    PINECONE_API_KEY = None

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_NAME = "product-recommendation"
# Size of the vectors stored in the index. It must equal the embedding size
# produced by MODEL_NAME (384 for all-MiniLM-L6-v2); update both together.
EMBEDDING_DIMENSION = 384

if PINECONE_API_KEY:
    # REASONING: Initialize the embedding model.
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    print("Embedding model initialized successfully.")

    # REASONING: Connect to our Pinecone account.
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # REASONING: Check if the index already exists before creating it, so the
    # cell can be re-run safely.
    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new Pinecone index: {INDEX_NAME}")
        # REASONING: The current Pinecone client requires a 'spec' object to define
        # the index type. For the free tier, we use ServerlessSpec and must
        # specify a cloud and region. 'aws' and 'us-east-1' are standard free choices.
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )
    else:
        print(f"Pinecone index '{INDEX_NAME}' already exists.")
        # A pre-existing index may have been created for a different model.
        # Surface a size mismatch here rather than as an opaque upsert error later.
        existing_dimension = pc.describe_index(INDEX_NAME).dimension
        if existing_dimension != EMBEDDING_DIMENSION:
            print(
                f"WARNING: index '{INDEX_NAME}' has dimension {existing_dimension}, "
                f"but '{MODEL_NAME}' is expected to produce {EMBEDDING_DIMENSION}-dimensional vectors."
            )

    print("\nPinecone and model are ready.")
else:
    # NOTE: `embeddings` and `pc` are intentionally left undefined in this case;
    # later cells that need them cannot run without a key.
    print("\nSkipping Pinecone initialization due to missing API key.")

Enter your Pinecone API key:  ········


Embedding model initialized successfully.
Creating new Pinecone index: product-recommendation

Pinecone and model are ready.


In [14]:
# --- 4. Generate Embeddings and Upsert to Pinecone ---

# REASONING: The LangChain 'from_texts' helper needs the API key to be set
# as an environment variable to establish its own connection. We re-export the
# key collected during initialization (PINECONE_API_KEY) for that purpose.
#
# SECURITY: never paste a literal API key into a notebook cell — notebooks get
# shared and committed together with their source. Any key that has ever been
# saved in a notebook must be treated as compromised: revoke it in the Pinecone
# console and issue a new one.
import os

if not PINECONE_API_KEY:
    # Fail early with an actionable message instead of an authentication error
    # from deep inside the vector-store helper.
    raise RuntimeError(
        "No Pinecone API key available. Re-run the initialization cell and "
        "provide a key (or set the PINECONE_API_KEY environment variable)."
    )
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

print("Preparing to generate and upsert embeddings...")

# Prepare the texts and their corresponding metadata. Both lists are built from
# the same DataFrame in row order, so texts[i] belongs to metadatas[i].
texts = df['combined_text'].tolist()
metadatas = [{'uniq_id': uniq_id} for uniq_id in df['uniq_id'].tolist()]

# Generate embeddings and upsert them to Pinecone in one call.
# NOTE(review): no explicit `ids` are passed, so re-running this cell presumably
# inserts a second copy of every vector rather than overwriting — confirm, and
# consider passing stable ids if the cell is meant to be re-runnable.
vectorstore = PineconeVectorStore.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    index_name=INDEX_NAME
)

print("\n✅ Success! All product embeddings have been generated and uploaded to Pinecone.")
print(f"You can now query the '{INDEX_NAME}' index to find similar products.")

Preparing to generate and upsert embeddings...

✅ Success! All product embeddings have been generated and uploaded to Pinecone.
You can now query the 'product-recommendation' index to find similar products.
