In [24]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from vector_retrieval_model import ProductSearchEngine
import numpy as np
import os
import torch
print("code strted")

# Location of the products parquet file. The PRODUCTS_PARQUET environment
# variable overrides the default below, so the notebook can run on another
# machine without editing this cell.
products_path = os.environ.get(
    'PRODUCTS_PARQUET',
    '/Users/mohammed/Desktop/Context-aware-Search-Engine-for-E-commerce/data/shopping_queries_dataset_products.parquet',
)
df_product = pd.read_parquet(products_path)

# Only the first `max_products` rows are used.
max_products = 10000

# Reset the index so row labels are exactly 0..n-1. The embeddings (and the
# Annoy item ids derived from them) are produced in row order, so this keeps
# label-based and position-based lookups into product_df in agreement even if
# the parquet file was stored with a non-default index.
product_df = df_product.head(max_products).copy().reset_index(drop=True)
print("data read, data combining")
# Build one text per product by joining its descriptive fields with single
# spaces. Missing values are replaced by empty strings first so the string
# concatenation never produces NaN.
product_df = product_df.fillna('')

# Fields are concatenated in exactly this order.
text_fields = [
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand',
    'product_color',
    'product_locale',
]

combined_text = product_df[text_fields[0]]
for field in text_fields[1:]:
    combined_text = combined_text + ' ' + product_df[field]

product_df['combined_text'] = combined_text
print("data combination completed!")
# Run on Apple's Metal (MPS) backend when it is available, otherwise on the CPU.
# torch.backends.mps.is_available() is the documented availability check;
# NOTE(review): torch.mps.is_available() appears to exist only in recent
# PyTorch releases — confirm against the installed version.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize the Sentence-BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=device)  # or another model

# Vectorize the combined text (convert text into embeddings).
# Every row must yield exactly one embedding, in row order, because the
# embedding position is later used to look the product up in product_df.
# Missing text is therefore encoded as an empty string instead of being
# dropped, which would silently shift all following positions.
product_embeddings = model.encode(product_df['combined_text'].fillna('').tolist())
assert len(product_embeddings) == len(product_df), (
    "embeddings are no longer aligned with product_df rows"
)
print("code finished!!")

# File that holds the vector index on disk.
index_file = 'product_index.ann'
embedding_dim = product_embeddings.shape[1]

# The index is only built when the file is absent.
# NOTE(review): an existing file is reused as-is, so it presumably has to be
# deleted by hand after the product sample or the embedding model changes —
# confirm how ProductSearchEngine handles a stale index.
index_missing = not os.path.exists(index_file)
if index_missing:
    print("Building index because it doesn't exist.")
    ProductSearchEngine.build_index(product_embeddings, index_file)

# Load the index and search
engine = ProductSearchEngine(
    embedding_dim=embedding_dim,
    index_path=index_file,
    product_df=product_df,
)

# Embed the query with the same model as the products, then fetch the 5
# nearest products.
query = "iPhone 12 case"
(query_embedding,) = model.encode([query])
results_df = engine.search(query_embedding, k=5)
print(results_df)


code strted
data read, data combining
data combination completed!
Using device: mps
code finished!!
Building index because it doesn't exist.
   Product ID                                      Product Title  Distance
0  B00AYNRLFA  Belkin MIXIT - Base de carga y sincronización ...    1.0959
1  B0716LV2HV  AirPods Case Protective, FRTMA Silicone Skin C...    1.1080
2  B01BY5JC26  AUKEY Bateria Externa 20000mAh Power Bank con ...    1.1272
3  B07YJL253D  YARBER Batería para iPhone 6 Plus, Alta Capaci...    1.1296
4  B01LSUZDWU  Apple iPhone 7 Plus 128 GB Negro mate - Smartp...    1.1330


In [28]:
# Second sample query: embed it with the same model and print only the titles
# of the 5 nearest products, one per line.
query = "AirPods 3 Iphone 12"
(query_embedding,) = model.encode([query])
results_df = engine.search(query_embedding, k=5)
for product_title in results_df["Product Title"].tolist():
    print(product_title)



Auriculares Bluetooth Auriculares Inalámbricos Bluetooth táctiles 3D estéreo HD Micrófono Auricular Emparejamiento Automático de llamadas Binaurales para Apple Airpods Pro Android iPhone 11
AirPods Case Protective, FRTMA Silicone Skin Case with Sport Strap for Apple AirPods (Red)
FRTMA Protective Case Compatible with AirPods Pro [2019 Release], Full-Body Waterproof Protective AirPods Carrying Case Shockproof Cover Skin Supports AirPods 3, Midnight Blue
Auriculares Inalámbricos Bluetooth 5.0 con Caja de Carga Rápida, HIFI Estéreo Auriculares con HD Mic, Auriculares Inalámbricos para iPhone Airpods pro Samsung Android y Otros Teléfonos Inteligentes
fghdjjfdh Auriculares Bluetooth, Auriculares inalámbricos Bluetooth In-Ear Mini Auriculares Auriculares Deportivos para Apple Airpods iPhone, Android Negro


In [12]:

# Step 5: Build the Annoy Index

# Number of trees in the Annoy forest. More trees give more accurate neighbours
# at the cost of a larger index and a longer build.
n_trees = 100

# Initialize the Annoy index
vector_dimension = len(product_embeddings[0])  # The length of the product vector (embedding)
# Annoy's 'angular' distance is sqrt(2 * (1 - cosine_similarity)), so a smaller
# distance means a higher cosine similarity.
annoy_index = AnnoyIndex(vector_dimension, 'angular')

# Add product vectors to the Annoy index. The item id is the row position of
# the embedding, which is what later maps a neighbour back to its product.
for i, vector in enumerate(product_embeddings):
    annoy_index.add_item(i, vector)

# Build the Annoy index (this process might take time depending on the number of products)
annoy_index.build(n_trees)

# Step 6: Query the Annoy Index


True

In [11]:
# Example query from the user
query = "iPhone 12 case"

# Step 6.1: Vectorize the query using the same Sentence-BERT model
query_embedding = model.encode([query])[0]  # Convert the query into a vector (embedding)

# Step 6.2: Search for the most similar products (k nearest neighbors)
k = 5  # Number of nearest neighbors to retrieve
# With include_distances=True the result is a pair: (item ids, distances).
nearest_neighbors = annoy_index.get_nns_by_vector(query_embedding, k, include_distances=True)
neighbor_ids, neighbor_distances = nearest_neighbors

# Step 7: Output the results
print(f"Query: {query}")
print("\nMost similar products:")

# Prepare structured data for display.
# Annoy item ids are row positions (items were added in enumerate order), so
# rows are selected by position with .iloc. Plain Series[idx] is label-based
# and would return the wrong product, or raise, whenever product_df's index is
# not exactly 0..n-1.
results = [
    {
        "Product ID": product_df['product_id'].iloc[idx],
        "Product Title": product_df['product_title'].iloc[idx],
        "Distance": round(dist, 4),
    }
    for idx, dist in zip(neighbor_ids, neighbor_distances)
]

# Convert to DataFrame for a clean tabular view. Columns are fixed explicitly
# so an empty result still has the expected layout.
results_df = pd.DataFrame(results, columns=["Product ID", "Product Title", "Distance"])
print(results_df.head())


Query: iPhone 12 case

Most similar products:
   Product ID                                      Product Title  Distance
0  B07Q41TS53  Samsung Galaxy A70 - Smartphone 4G (6,7'' - 12...    1.1598
1  B07Z6223Q3  AGPTEK 4 EN 1 Hub USB C para iPad Pro 2018, 20...    1.2136
2  B081GW9SF4  Adaptador de Auriculares para iPhone X Adaptad...    1.2183
3  B07TVDJWJJ  Adaptador de Auriculares para iPhone Adaptador...    1.2308
4  B07YTLW74W  Adaptador de Auriculares para iPhone X Adaptad...    1.2336
