# This notebook generates product embeddings and builds a FAISS index over them.

## Mounting the Google drive

In [1]:
# Colab-only helper that attaches the user's Google Drive to the runtime filesystem.
from google.colab import drive

# Mount point under which the notebook's Drive paths (/content/drive/MyDrive/...) resolve.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Installing packages

In [2]:
!pip install -q faiss-cpu sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


## Packages Used

In [3]:
import os
import time
import numpy as np
import pandas as pd
import faiss
import pickle
import csv
import torch
from sentence_transformers import SentenceTransformer

In [4]:
# tqdm is treated as an optional dependency: it has occasionally been unavailable
# in this environment, so record whether the import worked and let the embedding
# loop fall back to a plain range when it did not.
try:
    from tqdm import tqdm
except ImportError:
    TQDM_ENABLED = False
else:
    TQDM_ENABLED = True

## Loading the product-level data

In [5]:
# Raise the csv module's per-field size cap before parsing. The file is read with
# pandas' "python" engine; presumably some combined_text fields are very long.
csv.field_size_limit(10**7)

# Product-level dataset produced by the previous notebook.
df = pd.read_csv("/content/drive/MyDrive/amazon_product_level_optimized.csv", engine='python')
print(f"[INFO] Loaded product-level dataset: {df.shape}")

# Fail early, with a readable message, if the expected schema is not present.
required_columns = ["combined_text", "product_id", "category"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Missing text must become an empty string first: astype(str) on its own turns
# NaN into the literal text "nan", which would then be embedded as if it were a
# real product description.
texts = df["combined_text"].fillna("").astype(str).tolist()

# Parallel lists: position i in each list refers to the same product as texts[i].
product_ids = df["product_id"].tolist()
categories = df["category"].tolist()

[INFO] Loaded product-level dataset: (137635, 3)


## Setting up the GPU and the Embedder model

In [6]:
# Sentence-embedding checkpoint used for both the products and the queries.
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"[INFO] Using device: {device}")

embedder = SentenceTransformer(model_name, device=device)

[INFO] Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Embeddings Generation

In [7]:
# Location of the cached embedding matrix on Drive.
embedding_path = "/content/drive/MyDrive/product_embeddings_optimized.npy"
# Larger batches on the GPU, smaller ones on the CPU.
batch_size = 256 if device == "cuda" else 64

# Reuse the cache only when it lines up with the dataset loaded in this session.
# A stale file (e.g. produced from an older CSV) would otherwise pair vectors
# with the wrong product_ids without raising any error.
embeddings = None
if os.path.exists(embedding_path):
    print("[INFO] Loading cached embeddings...")
    cached_embeddings = np.load(embedding_path)
    if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(texts):
        embeddings = cached_embeddings
    else:
        print(
            f"[WARN] Cached embeddings shape {cached_embeddings.shape} does not match "
            f"{len(texts)} texts; regenerating..."
        )

if embeddings is None:
    print("[INFO] Generating embeddings...")
    batch_starts = range(0, len(texts), batch_size)
    # Show a progress bar when tqdm imported successfully; otherwise iterate silently.
    iterator = tqdm(batch_starts) if TQDM_ENABLED else batch_starts

    start = time.time()
    embedding_batches = []
    for i in iterator:
        batch = texts[i:i+batch_size]
        emb = embedder.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        embedding_batches.append(emb)
    # Stack the per-batch arrays into one (num_texts, dim) matrix, in input order.
    embeddings = np.vstack(embedding_batches)
    np.save(embedding_path, embeddings)
    print(f"[INFO] Embedding generation took {time.time()-start:.2f}s")
    print(f"[SAVED] Embeddings: {embedding_path}")

print(f"[INFO] Embeddings shape: {embeddings.shape}")

[INFO] Generating embeddings...


  return forward_call(*args, **kwargs)
100%|██████████| 538/538 [13:12<00:00,  1.47s/it]


[INFO] Embedding generation took 792.75s
[SAVED] Embeddings: /content/drive/MyDrive/product_embeddings_optimized.npy
[INFO] Embeddings shape: (137635, 384)


## Normalizing Embeddings and Building FAISS Index

In [9]:
# FAISS operates on C-contiguous float32 matrices; this returns the same array
# unchanged when it already has that dtype and layout.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Normalize every row to unit L2 norm (in place). With unit-length vectors the
# inner product equals cosine similarity, which is what the index below ranks by.
faiss.normalize_L2(embeddings)

# Build an exact inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Search results are positions into product_ids / categories, so the index must
# hold exactly one vector per product before anything is persisted.
if index.ntotal != len(product_ids):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but there are {len(product_ids)} product ids; "
        "embeddings and dataset are out of sync."
    )
print(f"[INFO] FAISS index built with {index.ntotal} vectors using cosine similarity")

[INFO] FAISS index built with 137635 vectors using cosine similarity


## Saving the files

In [10]:
# Output locations on Drive for the index and its companion metadata.
index_path = "/content/drive/MyDrive/faiss_index_optimized.index"
mapping_path = "/content/drive/MyDrive/product_ids_optimized.pkl"

# Persist the FAISS index itself.
faiss.write_index(index, index_path)

# Metadata needed to interpret search results: the id and category lists are
# positional (row i of the index <-> element i of each list), and the model name
# records which encoder must be used for queries.
mapping = dict(
    product_ids=product_ids,
    categories=categories,
    embedding_model=model_name,
)
with open(mapping_path, "wb") as mapping_file:
    pickle.dump(mapping, mapping_file)

print(f"[SAVED] FAISS index → {index_path}")
print(f"[SAVED] Metadata mapping → {mapping_path}")

[SAVED] FAISS index → /content/drive/MyDrive/faiss_index_optimized.index
[SAVED] Metadata mapping → /content/drive/MyDrive/product_ids_optimized.pkl


## Sample Retrieval

In [11]:
# Smoke test: embed one free-text query and look up its nearest products.
query = "cool toys for kids"
query_emb = embedder.encode([query], convert_to_numpy=True)
# The query must be unit-normalized like the indexed vectors for the inner
# product to be a cosine similarity.
faiss.normalize_L2(query_emb)
# D holds the similarity scores, I the row positions of the matches (one row per query).
D, I = index.search(query_emb, k=5)

print("\n[INFO] Example Search Results:")
for i, idx in enumerate(I[0]):
    # FAISS fills unused result slots with -1 when fewer than k neighbours exist;
    # as a Python index that would silently select the last product, so skip it.
    if idx < 0:
        continue
    print(f"{i+1}. Product ID: {product_ids[idx]} | Category: {categories[idx]}")


[INFO] Example Search Results:
1. Product ID: B00G6PBLTM | Category: Toys and Games
2. Product ID: B002RL7WD8 | Category: Toys and Games
3. Product ID: B01DLLIPRO | Category: Toys and Games
4. Product ID: B00MIRVWXC | Category: Toys and Games
5. Product ID: B00SYIGH5C | Category: Toys and Games
