In [None]:
import cv2
import numpy as np
import faiss
import pickle
import requests
import os
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [47]:
product_df = pd.read_csv('Amazon-Products.csv')
product_name = product_df['name']

In [48]:
product_name = product_name.str.replace('''[^\w\d\s]''','',regex=True).str.lower()
product_name = product_name.drop_duplicates()
product_name = product_name.str.lower()
product_name = product_name.tolist()

In [49]:
batch_size = 1000
embedding_file = "/content/drive/EcommerceModel/embeddings.npy"

# Generate embeddings in batches and save them
all_embeddings = []
for i in range(0, len(product_name), batch_size):
    batch_texts = product_name[i:i+batch_size]
    batch_embeddings = model.encode(batch_texts, device=device).astype('float32')  # Ensure embeddings are on GPU
    all_embeddings.append(batch_embeddings)

all_embeddings = np.vstack(all_embeddings)

# Load embeddings


In [50]:
np.save('embeddings.npy', all_embeddings)


In [51]:
print(len(all_embeddings))

228428


In [52]:
all_embeddings = np.load('embeddings.npy')
dimension = all_embeddings.shape[1]

# FAISS-GPU setup
res = faiss.StandardGpuResources()  # Initialize GPU resources
nlist = 100  # Number of clusters
quantizer = faiss.IndexFlatL2(dimension)  # Base quantizer
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Convert to GPU index
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Using GPU ID 0

# Train the index on the embeddings
gpu_index.train(all_embeddings)

# Add embeddings in batches
batch_size = 10000
for i in range(0, all_embeddings.shape[0], batch_size):
    batch_embeddings = all_embeddings[i:i+batch_size]
    gpu_index.add(batch_embeddings)

# Save the index to disk
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), "large_index.ivf")  # Save a CPU copy of the index

# Reload the index from disk
index = faiss.read_index("large_index.ivf")
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Move it back to GPU


In [53]:
# Query example
query = "iphone"
query_embedding = model.encode([query], device=device).astype('float32')  # Ensure query embedding is computed on GPU

# Perform similarity search
k = 50  # Top 10 results
distances, indices = gpu_index.search(query_embedding, k)

# Display results
print("Query:", query)
for i in range(k):
    print(f"Result {i+1}:")
    print(f"  Text: {product_name[indices[0][i]]}")
    print(f"  Distance: {distances[0][i]}")

Query: iphone
Result 1:
  Text: apple iphone 14 plus 256 gb  blue
  Distance: 0.8725132942199707
Result 2:
  Text: apple iphone 13 256gb  midnight
  Distance: 0.8844701647758484
Result 3:
  Text: apple iphone 13 128gb  midnight
  Distance: 0.9230003356933594
Result 4:
  Text: apple iphone 12 64gb  white
  Distance: 0.9341724514961243
Result 5:
  Text: apple iphone 12 64gb  blue
  Distance: 0.9800865650177002
Result 6:
  Text: apple iphone 12 64gb  black
  Distance: 0.9819309711456299
Result 7:
  Text: apple iphone 14 pro 128 gb  deep purple
  Distance: 1.002982258796692
Result 8:
  Text: apple iphone 13 256gb  blue
  Distance: 1.0047187805175781
Result 9:
  Text: apple iphone 12 128gb  black
  Distance: 1.0181214809417725
Result 10:
  Text: apple iphone 14 128 gb  product red
  Distance: 1.0184563398361206
Result 11:
  Text: apple iphone 13 128gb  blue
  Distance: 1.0222930908203125
Result 12:
  Text: apple iphone 14 pro max 256 gb  deep purple
  Distance: 1.030189871788025
Result 13:


### TF-IDF

In [None]:
import cupy as cp
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.stem.porter import PorterStemmer
from multiprocessing import Pool
import cupyx.scipy.sparse as cusparse

In [None]:
product = pd.read_csv('/content/drive/MyDrive/EcommerceData/Amazon-Products.csv')
product_df = product['name'].drop_duplicates()

In [None]:
print(len(product_df))

396210


In [None]:
tmp = product_df.str.replace('''[^\w\d\s]''','',regex=True).str.lower()

In [None]:
stemmer = PorterStemmer()
def stemming(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [None]:
product_name = product_name.apply(stemming)

In [None]:
product.to_csv('/content/drive/MyDrive/EcommerceData/stemming-name.csv',index=False)

In [None]:
product_name = pd.read_csv('/content/drive/MyDrive/EcommerceData/stemming-name.csv')

In [None]:
product_name = product_name['name'].tolist()

In [None]:
print(len(product_name))

396210


In [None]:
vectorizer = TfidfVectorizer(max_features=1000,stop_words='english',dtype=np.int8)
tfidf_matrix = vectorizer.fit_transform(product_name)



In [None]:
print(tfidf_matrix.shape)

(396210, 1000)


In [None]:
input_text = "iphone"
input_vector = vectorizer.transform([input_text])  # Sparse matrix for input

tfidf_gpu = cusparse.csr_matrix(tfidf_matrix)  # GPU sparse matrix
input_gpu = cusparse.csr_matrix(input_vector)   # GPU sparse matrix for input
print(tfidf_gpu.shape)
print(input_gpu.shape)
# Step 4: Compute cosine similarity
dot_products = tfidf_gpu @ input_gpu.T          # Sparse dot product on GPU
norms = cp.sqrt(tfidf_gpu.multiply(tfidf_gpu).sum(axis=1)) * cp.sqrt(input_gpu.multiply(input_gpu).sum())
print(dot_products.shape)
print(norms.shape)

(396210, 1000)
(1, 1000)
(396210, 1)
(396210, 1)


In [None]:
# Step 5: Transfer similarities back to CPU
similarities_cpu = dot_products.toarray()
print(similarities_cpu.shape)
print(type(similarities_cpu))
# Step 6: Get top 10 most similar texts

(396210, 1)
<class 'cupy.ndarray'>


In [None]:
product_df = product_df.to_frame()
product_df = product_df.reset_index(drop=True) # Reset index to avoid length mismatch
print(len(product_df))


AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [None]:
product_df["id"] = similarities_cpu.get()
product_df.head(20)
top_similar_texts = product_df.sort_values(by="id",ascending=False)
top_similar_texts.head(10)
# # Display results
# print("Top Similar Texts:")
# print(top_similar_texts)

Unnamed: 0,name,id
18681,Zapcase Back Case Cover for iPhone 7 / iPhone ...,0.97801
13914,POPIO Tempered Glass Compatible for iPhone 6; ...,0.898071
18600,Spigen Ultra Hybrid Back Cover Case for iPhone...,0.893437
16723,MINDFIED 20W Fast Charger with Cable Compatibl...,0.871093
16249,EGOTUDE Ultra Thin Slim Anti Scratch Back Cove...,0.857507
323506,Shopnet Wireless Bluetooth Speaker TG113 For i...,0.831352
18625,EGOTUDE Dual Layer Hard Back Translucent Hybri...,0.830391
13009,Amozo Designed for iPhone 14 / iPhone 13 Cover...,0.754027
18097,Belkin 18W USB Type C Adapter (iPhone Fast Cha...,0.744139
17300,DR VAKU® 20W iPhone Charger Type C Adapter for...,0.74328


In [None]:
print(len(similarities_cpu))

396210


In [None]:
product_df['name'].str.endswith('...').sum()

92207

In [None]:
dense_matrix = cp.asnumpy(tfidf_gpu.toarray())  # Converts GPU matrix to NumPy array

# Step 2: Map names to vectors
name_to_vector = dict(zip(product_df['name'], dense_matrix))

In [None]:
import pickle

with open("/content/drive/MyDrive/EcommerceData/text-vectors.pkl", "wb") as f:
    pickle.dump(name_to_vector, f)