This notebook creates the vector database used for RAG (retrieval-augmented generation).

In [None]:
import os
import re
import math
import json
from tqdm import tqdm
from dotenv import load_dotenv
from huggingface_hub import login
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Load variables from a local .env file (if one exists) so HF_TOKEN is available
# even when it has not been exported in the shell. By default load_dotenv() does
# not override variables that are already set in the environment.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
# Authenticate with the Hugging Face Hub and store the token for git as well.
login(hf_token, add_to_git_credential=True)
# Path handed to the persistent Chroma client.
DB = "products_vectorstore"

In [None]:
# Restore the pickled training items from disk.
# NOTE(review): pickle executes arbitrary code on load; only use a train.pkl
# that was produced by a trusted source.
with open('train.pkl', mode='rb') as pickle_file:
    train = pickle.load(pickle_file)

# Preview the prompt of the first training item.
train[0].prompt

## Creating ChromaDB

In [None]:
# Connect to the persistent Chroma database stored at the path held in DB.
client = chromadb.PersistentClient(path=DB)

In [None]:
# Start from a clean slate: if the collection already exists, delete it,
# then create it fresh.
collection_name = 'products'
# Depending on the Chroma release, list_collections() yields either Collection
# objects or plain name strings, so normalise every entry to its name.
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if collection_name in existing_names:
    client.delete_collection(collection_name)
    print(f'deleted collection {collection_name}')

collection = client.create_collection(collection_name)
# Display the new collection as the cell output.
collection


## SentenceTransformer

SentenceTransformer models map text to dense vectors, which makes them ideal for tasks like semantic search.

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformers model used to embed the documents.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Smoke test: embed one sentence and display the length of the resulting vector.
vector = model.encode('My Name is Marwan')
len(vector)

In [None]:
def description(item):
    """Return only the product description contained in an item's prompt.

    Every occurrence of the leading question is removed from ``item.prompt``,
    and everything from the first ``'\\n\\nPrice is $'`` marker onward is
    dropped. If the marker is absent, the whole remaining text is returned.
    """
    question = 'How much does this cost to the nearest dollar?\n\n'
    price_marker = '\n\nPrice is $'
    body = item.prompt.replace(question, '')
    # partition() keeps the text before the first marker, or all of it when
    # the marker does not occur.
    before_price, _, _ = body.partition(price_marker)
    return before_price

In [None]:
# Preview the extracted description for the first training item.
description(train[0])

In [None]:
# Embed the training items in batches and add them to the Chroma collection.
batch_size = 1000
for i in tqdm(range(0, len(train), batch_size)):
    # Slice once and reuse, so documents, metadatas and ids always describe
    # the same items.
    batch = train[i:i + batch_size]
    documents = [description(item) for item in batch]
    # Convert the encoder output to plain Python floats before handing it to Chroma.
    vectors = model.encode(documents).astype(float).tolist()
    metadatas = [{'category': item.category, 'price': item.price} for item in batch]
    # Size the ids from the actual batch: the final batch may hold fewer than
    # batch_size items, and the id count must match the document count.
    ids = [f'doc_{j}' for j in range(i, i + len(batch))]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas,
    )