In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Per-token mask (assumed shape: batch x tokens); tokens
            with a zero mask value contribute nothing to the average.

    Returns:
        One pooled embedding per sequence: the mask-weighted sum over dim 1
        divided by the mask total over dim 1.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the hidden dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total

# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same name.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Load tokenizer and model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize sentences (padded, truncated, returned as PyTorch tensors)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings; gradients are not needed for inference
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool token embeddings into one vector per sentence, honouring the attention mask
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalize each sentence vector (along dim 1)
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


Sentence embeddings:
tensor([[ 6.7657e-02,  6.3496e-02,  4.8713e-02,  7.9305e-02,  3.7448e-02,
          2.6528e-03,  3.9375e-02, -7.0985e-03,  5.9361e-02,  3.1537e-02,
          6.0098e-02, -5.2905e-02,  4.0607e-02, -2.5931e-02,  2.9843e-02,
          1.1269e-03,  7.3515e-02, -5.0382e-02, -1.2239e-01,  2.3703e-02,
          2.9727e-02,  4.2477e-02,  2.5634e-02,  1.9952e-03, -5.6919e-02,
         -2.7160e-02, -3.2904e-02,  6.6025e-02,  1.1901e-01, -4.5879e-02,
         -7.2621e-02, -3.2584e-02,  5.2341e-02,  4.5055e-02,  8.2530e-03,
          3.6702e-02, -1.3942e-02,  6.5392e-02, -2.6427e-02,  2.0643e-04,
         -1.3664e-02, -3.6281e-02, -1.9504e-02, -2.8974e-02,  3.9427e-02,
         -8.8409e-02,  2.6243e-03,  1.3671e-02,  4.8306e-02, -3.1157e-02,
         -1.1733e-01, -5.1169e-02, -8.8529e-02, -2.1896e-02,  1.4299e-02,
          4.4417e-02, -1.3482e-02,  7.4339e-02,  2.6638e-02, -1.9876e-02,
          1.7919e-02, -1.0605e-02, -9.0426e-02,  2.1327e-02,  1.4120e-01,
         -6.4718e

In [3]:
pip install faiss-cpu

Collecting faiss-cpuNote: you may need to restart the kernel to use updated packages.

  Downloading faiss_cpu-1.12.0-cp312-cp312-win_amd64.whl.metadata (5.2 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-win_amd64.whl (18.2 MB)
   ---------------------------------------- 0.0/18.2 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.2 MB 8.5 MB/s eta 0:00:03
   ------- -------------------------------- 3.4/18.2 MB 10.1 MB/s eta 0:00:02
   ------------ --------------------------- 5.5/18.2 MB 10.2 MB/s eta 0:00:02
   ---------------- ----------------------- 7.6/18.2 MB 10.0 MB/s eta 0:00:02
   --------------------- ------------------ 9.7/18.2 MB 10.1 MB/s eta 0:00:01
   -------------------------- ------------- 12.1/18.2 MB 10.2 MB/s eta 0:00:01
   ------------------------------- -------- 14.2/18.2 MB 10.2 MB/s eta 0:00:01
   ------------------------------------ --- 16.5/18.2 MB 10.4 MB/s eta 0:00:01
   ---------------------------------------- 18.2/18.2 MB 10.0 MB/s eta 0:00:0

In [4]:
import faiss
import numpy as np

# Fixed seed so the random demo vectors, and therefore the printed neighbours,
# are the same on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

dimension = 128    # size of each vector embedding
num_vectors = 100  # number of vectors stored in the index
top_k = 5          # number of nearest neighbours to retrieve

# Uniform random vectors in float32, the dtype the original cell passed to FAISS.
vectors = rng.random((num_vectors, dimension), dtype=np.float32)

# Flat L2 index over `dimension`-sized vectors, populated with the demo data.
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# A single random query; search returns one row of distances/indices per query.
query_vectors = rng.random((1, dimension), dtype=np.float32)
distances, indices = index.search(query_vectors, k=top_k)

print("closest distances", distances)
print("closest vector indices", indices)


closest distances [[17.472698 17.618958 18.009983 18.14658  18.303402]]
closest vector indices [[42 14 48 75 25]]


In [5]:
# Width of the stored vectors (second axis of the array).
vectors.shape[1]

128

Two different techniques to find similarity:
- Semantic search
- Cosine similarity

In [6]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sd

In [7]:
import chromadb

# Chroma client pointed at the local ./chroma_data directory
# (the path is relative to the kernel's working directory).
client = chromadb.PersistentClient(path="./chroma_data")



In [8]:
# Fetch the "rag_collection" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="rag_collection")

In [9]:
# Show every collection the client currently knows about.
print(client.list_collections())

[Collection(name=rag_collection)]


In [10]:
# Seed corpus as one (id, source, text) record per document, so the ids,
# documents and metadata passed to Chroma always stay aligned.
# The ids keep the original numbering (doc4-doc6 are not used).
seed_records = [
    # Topic 1: Space Exploration
    ("doc1", "space", "The Hubble Space Telescope has been orbiting Earth since 1990."),
    ("doc2", "space", "NASA’s Artemis program aims to return humans to the Moon."),
    ("doc3", "space", "Mars rovers like Perseverance are searching for signs of past life."),

    # Topic 2: Ancient History
    ("doc7", "history", "The pyramids of Giza were built during Egypt’s Old Kingdom."),
    ("doc8", "history", "The Roman Empire reached its peak under Emperor Trajan."),
    ("doc9", "history", "The Indus Valley Civilization thrived around 2500 BCE."),

    # Topic 3: Modern Technology
    ("doc10", "technology", "Quantum computing leverages qubits to perform complex calculations."),
    ("doc11", "technology", "5G networks offer faster speeds and lower latency than 4G."),
    ("doc12", "technology", "Artificial Intelligence is transforming industries worldwide."),

    # Topic 4: Food & Cooking
    ("doc13", "food", "Italian pasta comes in many shapes like penne, fusilli, and spaghetti."),
    ("doc14", "food", "Sushi originated in Japan as a method of preserving fish."),
    ("doc15", "food", "Sourdough bread is made using natural fermentation."),

    # Topic 5: Sports
    ("doc16", "sports", "The Olympics are held every four years with summer and winter editions."),
    ("doc17", "sports", "Football is the most popular sport globally, followed by cricket."),
    ("doc18", "sports", "Tennis Grand Slam tournaments include Wimbledon and the US Open."),
]

# upsert rather than add: the collection lives in a persistent store, so on a
# re-run these ids already exist. upsert inserts new ids and overwrites existing
# ones, which makes this cell idempotent and restores the original text of any
# document a later cell has modified.
collection.upsert(
    ids=[doc_id for doc_id, _, _ in seed_records],
    documents=[text for _, _, text in seed_records],
    metadatas=[{"source": source} for _, source, _ in seed_records],
)


C:\Users\Jhanvi\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [01:56<00:00, 712kiB/s]   


In [12]:
# Retrieve the two stored documents closest to the query text.
results = collection.query(
    query_texts=[" sushi?"],
    n_results=2,
)

print(results)

{'ids': [['doc14', 'doc13']], 'embeddings': None, 'documents': [['Sushi originated in Japan as a method of preserving fish.', 'Italian pasta comes in many shapes like penne, fusilli, and spaghetti.']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'source': 'food'}, {'source': 'food'}]], 'distances': [[0.5948542952537537, 1.5998075008392334]]}


In [13]:
# Replace the stored text of "doc2"; only ids and documents are passed here,
# so no new metadata is supplied for the record.
collection.update(
    ids=["doc2"],
    documents=["ChromaDB is often used for building AI/LLM apps with embeddings."],
)

In [None]:
# To delete a document by id: collection.delete(ids=["doc3"])
