## vector database

This notebook shows how to use embeddings and store them in a vector database.

In [1]:
!pip install faiss-cpu



### create embedding

In [2]:
from sentence_transformers import SentenceTransformer, util

# Corpus to index: three short, unrelated example sentences.
sentences = [
    "dinosaurs live in africa but in different time dimension",
    "this is sentence about little cat that liked to eat tomatoes",
    "this is the another sample sentence which is here just to not be matched while other one is",
]

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Show the sequence-length limit the model was loaded with, then set it to 256.
print(model.max_seq_length)
model.max_seq_length = 256

# Encode the whole corpus in one call; normalization is requested so the
# resulting vectors can be compared on a common scale.
embeddings = model.encode(
    sentences,
    normalize_embeddings=True,
)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

256


In [3]:
# Display the encoded sentence vectors (one row per entry in `sentences`).
embeddings

array([[-0.04987689,  0.03634831,  0.01747593, ..., -0.05154558,
         0.01327899, -0.05160029],
       [ 0.06277531,  0.07880413,  0.01862673, ...,  0.18015605,
         0.07854404,  0.01058723],
       [-0.01920931,  0.06346301,  0.07642581, ...,  0.01450099,
         0.08586992, -0.00456652]], dtype=float32)

### create vector DB and load documents

In [8]:
import faiss

# Take the vector dimension from the data instead of hard-coding 384, so the
# index stays correct if a different embedding model is used.
d = embeddings.shape[1]  # dimension

# Build an exact (brute-force) L2 index. The embeddings were encoded with
# normalize_embeddings=True, so ranking by L2 distance gives the same order as
# ranking by cosine similarity.
index = faiss.IndexFlatL2(d)  # build the index
index.add(embeddings)  # add vectors to the index

### perform search

In [31]:
# Run every query through the same encode -> search -> print steps instead of
# repeating the code once per query.
for queryText in ("french fries", "not similar text"):
    # Encode the query the same way the indexed sentences were encoded.
    embeddingSearch = model.encode([queryText], normalize_embeddings=True)
    # index.search returns (distances, indices); despite its name,
    # embeddingFound holds the distances, not embeddings. k=1 asks for the
    # single nearest neighbour.
    embeddingFound, idx = index.search(embeddingSearch, 1)  # actual search
    # idx[0][0] is the position of the closest sentence for the only query row.
    # The nearest sentence is reported even for an unrelated query, because no
    # distance threshold is applied.
    print(queryText + " matches:\n" + sentences[idx[0][0]])

french fries matches:
this is sentence about little cat that liked to eat tomatoes
not similar text matches:
this is the another sample sentence which is here just to not be matched while other one is
