# Vector Database from scratch
A vector database is a collection of data stored as mathematical representations. Vector databases make it easier for machine learning models to remember previous inputs, allowing machine learning to be used to power search, recommendations, and text generation use-cases. Data can be identified based on similarity metrics instead of exact matches, making it possible for a computer model to understand data contextually.

In [4]:
import torch

In [47]:
from collections import defaultdict
import hashlib as cryptolib
import numpy as np
'''
The VectorDB class is the core class which will contain the documents as well as indexing mechanism for
faster retrieval. and the class will also provide methods for similarity checking
Documents are largely text documents embedded as vectors.
Structure for documents:
document {
  "id",
  "text_embedding"
}

'''
class VectorDB:

  def __init__(self):
    self.vector_documents = {}
    self.document_ids = set()

  def add_document(self,document):
    # print("adding",document)
    doc_id, doc_embedding = document['id'],document['text_embedding']
    # print(doc_id,doc_embedding)
    if doc_id not in self.document_ids:
      self.document_ids.add(doc_id)
      self.vector_documents[doc_id] = doc_embedding
    else:
      # if text is altered the hash is different, so ig no need of this?
      # update = input("Document already exists do you want to update?(y/n)")
      # if update == 'y':
      #   self.vector_documents[doc_id] = doc_embedding
      # else:
      print(f"Document with {doc_id} already exists in Database, Skipping")


  def create_document(self,text,embedding):
    document_hash = cryptolib.md5(text.encode('utf-8')).hexdigest()
    new_document = {
        'id': document_hash,
        'text_embedding' : embedding
    }
    # print(document_hash,text)
    self.add_document(new_document)

  def find_similarity(self,embedding1,embedding2):
    # we will use cosine similarity for now
    cos_similarity_score = np.dot(embedding1,embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
    return cos_similarity_score

  def find_similar_documents(self,query_document,similarity_threshold=0.5):
    similar_docs = []
    # This part need optimization for scalability
    # currently it searches entire DB linearly which is very slow
    for id,text_embedding in self.vector_documents.items():

      similarity = self.find_similarity(query_document['text_embedding'],text_embedding.flatten())
      if similarity > similarity_threshold:# 0.5 is arbitrary right now change later
        similar_docs.append((id,similarity))
    # Sort in descending order of similarity
    similar_docs.sort(key=lambda x:x[1],reverse=True)
    return similar_docs
  def __len__(self):
      return len(self.vector_documents)


In [48]:
myDB = VectorDB()


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer as tfidfV


In [50]:
docs = ["hello world!","ello Sam!","Sam is part of world!","ello Sameer!","hellow Sam!"]
docs2 = ["ello world!"]
my_vec = tfidfV()
my_vec.fit(docs)
embedding = my_vec.transform(docs2)
embedding.data

array([0.70710678, 0.70710678])

In [51]:
docs2[0]

'ello world!'

In [52]:
myDB.create_document(docs2[0],torch.tensor(embedding.toarray()))

In [53]:
torch.tensor(embedding.toarray()).flatten().dim()

1

In [54]:
print(myDB.vector_documents)

{'6e9bc20c0d778a0740d3e0396689909b': tensor([[0.7071, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7071]],
       dtype=torch.float64)}


In [55]:
print(myDB.find_similarity(torch.tensor(embedding.toarray()).flatten(),torch.tensor(embedding.toarray()).flatten()))

1.0000000000000002


In [56]:
for doc,embed in zip(docs,my_vec.transform(docs).toarray()):
  # print(tor.tensor(embed))
  myDB.create_document(doc,torch.tensor(embed))



In [57]:
myDB.vector_documents

{'6e9bc20c0d778a0740d3e0396689909b': tensor([[0.7071, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7071]],
        dtype=torch.float64),
 'fc3ff98e8c6a0d3087d515c0473f8677': tensor([0.0000, 0.7783, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6279],
        dtype=torch.float64),
 'ad47f4f07b6631c98d9fd9af32dc4257': tensor([0.7694, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6387, 0.0000, 0.0000],
        dtype=torch.float64),
 '12858b34d0a1ae7f62cab1726b8304c6': tensor([0.0000, 0.0000, 0.0000, 0.4939, 0.4939, 0.4939, 0.3308, 0.0000, 0.3985],
        dtype=torch.float64),
 'ed42f068740c5dc1b4071dd04c657444': tensor([0.6279, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7783, 0.0000],
        dtype=torch.float64),
 '40b72a3c4586ce1b6d5cc704161d2422': tensor([0.0000, 0.0000, 0.8309, 0.0000, 0.0000, 0.0000, 0.5565, 0.0000, 0.0000],
        dtype=torch.float64)}

In [58]:
input_list = ["ello Sam!"]

In [59]:
query = {'id':1,'text_embedding':torch.tensor(my_vec.transform(input_list).toarray()).flatten()}

In [60]:
query['text_embedding'].dim()

1

In [61]:
print(myDB.find_similar_documents(query))

[('ad47f4f07b6631c98d9fd9af32dc4257', 1.0), ('6e9bc20c0d778a0740d3e0396689909b', 0.5440812430630018)]


In [62]:
for id,embedding in myDB.vector_documents.items():
  print(my_vec.inverse_transform(embedding.reshape(-1,1)))

[array(['ello'], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array(['ello'], dtype='<U6')]
[array([], dtype='<U6'), array(['ello'], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array(['ello'], dtype='<U6')]
[array(['ello'], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array(['ello'], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6')]
[array([], dtype='<U6'), array([], dtype='<U6'), array([], dtype='<U6'), array(['ello'], dtype='<U6'), array(['ello'], dtype='<U6'), array(['ello'], dtype='<U6'), array(['ello'], dtype='<U6'), array([], dtype='<U6'), array(['ello'], dtype='<U6')]
[array(['ello'], dtype='<U6'), array([], dtype='<U6'), array([], d

In [30]:
myDB.vector_documents.get('ad47f4f07b6631c98d9fd9af32dc4257')

tensor([0.7694, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6387, 0.0000, 0.0000],
       dtype=torch.float64)

In [46]:
my_vec.decode(myDB.vector_documents.get('ad47f4f07b6631c98d9fd9af32dc4257').reshape(-1,1))

tensor([[0.7694],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.6387],
        [0.0000],
        [0.0000]], dtype=torch.float64)