# Vector Store Implementation

This is an implementation of a vector store that can leverage embedding models to create our vectors

### Imports

- Numpy
- Pands

##### Using files from:

http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [1]:
import numpy as np
import pandas as pd

### Index (Helper class)

Implementation of an Helper class index which is going to be used in my vector Store

In [None]:
class Index:
    def __init__(self, dim = None):
        self.dim = dim

    def add_items(self, vector, vector_id) -> None:
        """
        Update the indexing structure for the vector DB
        """
        for existing_id, existing_vector in self.stored_vectors.items():
            # Compute the dot product of the vectors
            similarity = np.dot(vector, existing_vector)
            if existing_id not in self.similarity_index:
                self.similarity_index[existing_id] = {}
            
            self.similarity_index[existing_id][vector_id] = similarity


    def _compute_similarity(self, query_vector, vector) -> float:
        """
        Compute the similarity between two vectors

        Args:
            query_vector (numpy.ndarray): The query vector
            vector (numpy.ndarray): The vector to compare

        Returns:
            The dot product of the vectors, normalized by the product of their norms
        """
        return np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))

### Vector Store class

In [4]:
class VectorStore:
    def __init__(self, vector_dimension = None, metric="cosine"):

        if vector_dimension is None:
            print("You should pass the vector size")

        # Initialize our index our index
        self.index = Index(dim=vector_dim)
            
        # Vector dimension 
        self.vector_dimension = vector_dimension
        # Dictionary to store the vectors
        self.stored_vectors = {}
        # A dictionary for indexing structure for retrieval
        self.similarity_index = {}
        # Counter to then store the ids of vectors
        self.id_counter : int = 0
        # Dictionary to store sentences corresponding to vectors
        self.sentences = {}


    def _load_vector_store(self, persist_path="vector_store"):
        pass

    
    def update_vector_store(self, new_sentence_vectors, persist_path="vector_store"):
        """
        Update the existing vector store with new vectors

        new_id_vectors: Dictionary containing new vectors to be added
        persist_path: Path to the directory where the existing vector store is saved
        """
        try:
            # load existing index and sentences
            self.index, self.sentences = self._load_vector_store(persist_path)

            # Update the id counter
            self.id_counter = max(self.sentences.keys()) + 1

            # Add new vectors to the index and sentences
            vectors = []
            ids = []
            for sentence, vector in new_sentence_vectors.items():
                vectors.append(vector)
                ids.append(self.id_counter)
                self.sentences[self.id_counter] = sentence
                self.id_counter += 1
            self.index.add_items(vectors, ids)
            print("Vector store updated successfully", end="\n\n")
        except Exception as e:
            raise e


    def create_vector_store(self, new_sentence_vectors, persist=True, persist_path = "vector_store"):
        """
        Add vectors to the vector store

        id: the unique id for the vector
        vecotor: the vector to be added
        """
        try:
            vectors = []
            ids = []
            for sentence, vector in new_sentence_vectors.items():
                # Append the new vector
                vectors.append(vector)
                # Assing a unique integer id to every vecotr
                ids.append(self.id_counter)
                # Store the sentence
                self.sentences[self.id_counter] = sentence
                # Incremeant the counter for the next vector
                self.id_counter += 1

            # Adding the items to the index
            self.index.add_items(vectos, ids)
    
            if persist:
                # Create the directory if it doesn't exist
                os.makedirs(persist_path, exist_ok=True)
    
                # Serialize and save the index
                #  with open(os.path.join(persist_path, "index.pkl"), "wb") as f:
                #     pickle.dump(self.index, f)
    
                # Serialize and save the sentences
                # with open(os.path.join(persist_path, "sentences.pkl"), "wb") as f:
                #    pickle.dump(self.sentences, f)
    
            print("Vector store created successfully", end="\n\n")
            
        except Exception as e:
            raise e

    def get_similar_vectors(self, query_vector, top_n=5, persist_path="vector_store"):
        """
        Get the most similar vectors to the given vector

        query_vector: the vector to compare with the vectors in the store
        top_n: the number of similar vectors to return
        persist_path: Path to the directory where the existing vector store is saved

        return: the most similar vectors
        """
        # load existing index and sentences
        self.index, self.sentences = self._load_vector_store(persist_path)

        # Search for similar vectors and return the sentences and similarity scores
        labels, distances = self.index.knn_query(np.array([query_vector]), k=top_n)
        similar_vectors = [
            (self.sentences[label], distance)
            for label, distance in zip(labels[0], distances[0])
        ]
        return similar_vectors

    def get_similar_vectors(self, query_vector, num_results=5, persist_path="vector_store") -> list:
        """
        Find similar vectors to the query vector

        Args:
            query_vector (numpy.ndarray): The query vector to compare with the vecotr in the store
            num_results (int): The number of similar vectors to return
            persist_path: Path to the directory where the existing vector store is saved

        Returns:
            A list of tuples, each containing a vector id and its similarity to the query vector
        """
        # load existing index and sentences
        # self.index, self.sentences = self._load_vector_store(persist_path)

        
        result = []
        labels, distances = self.index.knn_query(np.array([query_vector]), k=top_n)
        self.index.knn_query(np.array([query_vector]), k=top_n)

        similar_vectors = [(self.sentences[label], distance) for label, distance in zip(labels[0], distances[0])]
        
        return similar_vectors

       """ 
        for vector_id, vector in self.stored_vectors.items():
            similarity = self._compute_similarity(query_vector, vector)
            result.append((vector_id, similarity))
            
        # Sort by the similarity in descending order
        result.sort(key=lambda x: x[1], reverse=True)
        
        # Return the num_results most similar ones
        return result[:num_results]
        """"

### Demo of The vector store

Using nomic embed for the demo and a custom index

In [7]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the Embedding Model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Get data from csv
data = pd.read_csv("data.csv", delimiter=";")

# Take 80% of the data to create the vector store, remaining 20% to update the vector store
# To showcase the ability of updating the vector store
data_1 = data.sample(frac=0.8, random_state=200)
data_2 = data.drop(data_1.index)

# Convert data to list
# Instead of doing this I could decide to split documents in a smarter way with overlap or something

data_1 = data_1["text"].tolist()

# Encode the data, have data and vector in dictionary, data keys, vector values
vectors_1 = model.encode(data_1)

# Get the vector dimension
vector_dimension = len(vectors_1[0])

# Create a dictionary with id and vectors
new_sentence_vectors_1 = {data_1[i]: vectors_1[i] for i in range(len(data_1))}

# Create a vector store. Select metric 'cosine' is the one I implemented (other possibility 'l1' or 'ip')
vector_store = VectorStore(vector_dimension, metric="cosine")

# Create the vector store, set persist to True to save the vector store on disk
# Implment persitence in the future
vector_store.create_vector_store(new_sentence_vectors_1, persist=False)

# Query the vector store
query = "I want to buy a car"
query_vector = model.encode(query)
similar_vectors = vector_store.get_similar_vectors(query_vector, num_results=5)


# Visualize the most similar vectors
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()

<All keys matched successfully>


['And as the sun rises once again, the cycle begins anew, a testament to the beauty and resilience of life.', 'The sleek, silver sports car raced down the winding mountain road, its engine roaring with power.', 'In every corner of the world, life goes on, a tapestry of moments both ordinary and extraordinary.', 'Families gather around tables, sharing meals and stories, laughter and love.', 'At night, the stars come out to play, dotting the sky with their shimmering brilliance.', 'A sleek, black sportbike hugged the curves of the mountain road, its rider leaning into each turn with precision.', 'As evening approaches, the sky transforms into a canvas of vibrant hues, painting a breathtaking sunset.', 'Nearby, a family enjoys a picnic, sharing sandwiches, fruits, and homemade lemonade.', 'A group of friends revved their engines, ready to hit the open road and leave the city behind.', "In quieter moments, individuals find solace in the city's hidden gems and peaceful corners.", 'Doctors a

### Adding update of the vector store

In [None]:
# Update the vector store
data_2 = data_2["text"].tolist()
vectors_2 = model.encode(data_2)
new_sentence_vectors_2 = {data_2[i]: vectors_2[i] for i in range(len(data_2))}
vector_store.update_vector_store(new_sentence_vectors_2)

# Query the vector store
query = "I want to buy a cycle"
query_vector = model.encode(query)
similar_vectors = vector_store.get_similar_vectors(query_vector, top_n=5)
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()

### Adding persistency

### Old way of using it

Might be helpfull as a look up

In [None]:
# Create a VectorStore instance
vector_store = VectorStore()

# Define your sentences
sentences = [
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Tokenization and Vocabulary Creation
vocabulary = set()
for sentence in sentences:
    tokens = sentence.lower().split()
    vocabulary.update(tokens)

# Assign unique indices to words in the vocabulary
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Vectorization
sentence_vectors = {}
for sentence in sentences:
    tokens = sentence.lower().split()
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        vector[word_to_index[token]] += 1
    sentence_vectors[sentence] = vector

# Storing in VectorStore
for sentence, vector in sentence_vectors.items():
    vector_store.add_vector(sentence, vector)

# Searching for Similarity
query_sentence = "Mango is the best fruit"
query_vector = np.zeros(len(vocabulary))
query_tokens = query_sentence.lower().split()
for token in query_tokens:
    if token in word_to_index:
        query_vector[word_to_index[token]] += 1

similar_sentences = vector_store.find_similar_vectors(query_vector, num_results=2)

# Print similar sentences
print("Query Sentence:", query_sentence)
print("Similar Sentences:")
for sentence, similarity in similar_sentences:
    print(f"{sentence}: Similarity = {similarity:.4f}")