# Vector Store Implementation

This is an implementation of a vector store that can leverage embedding models to create our vectors

### Imports

- Numpy
- Pands

##### Using files from:

http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [1]:
import numpy as np
import pandas as pd
import pickle
import os

### Index (Helper class)

Implementation of an Helper class index which is going to be used in my vector Store

In [2]:
class Index:
    def __init__(self, dim = None):
        self.dim = dim
        
        # Dictionary to store the vectors
        self.stored_vectors = {}

    def add_items(self, vectors, vectors_id) -> None:
        """
        Update the indexing structure for the vector store
        """
        for vector_id, vector in zip(vectors_id, vectors):
            self.stored_vectors[vector_id] = vector

    def find_similar_vectors(self, query_vector, top_n):
        """
        Compute the top n similar vectors to the query vector

        Args:
            query_vector (numpy.ndarray): The query vector
            top_n (int): The number of top similar vectors to return

        Returns:
            A list of the top n similar vectors
        """
        # Add option for different similarity measures
        similarities = self._cosine_similarity(query_vector)
        
        top_n_indices = np.argsort(similarities[0])[-top_n:][::-1]
        return top_n_indices
        # return [self.vectors[i] for i in top_n_indices]

    def _cosine_similarity(self, query_vector) -> float:
        """
        Compute the similarity between two vectors

        Args:
            query_vector (numpy.ndarray): The query vector
            vector (numpy.ndarray): The vector to compare

        Returns:
            The dot product of the vectors, normalized by the product of their norms
        """

        vectors = np.array(list(self.stored_vectors.values()))
        
        dot_product = np.dot(query_vector, vectors.T)
        query_vector_norm = np.linalg.norm(query_vector)
        vectors_norm = np.linalg.norm(vectors, axis=1)
        similarities = dot_product / (query_vector_norm * vectors_norm)
        
        return similarities 

### Vector Store class

In [3]:
class VectorStore:
    def __init__(self, vector_dimension = None, persist=True, persist_path="vector_store", metric="cosine"):

        if vector_dimension is None:
            print("You should pass the vector size")

        # Initialize our index our index
        self.index = Index(dim=vector_dimension)

        # Persistence
        self.persist = persist
        self.persist_path = persist_path
        
        self.vector_dimension = vector_dimension
        
        # A dictionary for indexing structure for retrieval
        self.similarity_index = {}
        
        # Counter to then store the ids of vectors
        self.id_counter : int = 0
        
        # Dictionary to store sentences corresponding to vectors
        self.sentences = {}


    def _load_vector_store(self):
        index_file = os.path.join(self.persist_path, "index.pkl")
        sentences_file = os.path.join(self.persist_path, "sentences.pkl")
        
        if not (os.path.exists(index_file) and os.path.exists(sentences_file)):
            raise FileNotFoundError("Index and sentences files not found in the specified directory.")

        with open(index_file, "rb") as f:
            self.index = pickle.load(f)
        with open(sentences_file, "rb") as f:
            self.sentences = pickle.load(f)

        return self.index, self.sentences

    def _save_vector_store(self):
        """
        Save the index and corresponding sentences
        """
        # Create the directory if it doesn't exist
        os.makedirs(self.persist_path, exist_ok=True)
        
        # Serialize and save the index
        with open(os.path.join(self.persist_path, "index.pkl"), "wb") as f:
            pickle.dump(self.index, f)

        # Serialize and save the sentences
        with open(os.path.join(self.persist_path, "sentences.pkl"), "wb") as f:
            pickle.dump(self.sentences, f)
    
    def update_vector_store(self, new_sentence_vectors):
        """
        Update the existing vector store with new vectors

        new_id_vectors: Dictionary containing new vectors to be added
        persist_path: Path to the directory where the existing vector store is saved
        """
        try:
            # load existing index and sentences
            self.index, self.sentences = self._load_vector_store()

            # Update the id counter
            self.id_counter = max(self.sentences.keys()) + 1

            # Add new vectors to the index and sentences
            vectors = []
            ids = []
            for sentence, vector in new_sentence_vectors.items():
                vectors.append(vector)
                ids.append(self.id_counter)
                self.sentences[self.id_counter] = sentence
                self.id_counter += 1
                
            # Adding the vectors, idnex to the our index
            self.index.add_items(vectors, ids)
            
            print("Vector store updated successfully", end="\n\n")
        except Exception as e:
            raise e

    def create_vector_store(self, new_sentence_vectors):
        """
        Add vectors to the vector store

        id: the unique id for the vector
        vecotor: the vector to be added
        """
        try:
            vectors = []
            ids = []
            for sentence, vector in new_sentence_vectors.items():
                # Append the new vector
                vectors.append(vector)
                # Assing a unique integer id to every vecotr
                ids.append(self.id_counter)
                # Store the sentence
                self.sentences[self.id_counter] = sentence
                # Incremeant the counter for the next vector
                self.id_counter += 1

            # Adding the items to the index
            self.index.add_items(vectors, ids)
    
            if self.persist:       
                 self._save_vector_store()
                
            print("Vector store created successfully", end="\n\n")
            
        except Exception as e:
            raise e

    def get_similar_vectors(self, query_vector, num_results=5) -> list:
        """
        Find similar vectors to the query vector

        Args:
            query_vector (numpy.ndarray): The query vector to compare with the vecotr in the store
            num_results (int): The number of similar vectors to return
            persist_path: Path to the directory where the existing vector store is saved

        Returns:
            A list of tuples, each containing a vector id and its similarity to the query vector
        """
        # load existing index and sentences
        self.index, self.sentences = self._load_vector_store()
        
        result = []
        # labels, distances = self.index.find_similar_vectors(query_vector, k=top_n)
        labels = self.index.find_similar_vectors(query_vector, top_n=num_results)

        return labels
        # similar_vectors = [(self.sentences[label], distance) for label, distance in zip(labels[0], distances[0])]
        
        """ 
        for vector_id, vector in self.stored_vectors.items():
            similarity = self._compute_similarity(query_vector, vector)
            result.append((vector_id, similarity))
            
        # Sort by the similarity in descending order
        result.sort(key=lambda x: x[1], reverse=True)
        
        # Return the num_results most similar ones
        return result[:num_results]
        """
        
        # return similar_vectors

### Demo of The vector store

Using nomic embed for the demo and a custom index

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the Embedding Model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Get data from csv
data = pd.read_csv("data.csv", delimiter=";")

# Take 80% of the data to create the vector store, remaining 20% to update the vector store
# To showcase the ability of updating the vector store
data_1 = data.sample(frac=0.8, random_state=200)
data_2 = data.drop(data_1.index)

# Convert data to list
# Instead of doing this I could decide to split documents in a smarter way with overlap or something

data_1 = data_1["text"].tolist()

# Encode the data, have data and vector in dictionary, data keys, vector values
vectors_1 = model.encode(data_1)

# Get the vector dimension
vector_dimension = len(vectors_1[0])

# Create a dictionary with id and vectors
new_sentence_vectors_1 = {data_1[i]: vectors_1[i] for i in range(len(data_1))}

# Create a vector store. Select metric 'cosine' is the one I implemented (other possibility 'l1' or 'ip')
vector_store = VectorStore(vector_dimension, metric="cosine")

# Create the vector store, set persist to True to save the vector store on disk
# Implment persitence in the future
vector_store.create_vector_store(new_sentence_vectors_1)

# Query the vector store
query = "I want to buy a car"
query_vector = model.encode(query)
# similar_vectors = vector_store.get_similar_vectors(query_vector, num_results=5)
index = vector_store.get_similar_vectors(query_vector, num_results=5)


"""
# Visualize the most similar vectors
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()
"""
print(index)

<All keys matched successfully>


Vector store created successfully

[0]


### Adding update of the vector store

In [5]:
# Update the vector store
data_2 = data_2["text"].tolist()
vectors_2 = model.encode(data_2)
new_sentence_vectors_2 = {data_2[i]: vectors_2[i] for i in range(len(data_2))}
vector_store.update_vector_store(new_sentence_vectors_2)

# Query the vector store
query = "I want to buy a cycle"
query_vector = model.encode(query)
similar_vectors = vector_store.get_similar_vectors(query_vector, top_n=5)
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()

NameError: name 'persist_path' is not defined

### Adding persistency

### Evaluate the system on a set of test queries.

### Old way of using it

Might be helpfull as a look up

In [None]:
# Create a VectorStore instance
vector_store = VectorStore()

# Define your sentences
sentences = [
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Tokenization and Vocabulary Creation
vocabulary = set()
for sentence in sentences:
    tokens = sentence.lower().split()
    vocabulary.update(tokens)

# Assign unique indices to words in the vocabulary
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Vectorization
sentence_vectors = {}
for sentence in sentences:
    tokens = sentence.lower().split()
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        vector[word_to_index[token]] += 1
    sentence_vectors[sentence] = vector

# Storing in VectorStore
for sentence, vector in sentence_vectors.items():
    vector_store.add_vector(sentence, vector)

# Searching for Similarity
query_sentence = "Mango is the best fruit"
query_vector = np.zeros(len(vocabulary))
query_tokens = query_sentence.lower().split()
for token in query_tokens:
    if token in word_to_index:
        query_vector[word_to_index[token]] += 1

similar_sentences = vector_store.find_similar_vectors(query_vector, num_results=2)

# Print similar sentences
print("Query Sentence:", query_sentence)
print("Similar Sentences:")
for sentence, similarity in similar_sentences:
    print(f"{sentence}: Similarity = {similarity:.4f}")