# Vector Store Implementation

This notebook implements a vector store that relies on embedding models to create the vectors it stores.

### Imports

- Numpy
- Pandas

##### Using files from:

http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [1]:
import numpy as np
import pandas as pd
import pickle
import os

### Index (Helper class)

Implementation of a helper `Index` class that is used by the vector store.

In [2]:
class Index:
    """
    Brute-force in-memory index mapping vector ids to vectors.

    Queries compare the query vector against every stored vector using
    cosine similarity.
    """

    def __init__(self, dim=None):
        """
        Args:
            dim (int): Expected length of every stored vector.
        """
        self.dim = dim

        # Dictionary to store the vectors, keyed by vector id
        self.stored_vectors = {}

    def add_items(self, vectors, vectors_id):
        """
        Update the indexing structure for the vector store

        Args:
            vectors: Iterable of 1-D vectors (numpy arrays or array-likes).
            vectors_id: Iterable of ids, paired positionally with `vectors`.

        Raises:
            ValueError: If any vector does not have shape (dim,). In that
                case nothing from this call is stored.
        """
        # Validate the whole batch before storing anything, so that a bad
        # vector in the middle cannot leave the index partially updated.
        prepared = []
        for vector_id, vector in zip(vectors_id, vectors):
            array = np.asarray(vector)
            if array.shape != (self.dim,):
                raise ValueError("Vectors must have shape (dim,)")
            prepared.append((vector_id, array))

        self.stored_vectors.update(prepared)

    def knn_query(self, query_vector: np.ndarray, top_n: int = 5):
        """
        Find the top n similar vectors to the query vector using cosine similarity.

        Args:
            query_vector (numpy.ndarray): The query vector.
            top_n (int): The number of top similar vectors to return.

        Returns:
            A pair of tuples: the first holds the ids of the top n similar
            vectors (most similar first) and the second holds the
            corresponding cosine similarity scores. Both tuples are empty
            when the index holds no vectors or `top_n` is not positive.
        """
        # zip(*[]) cannot be unpacked into two names, so handle "nothing to
        # return" explicitly instead of letting it crash.
        if top_n <= 0 or not self.stored_vectors:
            return (), ()

        scored = [
            (vector_id, self._cosine_similarity(query_vector, vector))
            for vector_id, vector in self.stored_vectors.items()
        ]

        # Highest similarity first; keep only the first top_n entries
        scored.sort(key=lambda pair: pair[1], reverse=True)
        top_n_indices, top_n_similarities = zip(*scored[:top_n])

        return top_n_indices, top_n_similarities

    def _cosine_similarity(self, query_vector, vector) -> float:
        """
        Compute the similarity between two vectors

        Args:
            query_vector (numpy.ndarray): The query vector
            vector (numpy.ndarray): The vector to compare

        Returns:
            The dot product of the vectors, normalized by the product of
            their norms, or 0.0 when either vector has zero norm.
        """
        norm_product = np.linalg.norm(query_vector) * np.linalg.norm(vector)

        # A zero-norm vector has no direction; dividing would yield NaN,
        # which makes the ranking in knn_query unpredictable.
        if norm_product == 0:
            return 0.0

        return np.dot(query_vector, vector) / norm_product

### Vector Store class

In [3]:
class VectorStore:
    """
    Store sentences together with their vectors and retrieve the most
    similar sentences for a query vector.

    When `persist` is True the files under `persist_path` ("index.pkl" and
    "sentences.pkl") are the source of truth: they are written after every
    create/update and re-read before updates and queries. When `persist` is
    False the store works purely on its in-memory state.
    """

    def __init__(self, vector_dimension=None, persist=True, persist_path="vector_store"):
        """
        Args:
            vector_dimension (int): Length of the vectors to be stored.
            persist (bool): Whether the store is saved to / loaded from disk.
            persist_path (str): Directory holding the pickled store.

        Raises:
            ValueError: If `vector_dimension` is not given.
        """
        if vector_dimension is None:
            raise ValueError("You should pass the vector size")

        # Initialize our index
        self.index = Index(dim=vector_dimension)
        self.persist = persist
        self.persist_path = persist_path
        self.vector_dimension = vector_dimension

        # Counter used to assign the id of the next stored vector
        self.id_counter = 0

        # Dictionary to store sentences corresponding to vectors (id -> sentence)
        self.sentences = {}

    def _load_vector_store(self):
        """
        Replace the in-memory index and sentences with the ones saved on disk.

        Returns:
            The loaded (index, sentences) pair.

        Raises:
            FileNotFoundError: If either pickle file is missing.
        """
        index_file = os.path.join(self.persist_path, "index.pkl")
        sentences_file = os.path.join(self.persist_path, "sentences.pkl")

        if not (os.path.exists(index_file) and os.path.exists(sentences_file)):
            raise FileNotFoundError("Index and sentences files not found in the specified directory.")

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # persist_path at directories written by this class.
        with open(index_file, "rb") as f:
            self.index = pickle.load(f)
        with open(sentences_file, "rb") as f:
            self.sentences = pickle.load(f)

        return self.index, self.sentences

    def save_vector_store(self):
        """Save the index and corresponding sentences under `persist_path`."""
        # Create the directory if it doesn't exist
        os.makedirs(self.persist_path, exist_ok=True)

        # Serialize and save the index
        with open(os.path.join(self.persist_path, "index.pkl"), "wb") as f:
            pickle.dump(self.index, f)

        # Serialize and save the sentences
        with open(os.path.join(self.persist_path, "sentences.pkl"), "wb") as f:
            pickle.dump(self.sentences, f)

    def _add_sentence_vectors(self, new_sentence_vectors):
        """Assign consecutive ids to the given sentences and add their vectors to the index."""
        next_id = self.id_counter
        staged_sentences = {}
        vectors = []
        for sentence, vector in new_sentence_vectors.items():
            staged_sentences[next_id] = sentence
            vectors.append(vector)
            next_id += 1

        # Add to the index first: if it rejects a vector, the sentences and
        # the id counter are left untouched.
        self.index.add_items(vectors, list(staged_sentences))

        self.sentences.update(staged_sentences)
        self.id_counter = next_id

    def create_vector_store(self, new_sentence_vectors):
        """
        Add vectors to the vector store

        Args:
            new_sentence_vectors (dict): Maps each sentence to its vector.
                Every entry receives a unique integer id.
        """
        self._add_sentence_vectors(new_sentence_vectors)

        if self.persist:
            self.save_vector_store()

        print("Vector store created successfully", end="\n\n")

    def update_vector_store(self, new_sentence_vectors):
        """
        Update the existing vector store with new vectors

        Args:
            new_sentence_vectors (dict): Maps each new sentence to its vector.

        Raises:
            FileNotFoundError: If the store is persistent and no saved store
                exists under `persist_path`.
        """
        if self.persist:
            # Load existing index and sentences
            self._load_vector_store()

        # Continue numbering after the highest existing id (0 for an empty store)
        self.id_counter = max(self.sentences, default=-1) + 1

        # Add new vectors to the index and sentences
        self._add_sentence_vectors(new_sentence_vectors)

        # get_similar_vectors reloads from disk for a persistent store, so an
        # update that is not written back would be discarded by the next query.
        if self.persist:
            self.save_vector_store()

        print("Vector store updated successfully", end="\n\n")

    def delete_vector_store(self) -> None:
        """
        Delete a persistent vector store that was created

        Removes "index.pkl" and "sentences.pkl" from `persist_path` when
        present; the directory itself is left in place. Calling this when
        nothing is stored is not an error.
        """
        removed_any = False
        for file_name in ("index.pkl", "sentences.pkl"):
            try:
                os.remove(os.path.join(self.persist_path, file_name))
            except FileNotFoundError:
                # Missing file (or missing directory): nothing to delete
                continue
            removed_any = True

        if removed_any:
            print("Vector store deleted successfully", end="\n\n")
        else:
            print("Vector store does not exist", end="\n\n")

    def get_similar_vectors(self, query_vector, top_n=5) -> list:
        """
        Find similar vectors to the query vector

        Args:
            query_vector (numpy.ndarray): The query vector to compare with the vectors in the store
            top_n (int): The number of similar vectors to return

        Returns:
            A list of tuples, each containing a stored sentence and its
            similarity to the query vector, as ranked by the index.

        Raises:
            FileNotFoundError: If the store is persistent and no saved store
                exists under `persist_path`.
        """
        if self.persist:
            # Load existing index and sentences
            self._load_vector_store()

        labels, distances = self.index.knn_query(query_vector, top_n=top_n)

        return [(self.sentences[label], distance) for label, distance in zip(labels, distances)]

## Demo of The vector store

Using nomic embed for the demo and a custom index

#### TextSplitter and Retriever classes

In [4]:
# Pretty print
import termcolor

class TextSplitter:
    """Read a ';'-delimited CSV file and split its rows into two random parts."""

    def __init__(self, data_path, split_ratio=0.8, random_state=200):
        """
        Args:
            data_path: Path of the CSV file to read.
            split_ratio: Fraction of the rows sampled into the first part.
            random_state: Seed passed to the sampling, for reproducibility.
        """
        frame = pd.read_csv(data_path, delimiter=";")
        sampled = frame.sample(frac=split_ratio, random_state=random_state)

        self.data = frame
        self.data_1 = sampled
        # Everything that was not sampled goes into the second part
        self.data_2 = frame.drop(sampled.index)

    def get_data_split(self):
        """Return the "text" column of both parts as a pair of lists."""
        first_texts, second_texts = (
            part["text"].tolist() for part in (self.data_1, self.data_2)
        )
        return first_texts, second_texts

class Retriever:
    """
    Couple an embedding model with a VectorStore: sentences are embedded
    with the model and stored, and text queries are embedded and matched
    against the stored sentences.

    NOTE(review): `SentenceTransformer` is not imported in the visible part
    of this notebook; presumably `from sentence_transformers import
    SentenceTransformer` is required before this cell - confirm.
    """

    def __init__(self, model_name="nomic-ai/nomic-embed-text-v1.5", persist=True, persist_path="test"):
        """
        Args:
            model_name: Name of the embedding model to load.
            persist: Passed to the VectorStore (save to / load from disk).
            persist_path: Directory the VectorStore uses for its files.
        """
        self.model = SentenceTransformer(model_name, trust_remote_code=True)

        # VectorStore refuses to be built without a vector size, so take it
        # from the model that will produce the vectors.
        self.vector_store = VectorStore(
            vector_dimension=self.model.get_sentence_embedding_dimension(),
            persist=persist,
            persist_path=persist_path,
        )

    def create_vector_store(self, data):
        """Embed every sentence in `data` and add the results to the vector store."""
        vectors = self.model.encode(data)
        new_sentence_vectors = dict(zip(data, vectors))
        self.vector_store.create_vector_store(new_sentence_vectors)

    def update_vector_store_data(self, data):
        """
        Embed every sentence in `data` and add the results to the existing store.

        Returns:
            The dictionary mapping each sentence to its vector.
        """
        vectors = self.model.encode(data)
        new_sentence_vectors = dict(zip(data, vectors))
        self.vector_store.update_vector_store(new_sentence_vectors)
        return new_sentence_vectors

    def query_similar_vectors(self, query, top_n=5):
        """
        Embed the text `query` and return the `top_n` most similar stored
        sentences, as returned by the vector store.
        """
        query_vector = self.model.encode(query)
        return self.vector_store.get_similar_vectors(query_vector, top_n=top_n)

    def print_similar_vectors(self, similar_vectors):
        """Print each (sentence, similarity score) pair in color."""
        print("Similarity Vectors:")
        for sentence, similarity_score in similar_vectors:
            print(termcolor.colored(f"- Sentence: {sentence}", "green", "on_grey", ["bold"]))
            print(termcolor.colored(f"  Similarity Score: {similarity_score}", "yellow", "on_grey", ["bold"]))
            print()

ModuleNotFoundError: No module named 'termcolor'

### Showcase

In [None]:
# Read "data.csv" and split its texts into two lists; data_2 is kept for the
# update step further down.
text_splitter = TextSplitter("data.csv")
data_1, data_2 = text_splitter.get_data_split()

# Embed the first part and build the vector store from it
retriever = Retriever()
retriever.create_vector_store(data_1)

# Embed the query text and fetch the most similar stored sentences
query = "I want to buy a car"
similar_vectors = retriever.query_similar_vectors(query)

retriever.print_similar_vectors(similar_vectors)

### Adding update of the vector store

In [None]:
# Update the vector store
# data_2 is already the list of texts returned by get_data_split(), so it is
# passed as is (indexing a list with ["text"] would raise a TypeError).
new_sentence_vectors_2 = retriever.update_vector_store_data(data_2)

# Query the vector store
# query_similar_vectors() embeds the text itself, so it receives the raw
# query string rather than an already-encoded vector.
query = "I want to buy a cycle"
similar_vectors = retriever.query_similar_vectors(query, top_n=5)
retriever.print_similar_vectors(similar_vectors)

### Adding persistency

In [None]:
# Delete saved vector store
retriever.vector_store.delete_vector_store()

### Evaluate the system on a set of test queries.

### Old way of using it

Might be helpful as a lookup

In [None]:
# Define your sentences
sentences = [
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Tokenization and Vocabulary Creation
vocabulary = set()
for sentence in sentences:
    vocabulary.update(sentence.lower().split())

# Assign unique indices to words in the vocabulary
# (sorted so that the word -> index mapping is the same on every run)
word_to_index = {word: i for i, word in enumerate(sorted(vocabulary))}

# Create a VectorStore instance
# The store needs the vector size up front, which here is the vocabulary size.
# persist=False keeps this small demo in memory only.
vector_store = VectorStore(vector_dimension=len(vocabulary), persist=False)

# Vectorization: one word-count vector per sentence
sentence_vectors = {}
for sentence in sentences:
    vector = np.zeros(len(vocabulary))
    for token in sentence.lower().split():
        vector[word_to_index[token]] += 1
    sentence_vectors[sentence] = vector

# Storing in VectorStore
vector_store.create_vector_store(sentence_vectors)

# Searching for Similarity
query_sentence = "Mango is the best fruit"
query_vector = np.zeros(len(vocabulary))
for token in query_sentence.lower().split():
    # Words outside the vocabulary have no dimension and are ignored
    if token in word_to_index:
        query_vector[word_to_index[token]] += 1

similar_sentences = vector_store.get_similar_vectors(query_vector, top_n=2)

# Print similar sentences
print("Query Sentence:", query_sentence)
print("Similar Sentences:")
for sentence, similarity in similar_sentences:
    print(f"{sentence}: Similarity = {similarity:.4f}")