# Vector Store Implementation

This is an implementation of a simple vector store that can use embedding models to create the vectors it stores.

### Imports

- Numpy
- Pandas

##### Using files from:

http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [5]:
import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'numpy'

### Vector DB class

In [7]:
class VectorStore:
    """
    A minimal in-memory vector store with cosine-similarity search.

    Vectors are L2-normalized when they are added and are kept in a
    dictionary keyed by their id. A pairwise similarity index between the
    stored vectors is maintained on every insert.
    """

    def __init__(self, vector_dimension = None, metric="cosine"):
        """
        Create an empty vector store

        Args:
            vector_dimension (int, optional): Expected length of every stored
                vector. When given, `add_vector` rejects vectors of any other
                shape. When omitted, a notice is printed and no shape check
                is performed.
            metric (str): Name of the similarity metric. It is recorded on the
                instance, but only cosine similarity is implemented; every
                search uses cosine similarity regardless of this value.
        """
        if vector_dimension is None:
            print("You should pass the vector size")

        # Vector dimension
        self.vector_dimension = vector_dimension
        # Requested metric name (informational only, see the docstring)
        self.metric = metric
        # Dictionary to store the vectors: vector_id -> normalized vector
        self.stored_vectors = {}
        # Pairwise similarities: vector_id -> {other_id -> dot product}
        self.similarity_index = {}

    def create_vector_store(self, new_sentence_vectors_1, persist=True):
        """
        (Re)build the store from a mapping of ids to vectors

        Any previously stored vectors and index entries are discarded first.

        Args:
            new_sentence_vectors_1 (dict): Mapping of vector id to vector
            persist (bool): Accepted for API compatibility but currently
                ignored; nothing is written to disk.
        """
        # TODO: honour `persist` once on-disk persistence is implemented.
        self.stored_vectors = {}
        self.similarity_index = {}
        self.update_vector_store(new_sentence_vectors_1)

    def update_vector_store(self, new_vectors) -> None:
        """
        Add a batch of vectors to the store, keeping the existing ones

        Args:
            new_vectors (dict): Mapping of vector id to vector. An id that is
                already stored has its vector replaced.
        """
        for vector_id, vector in new_vectors.items():
            self.add_vector(vector_id, vector)

    def add_vector(self, vector_id, vector) -> None:
        """
        Add a vector to the vector Database

        Args:
            vector_id (str or int): A unique id for the vector
            vector (numpy.ndarray): The vector to be stored

        Raises:
            ValueError: If the store was created with a vector dimension and
                the vector is not one-dimensional with that length
        """
        vector = np.asarray(vector)
        if (
            self.vector_dimension is not None
            and vector.shape != (self.vector_dimension,)
        ):
            raise ValueError(
                f"Expected a vector of shape ({self.vector_dimension},), "
                f"got {vector.shape}"
            )

        # Normalize the vector. An all-zero vector has no direction, so it is
        # stored unchanged instead of being divided by zero into NaNs.
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm

        self.stored_vectors[vector_id] = vector
        self._update_index(vector_id, vector)

    def get_vector(self, vector_id) -> np.ndarray:
        """
        Get a vector from the vector DB

        Args:
            vector_id (str or int): The id of the vector to retrieve

        Returns:
            The (normalized) vector with the given id, or None if it doesn't exist
        """
        return self.stored_vectors.get(vector_id)

    def _update_index(self, vector_id, vector) -> None:
        """
        Update the indexing structure for the vector DB

        Records the dot product between `vector` and every stored vector
        (including itself) in both directions, so that
        `similarity_index[a][b]` and `similarity_index[b][a]` are both set.

        Args:
            vector_id (str or int): The id of the vector that was just stored
            vector (numpy.ndarray): The stored (normalized) vector
        """
        own_row = self.similarity_index.setdefault(vector_id, {})
        for existing_id, existing_vector in self.stored_vectors.items():
            # Compute the dot product of the vectors
            similarity = np.dot(vector, existing_vector)
            self.similarity_index.setdefault(existing_id, {})[vector_id] = similarity
            own_row[existing_id] = similarity

    def get_similar_vectors(self, query_vector, num_results=5) -> list:
        """
        Find similar vectors to the query vector

        Args:
            query_vector (numpy.ndarray): The query vector
            num_results (int): The number of similar vectors to return

        Returns:
            A list of tuples, each containing a vector id and its similarity to the query vector,
            ordered from most to least similar
        """
        result = [
            (vector_id, self._compute_similarity(query_vector, vector))
            for vector_id, vector in self.stored_vectors.items()
        ]

        # Sort by the similarity in descending order
        result.sort(key=lambda x: x[1], reverse=True)

        # Return the num_results most similar ones
        return result[:num_results]

    def _compute_similarity(self, query_vector, vector) -> float:
        """
        Compute the similarity between two vectors

        Args:
            query_vector (numpy.ndarray): The query vector
            vector (numpy.ndarray): The vector to compare

        Returns:
            The dot product of the vectors, normalized by the product of their norms,
            or 0.0 when either vector has zero norm
        """
        norm_product = np.linalg.norm(query_vector) * np.linalg.norm(vector)
        if norm_product == 0:
            # Cosine similarity is undefined for a zero vector; report "no
            # similarity" rather than NaN, which would also break the sort.
            return 0.0
        return np.dot(query_vector, vector) / norm_product

### Example

Example of the usage of a very simple vector store 

In [None]:
# Define your sentences
sentences = [
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Tokenization and Vocabulary Creation
# NOTE: split() only splits on whitespace, so punctuation stays attached to
# the word ("mango," and "mango" are different tokens).
vocabulary = set()
for sentence in sentences:
    vocabulary.update(sentence.lower().split())

# Assign unique indices to words in the vocabulary. The set is sorted first
# so that the word -> index mapping is the same on every run.
word_to_index = {word: i for i, word in enumerate(sorted(vocabulary))}


def bag_of_words(text):
    """Return the word-count vector of `text`; words outside the vocabulary are ignored."""
    counts = np.zeros(len(word_to_index))
    for token in text.lower().split():
        index = word_to_index.get(token)
        if index is not None:
            counts[index] += 1
    return counts


# Create a VectorStore instance now that the vector size is known
vector_store = VectorStore(vector_dimension=len(vocabulary))

# Vectorization
sentence_vectors = {sentence: bag_of_words(sentence) for sentence in sentences}

# Storing in VectorStore
for sentence, vector in sentence_vectors.items():
    vector_store.add_vector(sentence, vector)

# Searching for Similarity
query_sentence = "Mango is the best fruit"
query_vector = bag_of_words(query_sentence)

similar_sentences = vector_store.get_similar_vectors(query_vector, num_results=2)

# Print similar sentences
print("Query Sentence:", query_sentence)
print("Similar Sentences:")
for sentence, similarity in similar_sentences:
    print(f"{sentence}: Similarity = {similarity:.4f}")

### Demo

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the Embedding Model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Get data from csv
data = pd.read_csv("data.csv", delimiter=";")

# Take 80% of the data to create the vector store and keep the remaining 20%
# to showcase updating the vector store afterwards
data_1 = data.sample(frac=0.8, random_state=200)
data_2 = data.drop(data_1.index)

# Convert data to list
data_1 = data_1["text"].tolist()

print(data_1)

# Encode the data: one embedding vector per text
vectors_1 = model.encode(data_1)

# Get the vector dimension
vector_dimension = len(vectors_1[0])

# Create a dictionary with the text as id and its embedding as value
# (identical texts share one key, so the last embedding wins)
new_sentence_vectors_1 = dict(zip(data_1, vectors_1))

# Create a vector store. Only the 'cosine' metric is implemented so far
# ('l1' and 'ip' are possible future options)
vector_store = VectorStore(vector_dimension, metric="cosine")

# Fill the vector store; persist=True is meant to save the vector store on disk
# TODO: implement persistence in the future
vector_store.create_vector_store(new_sentence_vectors_1, persist=False)

# Query the vector store
query = "I want to buy a car"
query_vector = model.encode(query)
# The keyword accepted by get_similar_vectors is `num_results`
similar_vectors = vector_store.get_similar_vectors(query_vector, num_results=5)
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()

### Updating the vector store

In [None]:
# Update the vector store with the held-out part of the data
data_2 = data_2["text"].tolist()
vectors_2 = model.encode(data_2)
# Text as id, embedding as value (identical texts share one key)
new_sentence_vectors_2 = dict(zip(data_2, vectors_2))
vector_store.update_vector_store(new_sentence_vectors_2)

# Query the vector store
query = "I want to buy a cycle"
query_vector = model.encode(query)
# The keyword accepted by get_similar_vectors is `num_results`
similar_vectors = vector_store.get_similar_vectors(query_vector, num_results=5)
print("Similarity Vectors:")
for sentence, similarity_score in similar_vectors:
    print(f"- Sentence: {sentence}")
    print(f"  Similarity Score: {similarity_score}")
    print()

### Adding persistence