[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KoltonHauck/BMI6016_VectorDB/blob/main/BMI6016-VectorDB.ipynb)

# Installing Dependencies

In [None]:
pip install openai numpy pandas scikit-learn sentence-transformers rank-bm25 faiss-cpu neo4j

# Imports

# Embedding Methods

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# Toy corpus shared by the embedding demos in this notebook.
documents = [
    "RAG uses retrieval-augmented generation to enhance responses.",
    "BM25 is a ranking function used in information retrieval.",
    "TF-IDF measures the importance of terms in a document.",
    "Sentence transformers convert text into dense vector embeddings.",
    "LLMs can use embeddings for semantic search.",
]

# BM25: tokenize on single spaces, build the index over the whole corpus, then
# use each document's own tokens as the query to get its score vector.
tokenized_corpus = [text.split(" ") for text in documents]
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = {
    text: bm25.get_scores(tokens)
    for text, tokens in zip(documents, tokenized_corpus)
}

# TF-IDF: fit the vectorizer on the corpus and keep both the dense matrix and
# the vocabulary that names its columns.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = (
    tfidf_vectorizer
    .fit_transform(documents)
    .toarray()
)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

def get_tfidf_vector(doc):
    """Return a ``{feature name: TF-IDF weight}`` mapping for ``doc``.

    The document is transformed with the module-level ``tfidf_vectorizer``
    and each resulting weight is paired with the matching entry of
    ``tfidf_feature_names``.
    """
    weights = tfidf_vectorizer.transform([doc]).toarray()[0]
    return {term: weight for term, weight in zip(tfidf_feature_names, weights)}

# One {term: weight} mapping per corpus document.
tfidf_vectors = {text: get_tfidf_vector(text) for text in documents}

# Sentence Transformer Embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(documents, convert_to_numpy=True)
# Row i of `embeddings` belongs to documents[i], so pair them up directly.
embeddingsDict = dict(zip(documents, embeddings))

# Show the three representations of the first document side by side.
test_doc = documents[0]
print(f"""Document: '{test_doc}'
                BM25 embedding (len: {len(bm25_scores[test_doc])}): {bm25_scores[test_doc]}
              TF-IDF embedding (len: {len(tfidf_vectors[test_doc])}): {tfidf_vectors[test_doc]}
Sentence Transformer embedding (len: {len(embeddingsDict[test_doc])}): {embeddingsDict[test_doc]}
""")


In [None]:
# Print every document followed by its BM25 score vector.
for text, scores in bm25_scores.items():
    print(text, scores)

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

def apply_pca(embedding_dict, n_components=2):
    """Reduce every embedding in ``embedding_dict`` with PCA.

    Args:
        embedding_dict: Mapping of key -> embedding vector. The values are
            stacked into a single array before PCA is fitted on them.
        n_components: Number of principal components to keep.

    Returns:
        A dict with the same keys, in the same order, whose values are the
        reduced vectors.
    """
    stacked = np.array([vector for vector in embedding_dict.values()])
    projected = PCA(n_components=n_components).fit_transform(stacked)
    # Row i of `projected` belongs to the i-th key of the input mapping.
    return dict(zip(embedding_dict.keys(), projected))

# Reduce each embedding family to two dimensions. The TF-IDF vectors are
# {term: weight} dicts, so only their weights are passed on.
bm25_pca = apply_pca(bm25_scores)
tfidf_pca = apply_pca(
    {text: list(weights.values()) for text, weights in tfidf_vectors.items()}
)
sentence_pca = apply_pca(embeddingsDict)

# Print PCA reduced embeddings
for heading, reduced in (
    ("BM25 PCA Reduced:", bm25_pca),
    ("TF-IDF PCA Reduced:", tfidf_pca),
    ("Sentence Transformer PCA Reduced:", sentence_pca),
):
    print(heading, reduced)

# Open the figure that the plotting code below draws into.
plt.figure(figsize=(8, 6))

# Extract coordinates
def extract_coordinates(pca_dict):
    """Stack the values of ``pca_dict`` into one array, in mapping order."""
    points = [point for point in pca_dict.values()]
    return np.array(points)

bm25_coords = extract_coordinates(bm25_pca)
tfidf_coords = extract_coordinates(tfidf_pca)
sentence_coords = extract_coordinates(sentence_pca)

# One entry per embedding method: (points, per-document mapping, colour, legend label).
series = (
    (bm25_coords, bm25_pca, 'red', 'BM25'),
    (tfidf_coords, tfidf_pca, 'blue', 'TF-IDF'),
    (sentence_coords, sentence_pca, 'green', 'Sentence Transformer'),
)

# Draw all scatter layers first, in the same order as the legend.
for coords, _, colour, legend_label in series:
    plt.scatter(coords[:, 0], coords[:, 1], color=colour, label=legend_label)

# Annotate points with the first two characters of each document.
for _, reduced, colour, _ in series:
    for text, point in reduced.items():
        plt.annotate(text[:2], (point[0], point[1]), fontsize=9, color=colour)

plt.title("PCA Projection of Embeddings")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()


In [None]:
# show embedding math

# Search

- Cosine similarity
- Euclidean distance

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer

# Stack the sentence embeddings in corpus order so that row/column i of each
# pairwise matrix corresponds to documents[i].
embedding_matrix = np.array([embeddingsDict[doc] for doc in documents])

# Compute every pairwise score with one call per metric instead of one call
# per document pair.
cosine_matrix = cosine_similarity(embedding_matrix)
euclidean_matrix = euclidean_distances(embedding_matrix)

# doc -> {other doc -> score}; a document is never compared with itself.
doc2cosine_sim = {}
doc2euclidean_dist = {}

for i, doc_i in enumerate(documents):
    doc2cosine_sim[doc_i] = {}
    doc2euclidean_dist[doc_i] = {}
    for j, doc_j in enumerate(documents):
        if doc_j == doc_i:
            continue
        doc2cosine_sim[doc_i][doc_j] = cosine_matrix[i][j]
        doc2euclidean_dist[doc_i][doc_j] = euclidean_matrix[i][j]


# Print, for each document, its cosine similarity to every other document.
for doc, score_results in doc2cosine_sim.items():
    print(doc)
    for doc_j, score in score_results.items():
        print(score, doc_j)
    print()

In [None]:
# rank and sort relevant 'documents' based on score

In [None]:
import numpy as np
import faiss

# Build a flat L2 FAISS index sized to the embedding width and load the
# sentence embeddings into it.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Embed the example question with the same model used for the corpus.
query = "How do sentence transformers work?"
query_embedding = model.encode([query], convert_to_numpy=True)

# Look up the two nearest corpus vectors.
distances, indices = index.search(query_embedding, k=2)

# Report the matches for the single query (row 0 of each result array).
print("Query:", query)
for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"Match {rank}: {documents[idx]} (Distance: {dist:.4f})")


# RAG

In [None]:
from openai import OpenAI
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

import os

# Read the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be pasted into the notebook. When the variable is unset this
# falls back to the same empty placeholder as before; set the variable (or
# replace the fallback) before calling the API.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "")
)

# Function to retrieve relevant context
def retrieve_context(query, k=2):
    """Return up to ``k`` corpus documents nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; the hit positions are mapped back to
    entries of ``documents``.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        A list of entries from ``documents``, in the order the index returned
        them. It is shorter than ``k`` when the index holds fewer vectors.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k=k)
    # FAISS pads the result with -1 when fewer than k neighbours exist;
    # without this filter -1 would silently select documents[-1].
    return [documents[i] for i in indices[0] if i >= 0]

# Function to generate response using OpenAI
def generate_response(query, client, model_name="gpt-4"):
    """Answer ``query`` with a chat model, grounded on retrieved context.

    Args:
        query: The user's question.
        client: Object exposing ``chat.completions.create(...)`` (the OpenAI
            client created above is what the notebook passes).
        model_name: Chat model identifier sent to the API. Defaults to the
            previously hard-coded ``"gpt-4"``.

    Returns:
        The ``message.content`` of the first choice in the API response.
    """
    context = retrieve_context(query)
    # Join the passages into plain text, one per line. Interpolating the list
    # itself would put its Python repr (brackets, quotes, escapes) in the prompt.
    context_text = "\n".join(str(passage) for passage in context)
    prompt = f"Context: {context_text}\n\nQuestion: {query}\nAnswer:"
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

# Run the retrieval + generation pipeline end to end on one question.
query = "How do sentence transformers work?"
response = generate_response(query, client)
print(f"Query: {query}")
print(f"Response: {response}")


# Knowledge Graph

In [None]:
from neo4j import GraphDatabase

# Neo4j connection details
# URI is the Bolt endpoint handed to GraphDatabase.driver below.
URI = "bolt://localhost:7687"  # Change as needed
# AUTH is passed as the driver's `auth` argument; the values here are
# placeholders, not working credentials.
AUTH = ("neo4j", "password")  # Replace with actual credentials

def create_graph(driver):
    """Reset the database and load a small demo knowledge graph.

    Deletes every existing node and relationship, then creates one labelled
    node per entity and one relationship per ``(start, type, end)`` triple.

    Args:
        driver: An open Neo4j driver; only ``driver.session()`` is used.

    Raises:
        ValueError: If an entity type or relationship type is not a plain
            identifier (letters, digits, underscore).
    """
    import re

    identifier = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")

    def checked(name):
        # Labels and relationship types cannot be passed as query parameters,
        # so they are spliced into the query text. Only plain identifiers are
        # accepted to keep that splice safe.
        if not identifier.fullmatch(name):
            raise ValueError(f"Invalid Cypher identifier: {name!r}")
        return name

    # Every relationship endpoint below needs a node here; otherwise its MATCH
    # finds nothing and the relationship is silently not created.
    entities = [
        {"name": "Retrieval-Augmented Generation", "type": "Concept"},
        {"name": "FAISS", "type": "Tool"},
        {"name": "Sentence Transformers", "type": "Model"},
        {"name": "Neo4j", "type": "Database"},
        {"name": "Knowledge Graph", "type": "Concept"},
        {"name": "Graph Queries", "type": "Concept"},
    ]

    relationships = [
        ("Retrieval-Augmented Generation", "USES", "FAISS"),
        ("Retrieval-Augmented Generation", "USES", "Sentence Transformers"),
        ("Knowledge Graph", "STORES", "Neo4j"),
        ("Neo4j", "SUPPORTS", "Graph Queries"),
    ]

    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")  # Clear existing data

        for entity in entities:
            label = checked(entity["type"])
            session.run(
                f"CREATE (n:`{label}` {{name: $name}})",
                name=entity["name"],
            )

        for start, rel, end in relationships:
            rel_type = checked(rel)
            session.run(
                "MATCH (a {name: $start}), (b {name: $end}) "
                f"CREATE (a)-[:`{rel_type}`]->(b)",
                start=start,
                end=end,
            )

# Use the driver as a context manager so it is closed even when
# create_graph() raises; the success message is printed only after ingestion.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    create_graph(driver)
    print("Graph data ingested successfully.")


# GraphRAG

In [None]:
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Neo4j connection details
# URI is the Bolt endpoint; AUTH is passed as the driver's `auth` argument.
# Both hold placeholder values.
URI = "bolt://localhost:7687"  # Adjust as needed
AUTH = ("neo4j", "password")  # Replace with actual credentials

# Module-level driver shared by every helper function in this cell; it is
# closed by the driver.close() call at the end of the cell.
driver = GraphDatabase.driver(URI, auth=AUTH)

# Create fulltext index
def create_fulltext_index():
    """Create the ``entity_search`` full-text index unless it already exists.

    The index covers the ``name`` and ``description`` properties of
    ``Entity`` nodes and is created through the module-level ``driver``.
    """
    statement = """
        CREATE FULLTEXT INDEX entity_search IF NOT EXISTS
        FOR (n:Entity) ON EACH [n.name, n.description]
        """
    with driver.session() as session:
        session.run(statement)

def create_vector_index():
    """Create the ``vector_index`` vector index unless it already exists.

    The index covers the ``embedding`` property of ``Entity`` nodes, with 384
    dimensions and cosine similarity. The dimension must equal the length of
    the vectors stored by ``ingest_data``.

    Uses the dedicated ``CREATE VECTOR INDEX`` command. The previous
    ``CREATE INDEX ... OPTIONS {indexProvider: 'vector-1.0'}`` form is not
    the vector-index syntax.
    NOTE(review): ``CREATE VECTOR INDEX`` needs a Neo4j 5.x server recent
    enough to support it; confirm against the target server version.
    """
    with driver.session() as session:
        session.run("""
        CREATE VECTOR INDEX vector_index IF NOT EXISTS
        FOR (n:Entity) ON (n.embedding)
        OPTIONS {indexConfig: {`vector.dimensions`: 384, `vector.similarity_function`: 'cosine'}}
        """)

# Sample documents
# Each entry becomes one Entity node: `name` and `description` are stored as
# node properties and `description` is the text that gets embedded.
# NOTE(review): this rebinds the module-level `documents` (a list of strings
# in the earlier cells) to a list of dicts, so retrieve_context() will return
# dicts if it is called after this cell has run.
documents = [
    {"name": "Retrieval-Augmented Generation", "description": "A method that enhances LLM responses with document retrieval."},
    {"name": "FAISS", "description": "A library for efficient similarity search of embeddings."},
    {"name": "Sentence Transformers", "description": "A model that converts text into dense embeddings."},
    {"name": "Neo4j", "description": "A graph database used for knowledge graphs."}
]

# Load embedding model
# Same model name as the earlier embedding cell; reloaded here and bound to
# the same module-level name `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")

def ingest_data():
    """Upsert every entry of ``documents`` as an ``Entity`` node.

    Each node gets the entry's ``name`` and ``description`` plus an
    ``embedding`` list computed from the description with the module-level
    ``model``. Nodes are matched on ``name`` with ``MERGE``, so re-running
    the cell updates existing nodes instead of creating duplicates.
    """
    if not documents:
        return

    # Encode all descriptions in one batch rather than one model call each.
    descriptions = [doc["description"] for doc in documents]
    vectors = model.encode(descriptions, convert_to_numpy=True)

    with driver.session() as session:
        for doc, vector in zip(documents, vectors):
            session.run("""
            MERGE (n:Entity {name: $name})
            SET n.description = $description, n.embedding = $embedding
            """, name=doc["name"], description=doc["description"], embedding=vector.tolist())

def retrieve_using_fulltext(query):
    """Search the ``entity_search`` full-text index for ``query``.

    Args:
        query: Free-text search string. Lucene query-syntax characters are
            escaped, so the text is matched literally rather than being
            interpreted as operators or wildcards.

    Returns:
        A list of records (dicts) with keys ``node.name``,
        ``node.description`` and ``score``; an empty list for a blank query.
    """
    import re

    # A blank string is not a valid full-text query, so answer it locally.
    if not query or not query.strip():
        return []

    # Escape Lucene special characters so that natural-language input
    # (e.g. a trailing "?") cannot act as a wildcard or break the parser.
    escaped = re.sub(r'(&&|\|\||[+\-!(){}\[\]^"~*?:\\/])', r'\\\1', query)

    with driver.session() as session:
        # Parameters go in a dict: passing `query=` as a keyword argument
        # collides with the first positional parameter of session.run(),
        # which is itself named `query`.
        result = session.run("""
        CALL db.index.fulltext.queryNodes('entity_search', $query) YIELD node, score
        RETURN node.name, node.description, score
        """, {"query": escaped})
        return result.data()

def retrieve_using_vector(query, k=2):
    """Return the ``k`` ``Entity`` nodes whose embedding is closest to ``query``.

    The query is embedded with the module-level ``model`` and compared with
    each node's stored ``embedding`` inside the database.

    Args:
        query: Text to search for.
        k: Maximum number of records to return (default 2, the previously
            hard-coded limit).

    Returns:
        A list of records (dicts) with keys ``n.name``, ``n.description`` and
        ``score``, best match first.
    """
    query_embedding = model.encode(query, convert_to_numpy=True).tolist()
    with driver.session() as session:
        # `vector.similarity.cosine` is the built-in Cypher similarity
        # function; the bare name `cosineSimilarity` is not defined.
        # NOTE(review): requires a Neo4j release that ships
        # vector.similarity.*; its score is normalised rather than the raw
        # cosine - confirm the range against the server docs.
        # Nodes without an embedding are skipped so a null score cannot sort
        # ahead of real matches under ORDER BY ... DESC.
        result = session.run("""
        MATCH (n:Entity)
        WHERE n.embedding IS NOT NULL
        RETURN n.name, n.description, vector.similarity.cosine(n.embedding, $query_embedding) AS score
        ORDER BY score DESC LIMIT $k
        """, query_embedding=query_embedding, k=k)
        return result.data()

# Run the whole demo inside try/finally so the module-level driver is closed
# even when index creation, ingestion, or a query raises.
try:
    # Setup indexes and ingest data
    create_fulltext_index()
    create_vector_index()
    ingest_data()

    # Example query
    query = "How does retrieval-augmented generation work?"
    fulltext_results = retrieve_using_fulltext(query)
    vector_results = retrieve_using_vector(query)

    print("Fulltext Search Results:", fulltext_results)
    print("Vector Search Results:", vector_results)
finally:
    driver.close()


# Graph Agent