

[Open in Colab](https://colab.research.google.com/github/KoltonHauck/Agents_and_GraphRAG/blob/main/Agents&GraphRAG.ipynb)

# Imports

# Embedding Methods

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# Toy corpus shared by the BM25, TF-IDF and sentence-embedding demos below.
documents = [
    "RAG uses retrieval-augmented generation to enhance responses.",
    "BM25 is a ranking function used in information retrieval.",
    "TF-IDF measures the importance of terms in a document.",
    "Sentence transformers convert text into dense vector embeddings.",
    "LLMs can use embeddings for semantic search."
]

# BM25: index the corpus on single-space-split tokens, then score the whole
# corpus once per document, using that document's own tokens as the query.
tokenized_corpus = [text.split(" ") for text in documents]
bm25 = BM25Okapi(tokenized_corpus)
bm25_scores = {
    text: bm25.get_scores(tokens)
    for text, tokens in zip(documents, tokenized_corpus)
}

# TF-IDF: fit the vocabulary on the corpus and keep the dense document-term
# matrix together with the term belonging to each column.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

def get_tfidf_vector(doc):
    """Return a ``{term: weight}`` dict with the TF-IDF weight of every
    vocabulary term of the fitted vectorizer for the single text ``doc``."""
    weights = tfidf_vectorizer.transform([doc]).toarray()[0]
    return {term: weight for term, weight in zip(tfidf_feature_names, weights)}

# Per-document TF-IDF weights, keyed by the document text itself.
tfidf_vectors = dict(zip(documents, map(get_tfidf_vector, documents)))

# Dense sentence embeddings for the same corpus from a pretrained model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(documents, convert_to_numpy=True)

def print_embeddings():
    """Print the BM25 scores, the TF-IDF term weights and the leading
    dimensions of the sentence embedding for every corpus document."""
    print("\nBM25 Scores:")
    for doc in bm25_scores:
        print(f"{doc}: {bm25_scores[doc]}")

    print("\nTF-IDF Vectors:")
    for doc in tfidf_vectors:
        print(f"{doc}: {tfidf_vectors[doc]}")

    print("\nSentence Transformer Embeddings:")
    # Only the first five dimensions are shown to keep the output short.
    leading_dims = (emb[:5] for emb in embeddings)
    for doc, head in zip(documents, leading_dims):
        print(f"{doc}: {head}...")

# Run the demo: print all three representations of the sample corpus.
print_embeddings()


# Search

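- Cosine similarity: compares the direction of two embedding vectors (higher means more similar).
- Euclidean distance: the straight-line distance between two embedding vectors (lower means more similar).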

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer

# Two paraphrases whose embeddings are compared below.
sentence1 = "RAG enhances response generation using retrieval."
sentence2 = "Retrieval-augmented generation improves answer quality."

# Pretrained sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode each sentence on its own; every result is a 1-D vector.
embedding1, embedding2 = (
    model.encode(text, convert_to_numpy=True) for text in (sentence1, sentence2)
)

# scikit-learn's pairwise helpers expect 2-D input, so each vector is turned
# into a single-row matrix and the only cell of the result is read back.
row1 = embedding1.reshape(1, -1)
row2 = embedding2.reshape(1, -1)
cosine_sim = cosine_similarity(row1, row2)[0, 0]
euclidean_dist = euclidean_distances(row1, row2)[0, 0]

# Report both metrics with four decimals.
print(f"Cosine Similarity: {cosine_sim:.4f}")
print(f"Euclidean Distance: {euclidean_dist:.4f}")


# Vector Index

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Corpus to index.
documents = [
    "RAG uses retrieval-augmented generation to enhance responses.",
    "BM25 is a ranking function used in information retrieval.",
    "TF-IDF measures the importance of terms in a document.",
    "Sentence transformers convert text into dense vector embeddings.",
    "LLMs can use embeddings for semantic search."
]

# Pretrained sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding row per document.
embeddings = model.encode(documents, convert_to_numpy=True)

# Flat L2 index sized to the embedding width, filled with the corpus vectors.
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# The query is encoded as a one-element batch because the search call takes
# a matrix of query vectors.
query = "How do sentence transformers work?"
query_embedding = model.encode([query], convert_to_numpy=True)

# Two nearest neighbours; row 0 of each result belongs to the single query.
distances, indices = index.search(query_embedding, 2)

# Show the matched documents with their distances.
print("Query:", query)
for i, idx in enumerate(indices[0]):
    print(f"Match {i+1}: {documents[idx]} (Distance: {distances[0, i]:.4f})")


# RAG

In [None]:
import openai
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Knowledge base the RAG pipeline retrieves from.
documents = [
    "RAG uses retrieval-augmented generation to enhance responses.",
    "BM25 is a ranking function used in information retrieval.",
    "TF-IDF measures the importance of terms in a document.",
    "Sentence transformers convert text into dense vector embeddings.",
    "LLMs can use embeddings for semantic search."
]

# Pretrained sentence-embedding model used for both documents and queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding row per document.
embeddings = model.encode(documents, convert_to_numpy=True)

# Flat L2 index sized to the embedding width, filled with the corpus vectors.
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Function to retrieve relevant context
def retrieve_context(query, k=2):
    """Return the corpus documents nearest to ``query`` in embedding space.

    Args:
        query: Question text to embed and search with.
        k: Maximum number of documents to return.

    Returns:
        Up to ``k`` entries of ``documents``, nearest first. Fewer are
        returned when the index holds fewer than ``k`` vectors; an empty
        list is returned when ``k`` is not positive or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with index -1, which would silently select documents[-1].
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k=k)
    # Row 0 belongs to the single query; drop any remaining -1 padding.
    return [documents[i] for i in indices[0] if i >= 0]

# Function to generate response using OpenAI
def generate_response(query):
    """Answer ``query`` with GPT-4, grounded on retrieved context.

    The passages returned by ``retrieve_context`` are placed in the prompt
    ahead of the question.

    Args:
        query: The user's question.

    Returns:
        The text content of the first completion choice.
    """
    context = retrieve_context(query)
    # Join the passages as plain text; interpolating the list directly would
    # put its Python repr (brackets and quotes) into the prompt.
    context_text = "\n".join(context)
    prompt = f"Context: {context_text}\n\nQuestion: {query}\nAnswer:"
    messages = [{"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.ChatCompletion. The module-level client
        # still honours openai.api_key / OPENAI_API_KEY like the old call did.
        response = openai.chat.completions.create(model="gpt-4", messages=messages)
        return response.choices[0].message.content

    # Legacy interface for openai<1.0.
    response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
    return response["choices"][0]["message"]["content"]

# Example query
query = "How do sentence transformers work?"
response = generate_response(query)
print("Query:", query)
print("Response:", response)


# Knowledge Graph

In [None]:
from neo4j import GraphDatabase

import os

# Neo4j connection details. Values come from the environment so that real
# credentials never have to be written into the notebook; the fallbacks are
# the local-development placeholders.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "password"),
)

def create_graph(driver):
    """Wipe the database and load a small demo knowledge graph.

    All existing nodes and relationships are deleted first. One node is then
    created per entry of ``entities`` (labelled with its ``type``), followed
    by one relationship per ``(start, type, end)`` triple.

    Args:
        driver: Object whose ``session()`` returns a context-managed session
            exposing ``run(query, **parameters)``.

    Raises:
        ValueError: If a node label or relationship type is not a plain
            identifier and therefore unsafe to splice into Cypher.
    """
    entities = [
        {"name": "Retrieval-Augmented Generation", "type": "Concept"},
        {"name": "FAISS", "type": "Tool"},
        {"name": "Sentence Transformers", "type": "Model"},
        {"name": "Neo4j", "type": "Database"},
        # Endpoints used by the relationships below; without these nodes the
        # MATCH finds nothing and the relationships are silently skipped.
        {"name": "Knowledge Graph", "type": "Concept"},
        {"name": "Graph Queries", "type": "Concept"},
    ]

    relationships = [
        ("Retrieval-Augmented Generation", "USES", "FAISS"),
        ("Retrieval-Augmented Generation", "USES", "Sentence Transformers"),
        ("Knowledge Graph", "STORES", "Neo4j"),
        ("Neo4j", "SUPPORTS", "Graph Queries"),
    ]

    def _checked(identifier):
        # Labels and relationship types cannot be Cypher parameters, so they
        # are interpolated into the query text; only accept plain identifiers.
        if not identifier.isidentifier():
            raise ValueError(f"Unsafe Cypher identifier: {identifier!r}")
        return identifier

    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")  # Clear existing data

        for entity in entities:
            label = _checked(entity["type"])
            session.run(
                f"CREATE (n:`{label}` {{name: $name}})",
                name=entity["name"],
            )

        for start, rel, end in relationships:
            rel_type = _checked(rel)
            session.run(
                "MATCH (a {name: $start}), (b {name: $end}) "
                f"CREATE (a)-[:`{rel_type}`]->(b)",
                start=start,
                end=end,
            )

driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    create_graph(driver)
    print("Graph data ingested successfully.")
finally:
    # Always release the driver's connections, even when ingestion fails.
    driver.close()


# GraphRAG

In [None]:
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

import os

# Neo4j connection details. Values come from the environment so that real
# credentials never have to be written into the notebook; the fallbacks are
# the local-development placeholders.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "password"),
)

# Shared driver used by every helper in this cell; closed at the end of it.
driver = GraphDatabase.driver(URI, auth=AUTH)

# Create fulltext index
def create_fulltext_index():
    """Create the ``entity_search`` full-text index over the ``name`` and
    ``description`` properties of ``Entity`` nodes, unless it already exists."""
    cypher = """
        CREATE FULLTEXT INDEX entity_search IF NOT EXISTS
        FOR (n:Entity) ON EACH [n.name, n.description]
        """
    with driver.session() as db_session:
        db_session.run(cypher)

def create_vector_index():
    """Create the ``vector_index`` vector index on ``Entity.embedding``.

    The index is configured for 384-dimensional vectors compared with cosine
    similarity, and is left untouched when it already exists.
    """
    with driver.session() as session:
        # Vector indexes have their own DDL; a plain CREATE INDEX builds a
        # range index and rejects the vector provider and configuration.
        session.run("""
        CREATE VECTOR INDEX vector_index IF NOT EXISTS
        FOR (n:Entity) ON (n.embedding)
        OPTIONS {indexConfig: {`vector.dimensions`: 384, `vector.similarity_function`: 'cosine'}}
        """)

# Sample documents: each entry becomes one Entity node. "name" and
# "description" are stored as node properties, and the description text is
# what gets embedded during ingestion.
documents = [
    {"name": "Retrieval-Augmented Generation", "description": "A method that enhances LLM responses with document retrieval."},
    {"name": "FAISS", "description": "A library for efficient similarity search of embeddings."},
    {"name": "Sentence Transformers", "description": "A model that converts text into dense embeddings."},
    {"name": "Neo4j", "description": "A graph database used for knowledge graphs."}
]

# Load embedding model (used for both the stored descriptions and the queries)
model = SentenceTransformer("all-MiniLM-L6-v2")

def ingest_data():
    """Store every entry of ``documents`` as an ``Entity`` node.

    Each node carries the document's name, its description and the embedding
    of that description. Nodes are matched on ``name``, so running the cell
    again updates the existing nodes instead of creating duplicates.
    """
    with driver.session() as session:
        for doc in documents:
            # The driver needs a plain list of floats, not a numpy array.
            embedding = model.encode(doc["description"], convert_to_numpy=True).tolist()
            session.run("""
            MERGE (n:Entity {name: $name})
            SET n.description = $description, n.embedding = $embedding
            """, name=doc["name"], description=doc["description"], embedding=embedding)

def retrieve_using_fulltext(query):
    """Search the ``entity_search`` full-text index for ``query``.

    Args:
        query: Search text handed to the full-text index.

    Returns:
        A list of dicts with the keys ``node.name``, ``node.description``
        and ``score``, one per matching node.
    """
    # NOTE(review): the text reaches the index's query parser unescaped, so
    # characters such as ? * : " may act as operators; confirm whether
    # user input should be escaped first.
    with driver.session() as session:
        # The parameters go in a dict: the first positional argument of
        # Session.run is itself named `query`, so passing `query=` as a
        # keyword raises "got multiple values for argument 'query'".
        result = session.run("""
        CALL db.index.fulltext.queryNodes('entity_search', $query) YIELD node, score
        RETURN node.name, node.description, score
        """, {"query": query})
        return result.data()

def retrieve_using_vector(query):
    """Return the two ``Entity`` nodes whose embeddings are most similar to ``query``.

    Args:
        query: Text to embed and compare against the stored embeddings.

    Returns:
        A list of at most two dicts with the keys ``n.name``,
        ``n.description`` and ``score``, highest score first.
    """
    query_embedding = model.encode(query, convert_to_numpy=True).tolist()
    with driver.session() as session:
        # Cypher has no cosineSimilarity() function; the built-in is
        # vector.similarity.cosine(). It ranks nodes in the same order.
        result = session.run("""
        MATCH (n:Entity)
        RETURN n.name, n.description, vector.similarity.cosine(n.embedding, $query_embedding) AS score
        ORDER BY score DESC LIMIT 2
        """, query_embedding=query_embedding)
        return result.data()

try:
    # Setup indexes and ingest data
    create_fulltext_index()
    create_vector_index()
    ingest_data()

    # Example query, answered once through each retrieval path
    query = "How does retrieval-augmented generation work?"
    fulltext_results = retrieve_using_fulltext(query)
    vector_results = retrieve_using_vector(query)

    print("Fulltext Search Results:", fulltext_results)
    print("Vector Search Results:", vector_results)
finally:
    # Always release the driver's connections, even when a step above fails.
    driver.close()


# Graph Agent