In [1]:

%%capture
!pip install faiss-cpu sentence-transformers langchain langchain_community langchain-huggingface langchain-ollama


In [2]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Load your CSV dataset
df = pd.read_csv('/mnt/f/datasets/data.csv')

# Filter for valid full content
texts = df['full_content'].dropna().sample(n=100, random_state=42).tolist()

# Show how many articles we kept
print(f"Using {len(texts)} non-null articles for the vector store.")

# Initialize embeddings and build FAISS index
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(texts, embeddings)


Using 100 non-null articles for the vector store.


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [48]:

from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

llm = OllamaLLM(model="mistral")

krag_template = PromptTemplate(
    input_variables=["context", "triples", "question"],
    template="""
Context:
{context}

Knowledge Graph Triples:
{triples}

Question:
{question}

Answer:
""")

krag_chain = krag_template | llm | StrOutputParser()

rag_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Context:
{context}

Question:
{question}

Answer:
""")

rag_chain = rag_template | llm | StrOutputParser()


In [49]:
import spacy
import networkx as nx
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Extract subject-predicate-object triples from each sentence
def extract_relationships(doc):
    relationships = []
    for sent in doc.sents:
        root = sent.root
        subj = next((child for child in root.children if child.dep_ == "nsubj"), None)
        obj = next((child for child in root.children if child.dep_ in ["dobj", "pobj"]), None)
        if subj and obj:
            relationships.append((subj.text, root.lemma_, obj.text))
    return relationships

# Build KG with NetworkX
G = nx.DiGraph()
all_triples = []

for text in texts:  # 'texts' is your list of sampled full_content entries
    doc = nlp(text)
    triples = extract_relationships(doc)
    all_triples.extend(triples)
    for subj, pred, obj in triples:
        G.add_edge(subj, obj, relation=pred)

# Prepare for embedding
triple_texts = [f"{s} —[{p}]→ {o}" for s, p, o in G.edges(data="relation")]
triple_tuples = [(s, p, o) for s, o, p in G.edges(data="relation")]

# Embed triples using sentence-transformers
embedder = SentenceTransformer("all-MiniLM-L6-v2")
triple_embeddings = embedder.encode(triple_texts)

# Build FAISS index for triple search
index = faiss.IndexFlatL2(triple_embeddings.shape[1])
index.add(np.array(triple_embeddings))


In [50]:
# show KG 
import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 7))
# pos = nx.spring_layout(G, k=0.5)
# nx.draw(G, pos, with_labels=True, node_size=500, font_size=8)
# edge_labels = nx.get_edge_attributes(G, 'relation')
# nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7)
# plt.title("KG")
# plt.show()

In [76]:
# Limit to top N nodes by degree (most connected)
N = 50
degree_dict = dict(G.degree())
top_nodes = sorted(degree_dict, key=degree_dict.get, reverse=True)[:N]
subgraph = G.subgraph(top_nodes)

# plt.figure(figsize=(12, 9))
# pos = nx.spring_layout(subgraph, k=0.7)
# nx.draw(subgraph, pos, with_labels=True, node_size=800, font_size=9, node_color="#69b3a2")
# edge_labels = nx.get_edge_attributes(subgraph, 'relation')
# nx.draw_networkx_edge_labels(subgraph, pos, edge_labels=edge_labels, font_size=7)
# plt.title("Top 50 Entity Knowledge Graph")
# plt.axis('off')
# plt.show()

In [84]:
# Format triples for LLM prompt
def format_triples(triples):
    return [f"{s} —[{p}]→ {o}" for s, p, o in triples]

# Score recall based on how many ground truth triples are reflected in the answer for KRAG 
def score_recall_KRAG(triples, answer):
    total_parts = 0
    matched_parts = 0
    for s, p, o in triples:
        for part in [s, p, o]: # GT Subject, predicate, and object 
            total_parts += 1
            if part.lower() in answer.lower(): # if the subject, predicate, and object ground truth appear in LLM answer, then increment recall score
                # print("triple used: ", part.lower())
                matched_parts += 1
    print("Triple Count:", matched_parts) # threshold 
    return matched_parts / total_parts if total_parts else 0

# Score recall based on how well ground truth context is reflected in the answer for RAG 
def score_recall_RAG(context, answer): 
    print("number of words in context also in answer: ", sum(1 for word in answer.split() if word.lower() in context.lower())) # threshold 
    return sum(1 for word in answer.split() if word.lower() in context.lower()) / len(answer.split())
    
# Retrieve documents and generate an answer with context
def rag_query(question, k = 2):
    docs = vectorstore.similarity_search(question, k=k)
    context = "\n".join([doc.page_content for doc in docs])
    answer = rag_chain.invoke({"context": context, "question": question}).strip()
    return answer, context


def krag_query(question, k=5):
    # Embed the question and retrieve top-k triples from FAISS
    q_embed = embedder.encode([question])
    _, I = index.search(np.array(q_embed), k)
    
    # Get the matching triples
    retrieved_triples = [triple_tuples[i] for i in I[0]]
    triples_text = "\n".join(format_triples(retrieved_triples))

    # Retrieve vectorstore context as usual
    context = "\n".join([doc.page_content for doc in vectorstore.similarity_search(question, k=2)])

    # Run the LLM with context and embedding-retrieved triples
    # print("Ground Truth Triples:", retrieved_triples)
    answer = krag_chain.invoke({"context": context, "triples": triples_text, "question": question}).strip()

    return answer, retrieved_triples



In [85]:
# Generate sample questions based on entities in KG 
sample_entities = list(G.nodes())[:10]

questions = [f"What does {entity} do?" for entity in sample_entities]

# Optionally preview them
for q in questions:
    print("-", q)


- What does Ltd. do?
- What does stake do?
- What does fund do?
- What does shares do?
- What does % do?
- What does MD do?
- What does position do?
- What does LLC do?
- What does Bank do?
- What does Management do?


In [86]:
for q in questions:
    print(f"Question: {q}")

    rag_answer, rag_context = rag_query(q)
    krag_answer, ground_truth = krag_query(q)

    krag_recall = score_recall_KRAG(ground_truth, krag_answer)
    rag_recall = score_recall_RAG(rag_context, rag_answer)

    # print(f"  RAG Answer:  {rag_answer}")
    # print(f"  kRAG Answer: {krag_answer}")
    print(f"  KRAG Recall from triples: {krag_recall:.2f}\n")
    print(f"  RAG Recall from context: {rag_recall:.2f}\n")

Question: What does Ltd. do?
Triple Count: 5
number of words in context also in answer:  25
  KRAG Recall from triples: 0.33

  RAG Recall from context: 0.78

Question: What does stake do?
Triple Count: 9
number of words in context also in answer:  62
  KRAG Recall from triples: 0.60

  RAG Recall from context: 0.65

Question: What does fund do?
Triple Count: 8
number of words in context also in answer:  48
  KRAG Recall from triples: 0.53

  RAG Recall from context: 0.65

Question: What does shares do?
Triple Count: 11
number of words in context also in answer:  54
  KRAG Recall from triples: 0.73

  RAG Recall from context: 0.65

Question: What does % do?
Triple Count: 15
number of words in context also in answer:  74
  KRAG Recall from triples: 1.00

  RAG Recall from context: 0.80

Question: What does MD do?
Triple Count: 5
number of words in context also in answer:  33
  KRAG Recall from triples: 0.33

  RAG Recall from context: 0.47

Question: What does position do?
Triple Count:

### So now a good thing to graph would be how the triples found in ground truth increases the KRAG recall and how the number of words in context that are also in answer increase the RAG recall to see the influence of both 

### We can see here that 5 triples found in ground truth does not increase KRAG recall above RAG recall by a significant margin, but when we find 9 triples in ground truth we near RAG recall with 60%