In [79]:
#from langchain_community.embeddings.bedrock import BedrockEmbeddings

In [4]:
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
import hashlib
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings
import random
from datasets import load_dataset
from langchain.schema import Document
import time



In [5]:
ds = load_dataset("neural-bridge/rag-dataset-1200")

In [6]:

DATA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/data"

CHROMA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/database"  

#os.makedirs(CHROMA_PATH)

def get_embedding_function():
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return embeddings

In [7]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def calculate_chunk_ids(chunks):
    for chunk in chunks:
        source = chunk.metadata.get("source", "unknown_source")
        page = chunk.metadata.get("page", "unknown_page")
        content_hash = hashlib.md5(chunk.page_content.encode("utf-8")).hexdigest()
        
        # Create a unique ID using source, page, and a hash of the content
        chunk_id = f"{source}:{page}:{content_hash}"
        chunk.metadata["id"] = chunk_id

    return chunks

def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

def generate_unique_id(chunk):
    # Use current timestamp (in milliseconds) for uniqueness
    timestamp = str(int(time.time() * 1000))  # Current timestamp in milliseconds
    
    # Generate a random number to ensure uniqueness
    random_number = random.randint(100000, 999999)

    # use document content or metadata to make the ID more unique
    content_hash = hashlib.md5(chunk.page_content.encode('utf-8')).hexdigest()[:8]  # First 8 chars of MD5 hash
    
    # Create a unique ID combining the timestamp and content hash
    unique_id = f"{timestamp}_{content_hash}_{random_number}"
    
    return unique_id


def add_doc_to_chroma(chunks):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )
    chunks_with_ids = calculate_chunk_ids(chunks)

    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
        if new_chunks:
            db.add_documents(new_chunks, ids=new_chunk_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")

def check_num_in_DB(path):
    try:
        # Load the existing database
        db = Chroma(persist_directory=path, embedding_function=get_embedding_function())
        existing_items = db.get(include=[])  # Fetch existing document metadata (IDs are always included)
        num_existing = len(existing_items["ids"])
        print(f"📊 Number of existing documents in DB: {num_existing}")
        return num_existing
    except Exception as e:
        print(f"❌ Error checking existing documents: {e}")
        return 0

#for context from DataSet
def get_context(ds):
    train_chunks = []
    test_chunks = []
    for context in ds["train"]["context"]:
        # Convert context string to Document object
        document = Document(page_content=context)
        # Split the document into smaller chunks
        chunked_documents = split_documents([document])
        # Add the chunked documents to the train_chunks list
        train_chunks.extend(chunked_documents)

    # Iterate over the test context data
    for context in ds["test"]["context"]:
        # Convert context string to Document object
        document = Document(page_content=context)
        # Split the document into smaller chunks
        chunked_documents = split_documents([document])
        # Add the chunked documents to the test_chunks list
        test_chunks.extend(chunked_documents)

    return train_chunks, test_chunks

def add_context_to_chroma(chunks):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )
    
    # Assign unique IDs to chunks
    for chunk in chunks:
        chunk.metadata["id"] = generate_unique_id(chunk)
    
    # Check if the number of new chunks is equal to the number of generated IDs
    new_chunk_ids = [chunk.metadata["id"] for chunk in chunks]
    
    print(f"Number of chunks: {len(chunks)}")
    print(f"Number of chunk IDs: {len(new_chunk_ids)}")
    
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        batch_size = 5000
        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
            batch_ids = [chunk.metadata["id"] for chunk in batch]
            print(f"Adding batch with {len(batch)} documents and {len(batch_ids)} IDs")
            db.add_documents(batch, ids=batch_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")

"""def add_context_to_chroma(chunks):
    db = Chroma(
            persist_directory=CHROMA_PATH,
            embedding_function=get_embedding_function()
        )
    #chunks_with_ids = calculate_chunk_ids(chunks)
    for chunk in chunks:
        chunk.metadata["id"] = generate_unique_id(chunk)  

    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        batch_size = 5000
        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
            new_chunk_ids = [chunk.metadata["id"] for chunk in batch]
            db.add_documents(batch, ids=new_chunk_ids)
            db.persist()
        if new_chunks:
            db.add_documents(new_chunks, ids=new_chunk_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")"""




'def add_context_to_chroma(chunks):\n    db = Chroma(\n            persist_directory=CHROMA_PATH,\n            embedding_function=get_embedding_function()\n        )\n    #chunks_with_ids = calculate_chunk_ids(chunks)\n    for chunk in chunks:\n        chunk.metadata["id"] = generate_unique_id(chunk)  \n\n    existing_items = db.get(include=[])\n    existing_ids = set(existing_items["ids"])\n    print(f"Number of existing documents in DB: {len(existing_ids)}")\n\n    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]\n\n    if len(new_chunks):\n        print(f"👉 Adding new documents: {len(new_chunks)}")\n        batch_size = 5000\n        for i in range(0, len(new_chunks), batch_size):\n            batch = new_chunks[i:i + batch_size]\n            new_chunk_ids = [chunk.metadata["id"] for chunk in batch]\n            db.add_documents(batch, ids=new_chunk_ids)\n            db.persist()\n        if new_chunks:\n            db.add_documents(new_chunks, id

In [12]:
#documents = load_documents()
#chunks = split_documents(documents)
#add_to_chroma(chunks)
#train_chunk, test_chunk = get_context(ds)
#add_context_to_chroma(train_chunk)
#add_context_to_chroma(test_chunk)
#check_num_in_DB(CHROMA_PATH)



Number of chunks: 5688
Number of chunk IDs: 5688
Number of existing documents in DB: 5688
👉 Adding new documents: 5688
Adding batch with 5000 documents and 5000 IDs
Adding batch with 688 documents and 688 IDs
✅ Documents successfully added and persisted.
Number of chunks: 1450
Number of chunk IDs: 1450
Number of existing documents in DB: 11376
👉 Adding new documents: 1450
Adding batch with 1450 documents and 1450 IDs
✅ Documents successfully added and persisted.


In [8]:

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [9]:
#print(ds)

In [11]:
#print(ds)
n_questions = 20
random.seed(11)
questions = []
for question in ds["train"]["question"]:
    questions.append(question)
random.shuffle(questions)
questions = questions[:n_questions]
#for question in questions:
    #print(question)

answers_llama = []
answers_qwen = []

"""for question in questions:
    answers_llama.append({
        "question": question,
        "answer_rag": query_rag(question),
        "answer_without": query_without_context(question)
    })"""

for question in questions:
    answers_qwen.append({
        "question": question,
        "answer_rag": query_rag(question),
        "answer_without": query_without_context(question)
    })

  embeddings = OllamaEmbeddings(model="nomic-embed-text")
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
  model = Ollama(model="qwen")


Response: The KI Work Environment and Health Award consists of a certificate and 30 000 SEK, to use for further joint activities in the workplace to promote health at work. Co--workers and students at KI are invited to nominate candidates between 22 January – 9 April 2018.
Sources: ['1735599886871_6710c5b0_373625', '1735600510549_6710c5b0_190111', '1735599886871_2f168635_929503', '1735600510549_2f168635_774676', '1735599886875_8c5880f4_102845']
Response: The ideal spin serve in table tennis will bounce twice on the opponent's side of the table, with the second bounce as close to the endline as possible.
Sources: ['1735599886872_b9881eb2_173864', '1735600510550_b9881eb2_484603', '1735599886872_ec1f206f_250350', '1735600510550_ec1f206f_960435', '1735600510550_53a92d81_380577']
Response: The three bat options mentioned for the PlayersGrade cricket bat sticker design are:

1) Player Edition
2) Signature
3) Reserve
Sources: ['1735599886873_626265b7_531222', '1735600510550_626265b7_837064', 

In [23]:
for item in answers:
    print(item)

{'question': 'What does the KI Work Environment and Health Award consist of and who can be nominated for it?', 'answer_rag': 'According to the provided context, the KI Work Environment and Health Award consists of a certificate and 30,000 SEK. The award winner may be an employee/co-worker, a manager, a team, a workplace, or a department/equivalent.', 'answer_without': 'The KI Work Environment and Health Award is an annual award presented by the Kinexus Industry Association (KI) to recognize organizations that demonstrate exceptional commitment to creating a safe, healthy, and sustainable work environment. The award acknowledges companies that go above and beyond in maintaining a positive and supportive workplace culture.\n\nHere are the key components of the KI Work Environment and Health Award:\n\n1. **Work Environment**: Organizations are recognized for their efforts in promoting a safe, healthy, and inclusive work environment.\n2. **Health**: The award takes into account initiatives

In [80]:
#print(answers)

In [10]:
def query_rag(query_text: str):
    # get the DB.
    embedding_function = get_embedding_function()  
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    # Build context text from the top results
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    
    # Format the prompt using the context and the query
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)  
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Generate the model's response
    #model = Ollama(model="llama3")
    model = Ollama(model="qwen")  
    response_text = model.invoke(prompt)

    # Get source document IDs
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    
    # Format the response for output
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    
    # Output the response
    print(formatted_response)
    
    return response_text

def query_without_context(query_text: str):
    model = Ollama(model="llama3") 

    # Generate the response from the model, without sending any context
    response_text = model.invoke(query_text)

    return response_text

In [10]:
query_text = "How does defeasible reasoning differ from deductive reasoning?"
response_rag = query_rag(query_text)
#print(response_rag)

  model = Ollama(model="llama3")


Response: According to the provided context, defeasible reasoning differs from deductive reasoning in that it involves inferences that are not necessarily true or certain, but rather probable, plausible, or typical. In other words, defeasible reasoning allows for exceptions and retraction of conclusions based on new information, whereas deductive reasoning is a strict inference rule where the conclusion follows necessarily and without exception.

In particular, deductive rules are represented by "A1,...,An → B", which means that if A1,...,An are true, then B must be true with no exceptions. On the other hand, defeasible rules are represented by "A1,...,An ⇒ B", which means that if A1,...,An are true, then probably or typically B is true, but there may be exceptional circumstances where this conclusion does not hold.

In summary, defeasible reasoning is concerned with inferences that are not necessarily certain, whereas deductive reasoning is concerned with strict and exception-free con

In [68]:
response_without = query_without_context(query_text)
print(response_without)

Defeasible reasoning and deductive reasoning are both forms of logical reasoning, but they differ in their approach to drawing conclusions.

**Deductive Reasoning:**

In deductive reasoning, a conclusion follows necessarily and with absolute certainty from the premises. The inference is based on a strict set of rules or axioms that guarantee the validity of the argument. Deductive arguments are:

1. Universally applicable
2. Based on formal logic
3. Yield a certain conclusion

Examples: All humans are mortal, Socrates is human, ∴ Socrates is mortal.

**Defeasible Reasoning:**

In defeasible reasoning, a conclusion is drawn from the premises, but it may not be absolutely certain or universally applicable. Defeasibility means that the argument can be defeated or overruled by additional information or alternative perspectives. Defeasible arguments are:

1. Context-dependent
2. Based on probabilistic or inductive logic
3. Yield a plausible or likely conclusion, but not necessarily definiti