In [79]:
#from langchain_community.embeddings.bedrock import BedrockEmbeddings

In [1]:
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
import hashlib
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings
import random
from datasets import load_dataset
from langchain.schema import Document
import time
from evaluate import load



In [2]:
ds = load_dataset("neural-bridge/rag-dataset-1200")

In [3]:

DATA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/data"

CHROMA_PATH = "/Users/joudi/Measuring_RAG_Effectiveness/database"  

#os.makedirs(CHROMA_PATH)

def get_embedding_function():
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return embeddings

In [4]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def calculate_chunk_ids(chunks):
    for chunk in chunks:
        source = chunk.metadata.get("source", "unknown_source")
        page = chunk.metadata.get("page", "unknown_page")
        content_hash = hashlib.md5(chunk.page_content.encode("utf-8")).hexdigest()
        
        # Create a unique ID using source, page, and a hash of the content
        chunk_id = f"{source}:{page}:{content_hash}"
        chunk.metadata["id"] = chunk_id

    return chunks

def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

def generate_unique_id(chunk):
    # Use current timestamp (in milliseconds) for uniqueness
    timestamp = str(int(time.time() * 1000))  # Current timestamp in milliseconds
    
    # Generate a random number to ensure uniqueness
    random_number = random.randint(100000, 999999)

    # use document content or metadata to make the ID more unique
    content_hash = hashlib.md5(chunk.page_content.encode('utf-8')).hexdigest()[:8]  # First 8 chars of MD5 hash
    
    # Create a unique ID combining the timestamp and content hash
    unique_id = f"{timestamp}_{content_hash}_{random_number}"
    
    return unique_id


def add_doc_to_chroma(chunks):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )
    chunks_with_ids = calculate_chunk_ids(chunks)

    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
        if new_chunks:
            db.add_documents(new_chunks, ids=new_chunk_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")

def check_num_in_DB(path):
    try:
        # Load the existing database
        db = Chroma(persist_directory=path, embedding_function=get_embedding_function())
        existing_items = db.get(include=[])  # Fetch existing document metadata (IDs are always included)
        num_existing = len(existing_items["ids"])
        print(f"📊 Number of existing documents in DB: {num_existing}")
        return num_existing
    except Exception as e:
        print(f"❌ Error checking existing documents: {e}")
        return 0

#for context from DataSet
def get_context(ds):
    train_chunks = []
    test_chunks = []
    for context in ds["train"]["context"]:
        # Convert context string to Document object
        document = Document(page_content=context)
        # Split the document into smaller chunks
        chunked_documents = split_documents([document])
        # Add the chunked documents to the train_chunks list
        train_chunks.extend(chunked_documents)

    # Iterate over the test context data
    for context in ds["test"]["context"]:
        # Convert context string to Document object
        document = Document(page_content=context)
        # Split the document into smaller chunks
        chunked_documents = split_documents([document])
        # Add the chunked documents to the test_chunks list
        test_chunks.extend(chunked_documents)

    return train_chunks, test_chunks

def add_context_to_chroma(chunks):
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function()
    )
    
    # Assign unique IDs to chunks
    for chunk in chunks:
        chunk.metadata["id"] = generate_unique_id(chunk)
    
    # Check if the number of new chunks is equal to the number of generated IDs
    new_chunk_ids = [chunk.metadata["id"] for chunk in chunks]
    
    print(f"Number of chunks: {len(chunks)}")
    print(f"Number of chunk IDs: {len(new_chunk_ids)}")
    
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        batch_size = 5000
        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
            batch_ids = [chunk.metadata["id"] for chunk in batch]
            print(f"Adding batch with {len(batch)} documents and {len(batch_ids)} IDs")
            db.add_documents(batch, ids=batch_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")

"""def add_context_to_chroma(chunks):
    db = Chroma(
            persist_directory=CHROMA_PATH,
            embedding_function=get_embedding_function()
        )
    #chunks_with_ids = calculate_chunk_ids(chunks)
    for chunk in chunks:
        chunk.metadata["id"] = generate_unique_id(chunk)  

    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        batch_size = 5000
        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
            new_chunk_ids = [chunk.metadata["id"] for chunk in batch]
            db.add_documents(batch, ids=new_chunk_ids)
            db.persist()
        if new_chunks:
            db.add_documents(new_chunks, ids=new_chunk_ids)
            db.persist()
        print("✅ Documents successfully added and persisted.")
    else:
        print("✅ No new documents to add")"""




'def add_context_to_chroma(chunks):\n    db = Chroma(\n            persist_directory=CHROMA_PATH,\n            embedding_function=get_embedding_function()\n        )\n    #chunks_with_ids = calculate_chunk_ids(chunks)\n    for chunk in chunks:\n        chunk.metadata["id"] = generate_unique_id(chunk)  \n\n    existing_items = db.get(include=[])\n    existing_ids = set(existing_items["ids"])\n    print(f"Number of existing documents in DB: {len(existing_ids)}")\n\n    new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]\n\n    if len(new_chunks):\n        print(f"👉 Adding new documents: {len(new_chunks)}")\n        batch_size = 5000\n        for i in range(0, len(new_chunks), batch_size):\n            batch = new_chunks[i:i + batch_size]\n            new_chunk_ids = [chunk.metadata["id"] for chunk in batch]\n            db.add_documents(batch, ids=new_chunk_ids)\n            db.persist()\n        if new_chunks:\n            db.add_documents(new_chunks, id

In [5]:
#documents = load_documents()
#chunks = split_documents(documents)
#add_to_chroma(chunks)
#train_chunk, test_chunk = get_context(ds)
#add_context_to_chroma(train_chunk)
#add_context_to_chroma(test_chunk)
check_num_in_DB(CHROMA_PATH)



  embeddings = OllamaEmbeddings(model="nomic-embed-text")
  db = Chroma(persist_directory=path, embedding_function=get_embedding_function())


📊 Number of existing documents in DB: 12826


12826

In [6]:

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [34]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 960
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 240
    })
})


In [101]:
answers = []
n_questions = 20
random.seed(11)

# get the answers provided from the dataset
for answer in ds["train"]["answer"]:
    answers.append(answer)

random.shuffle(answers)
answers = answers[:n_questions]
print(answers)

['The KI Work Environment and Health Award consists of a certificate and 30 000 SEK, to use for further joint activities in the workplace to promote health at work. The prize winner may be an employee/co-worker, a manager, a team, a workplace or a department/equivalent.', "The ideal spin serve will, if given the chance, bounce twice on the opponent's side of the table, with the second bounce as close to the endline as possible.", 'The different bat options mentioned are Player Edition, Signature, and Reserve.', "Some common problems that can damage a roof include the sun's UV rays causing deterioration, snow and ice causing leaks, strong gusts of wind tearing off shingles, overhanging tree limbs rubbing against the roof, birds and small wildlife creating holes, insects chewing at fascia boards and eaves, faulty flashing allowing water to seep in, clogged gutters causing water backup, poorly ventilated attics causing shingles to dry out and become brittle, and lack of roof maintenance l

In [102]:
n_questions = 20
random.seed(11)

# get the questions from dataset
questions = []
for question in ds["train"]["question"]:
    questions.append(question)

    
random.shuffle(questions)
questions = questions[:n_questions]

print(questions)
answers_llama = []
answers_qwen = []


['What does the KI Work Environment and Health Award consist of and who can be nominated for it?', 'What is the ideal spin serve in table tennis?', 'What are the different bat options mentioned for the PlayersGrade cricket bat sticker design?', 'What are some common problems that can damage a roof?', "What was the reason for Gwendolyn Cuff's lawsuit against International Business Machine Company (IBM)?", 'Who inspired the Farmer family to consider adoption from China?', 'What is the consequence of a DUI in terms of car insurance according to Nevada laws?', 'What are some of the attractions and activities available in Salem, Massachusetts?', 'What does Sissyboy_D do with a vacuum cleaner in the video?', 'What are the controls for the game Stellar Interface?', 'What are some steps the NCUA has taken to encourage federally chartered credit unions?', 'What is the fundamental non-technical difference between Premiere Elements and Premiere Pro?', 'What are the benefits of L Glutamine as desc

In [103]:
for question in questions:
    answers_llama.append({
        "question": question,
        "answer_rag": query_rag(question), # check the model used
        "answer_without": query_without_context(question)
    })

In [104]:
for question in questions:
    answers_qwen.append({
        "question": question,
        "answer_rag": query_rag(question), # check the model used
        "answer_without": query_without_context(question)
    })

In [120]:
import csv

# Save answers_qwen to a CSV file for future review
with open('answers_qwen.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["question", "answer_rag", "answer_without"])
    writer.writeheader()
    writer.writerows(answers_qwen)

Data saved to answers_qwen.csv


In [121]:
with open('answers_llama.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["question", "answer_rag", "answer_without"])
    writer.writeheader()
    writer.writerows(answers_llama)

In [98]:
def query_rag(query_text: str):
    # get the DB.
    embedding_function = get_embedding_function()  
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    # Build context text from the top results
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    
    # Format the prompt using the context and the query
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)  
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Generate the model's response
    model = Ollama(model="llama3.2:latest")
    #model = Ollama(model="qwen:1.8b")  
    response_text = model.invoke(prompt)

    # Get source document IDs
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    
    # Format the response for output
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    
    # Output the response
    #print(formatted_response)
    
    return response_text

def query_without_context(query_text: str):
    model = Ollama(model="llama3.2:latest")
    #model = Ollama(model="qwen:1.8b") 

    # Generate the response from the model, without sending any context
    response_text = model.invoke(query_text)

    return response_text

In [33]:
# using metrics for Evaluation
bertscore = load("bertscore")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
# compare the answers of the model with RAG
predictions_qwen_rag = [entry["answer_rag"] for entry in answers_qwen]
results_qwen = bertscore.compute(predictions=predictions_qwen_rag, references=answers, lang="en")
print(f"Precision of Qwen model with context is: {results_qwen['precision']}")
print(f"Average precision of Qwen model with context is: {sum(results_qwen['precision']) / len(results_qwen['precision']):0.3f}")

Precision of Qwen model with context is: [0.9065226316452026, 0.9070388078689575, 0.8869307041168213, 0.8989588022232056, 0.9240614771842957, 0.8878748416900635, 0.8651293516159058, 0.8624709844589233, 0.8369998931884766, 0.8711046576499939, 0.8568368554115295, 0.9193529486656189, 0.8791593313217163, 0.9931979775428772, 0.8557924032211304, 0.8376179337501526, 0.877596378326416, 0.8185430765151978, 0.8468308448791504, 0.9110895395278931]
Average precision of Qwen model with context is: 0.882


In [None]:
# compare the answers of the model with RAG
predictions_llama_rag = [entry["answer_rag"] for entry in answers_llama]
results_llama = bertscore.compute(predictions=predictions_llama_rag, references=answers, lang="en")
print(f"Precision of Llama model with context is: {results_qwen['precision']}")
print(f"Average precision of Llama model with context is: {sum(results_llama['precision']) / len(results_llama['precision']):0.3f}")

Precision of Llama model with context is: [0.9065226316452026, 0.9070388078689575, 0.8869307041168213, 0.8989588022232056, 0.9240614771842957, 0.8878748416900635, 0.8651293516159058, 0.8624709844589233, 0.8369998931884766, 0.8711046576499939, 0.8568368554115295, 0.9193529486656189, 0.8791593313217163, 0.9931979775428772, 0.8557924032211304, 0.8376179337501526, 0.877596378326416, 0.8185430765151978, 0.8468308448791504, 0.9110895395278931]
Average precision of Llama model with context is: 0.881


In [124]:
# compare the answers of the Qwen _without_ RAG
predictions_qwen = [entry["answer_without"] for entry in answers_qwen]
results_qwen = bertscore.compute(predictions=predictions_qwen, references=answers, lang="en")
print(f"Precision of Qwen model without context is: {results_qwen['precision']}")
print(f"Average precision of Qwen model without context is: {sum(results_qwen['precision']) / len(results_qwen['precision']):0.3f}")

Precision of Qwen model without context is: [0.8662627935409546, 0.7969239950180054, 0.812042236328125, 0.8282989859580994, 0.8162297010421753, 0.8394553661346436, 0.7863755226135254, 0.8133270740509033, 0.847653865814209, 0.8154497742652893, 0.8009121417999268, 0.7930828332901001, 0.8174949884414673, 0.8692981004714966, 0.8750858306884766, 0.8297284245491028, 0.8049622774124146, 0.8117737174034119, 0.8218947052955627, 0.8374470472335815]
Average precision of Qwen model without context is: 0.824


In [125]:
# compare the answers of the Llama _without_ RAG
predictions_llama = [entry["answer_without"] for entry in answers_llama]
results_llama = bertscore.compute(predictions=predictions_llama, references=answers, lang="en")
print(f"Precision of Llama model without context is: {results_qwen['precision']}")
print(f"Average precision of Llama model without context is: {sum(results_llama['precision']) / len(results_llama['precision']):0.3f}")

Precision of Llama model without context is: [0.8662627935409546, 0.7969239950180054, 0.812042236328125, 0.8282989859580994, 0.8162297010421753, 0.8394553661346436, 0.7863755226135254, 0.8133270740509033, 0.847653865814209, 0.8154497742652893, 0.8009121417999268, 0.7930828332901001, 0.8174949884414673, 0.8692981004714966, 0.8750858306884766, 0.8297284245491028, 0.8049622774124146, 0.8117737174034119, 0.8218947052955627, 0.8374470472335815]
Average precision of Llama model without context is: 0.825
