In [None]:
#Import all required libraries
!pip install PyPDF2
!pip install nltk
!pip install faiss-cpu
!pip install rouge_score
!pip install bert-score
from PyPDF2 import PdfReader
import os
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
import pandas as pd
import re
from transformers import pipeline
from rouge_score import rouge_scorer
from bert_score import BERTScorer
import csv

In [None]:
def extract_text_from_pdfs(path):
    """
    Objective: Read all pdf files in the folder
    Input: Path of the folder
    Output: text from all the pdf files (type: list of dictionaries) """
    text_data = []
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(path, file)
            reader = PdfReader(pdf_path)
            text = "".join([page.extract_text() for page in reader.pages])
            text_data.append({"filename": file, "text": text})
    return text_data

pdf_text_data = extract_text_from_pdfs("data")
print(f"Number of PDFs:{len(pdf_text_data)}")


Number of PDFs:4


In [41]:
def chunk_text(text_data, max_length=200):
    """
    Objective: To turn the text data from each document into chunks.
    input: text_data (list of documents), max_length -> maximum length of our chunks.
    output: text chunks (type:list of dictionaries)"""

    chunks = []
    for doc in text_data:
        sentences = sent_tokenize(doc["text"])
        chunk = ""
        for sentence in sentences:
            if len(chunk) + len(sentence) >= max_length:
                chunks.append({"filename": doc["filename"], "text": chunk})
                chunk = ""
            chunk += sentence + ". "
        chunks.append({"filename": doc["filename"], "text": chunk})
    return chunks

chunked_data = chunk_text(pdf_text_data)
print(f"Number of chunks:{len(chunked_data)}")


Number of chunks:1454


In [42]:
# Initialize an embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings(chunked_data):
    """
    Objective: Turn each chunk into embeddings (vectors)
    Input: Chunked text data
    Output: Embdeddings for each chunk (type: list of dictionaries)"""
    embeddings = []
    for chunk in chunked_data:
        embedding = model.encode(chunk["text"])
        embeddings.append({
            "filename": chunk["filename"],
            "text": chunk["text"],
            "embedding": embedding
        })
    return embeddings

embeddings = get_embeddings(chunked_data)
print(f"Number of embeddings:{len(embeddings)}")



Number of embeddings:1454


In [43]:
# Adding index for each embedding for fast retrieval

# Initialize FAISS index
dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

# Convert embeddings to numpy array and add to index
vectors = np.array([embedding["embedding"] for embedding in embeddings])
index.add(vectors)
print(f"Number of vectors in index:{index.ntotal}")

metadata = []
for embedding in embeddings:
    metadata_item = {key: embedding[key] for key in embedding if key != "embedding"}
    metadata.append(metadata_item)


Number of vectors in index:1454


In [None]:
questions = ['Who is Tenali RamaKrishna?',
'How did Tenali RamaKrishan Died?',
'What is the moral of the story Tenali RamaKrishana and Foolish Thieves',
'What is the moral of the story Tenali RamaKrishana and The Three Dolls',
'What is TenaliRamaKrishna Famous for?',
'Name some works by Tenali RamaKrishana',
'Who is the author of Adventures of Pinocchio',
'How many Chapters are there in Adventures of Pinocchio',
'Why does Pinocchio nose grow when he lies?',
'What makes Pinocchio decide to rescue Geppetto from the whale?',
'What promise does Pinocchio make to the Blue Fairy?',
'Who creates Pinocchio?',
'Who is Pinocchio’s father?',
'Why does the Blue Fairy help Pinocchio throughout his journey?',
'What causes Pinocchio to run away from school?',
'How does Geppetto show his love for Pinocchio?',
'Instead of going to school where does Pinocchio go?',
'How many coins does Pinocchio bury in the field?',
'Who swallows Geppetto in the sea?',
'In Toyland what is Pinocchio turned into?',
'What happens at the end of the story?',
'Who was Pinocchio?',
'What was Pinocchio made of?', 'Where does the story of Pinocchio come from?',
'On his way to school Pinocchio meets two strangers who lead him astray. Who are they?',
'What happens each time Pinocchio tell s a lie?', 'Who was Pinocchio’s conscience?']

In [45]:
reference_answers = [
'Tenali Ramakrishna was a Telugu poet, scholar, and advisor in the court of Sri Krishnadevaraya of the Vijayanagara Empire.' ,
'Died in 1528 reportedly due to a snakebite.',
'Moral: We can overcome any trouble if we do not lose our cool and put our brains to work.',
'Moral:The personality  of the person reflects through his thoughts. If you are crooked, your thoughts will be crooked and if you are straight and fair, your thoughts will be likewise.', 'Tenal RamaKrishna is famous for his wit and humour',
'His notable work, Panduranga Mahatmyam, is considered a great Telugu Kavya. He also composed Udbhataradhya Charitamu, showcasing his deep connection to Shaivism.',
'Author:Carlo Collodi',
'36 chapters',
'Pinocchios nose grows as a punishment and reminder of the consequences of dishonesty.',
'Pinocchio realizes how much he loves and wants to protect Geppetto',
'Pinocchio promises to be good, go to school, and take responsibility for his actions.',
'Geppetto, a kind and lonely woodcarver, creates Pinocchio from a piece of wood.',
'Geppetto is Pinocchio’s father.',
'The Blue Fairy believes in Pinocchio’s potential for goodness and growth.',
'He is tempted by promises of fun and excitement, which lead him astray',
'Geppetto sacrifices his own comfort and safety, even risking his life, for Pinocchio’s well-being.',
'A Puppet Show', 'Four', 'A Shark', 'A Donkey', 'Pinocchio’s wish comes true and he becomes a real boy.',
'A Puppet', 'Wood', 'Italy', 'Fox and Cat', 'His nose grows', 'Jiminy Cricket']

Appraoch 1: Using Just RAG

In [46]:
def retrieve_answer(question, index, embeddings, metadata, top_k=2):
    """
    Objective: retrieve the most relevant chunks based on the question
    Input: question, index, embeddings, metadata, top_k -> number of chunks to retrieve
    Output: answers [list of top k answers]"""

    question_embedding = model.encode([question])
    scores, indices = index.search(np.array(question_embedding), top_k)

    answers = []
    for idx in indices[0]:
        answers.append(metadata[idx]["text"])
    return answers


pure_rag_results = []
for question in questions:
    answers = retrieve_answer(question, index, embeddings, metadata)
    formatted_answer =  " ".join(answers).strip()
    final_answer = formatted_answer.replace("..", ".").replace("\n", " ")
    pure_rag_results.append({
        'Question': question,
        'Answer': final_answer
    })

print(pure_rag_results)

[{'Question': 'Who is Tenali RamaKrishna?', 'Answer': 'Tenali Rama  Krishna   Tenali Ramakrishna  (22 September 1480 – 5 August 1528) was a  Telugu  poet, scholar,  and advisor in the court of  Sri Krishnadevaraya  of the  Vijayanagara Empire .  Early life:   Tenali Ramakrishna was born in Tenali in a Telugu -speaking Brahmin family. Despite lack of formal education, his thirst for knowledge led him to become a renowned  scholar.'}, {'Question': 'How did Tenali RamaKrishan Died?', 'Answer': 'At the end of three days,  none  of the ministers could figure  out the difference . The king got worried and went to visit Tenali Rama . He said to him, “Tenali it is a  matter of our honour now .  Tenali Rama  Krishna   Tenali Ramakrishna  (22 September 1480 – 5 August 1528) was a  Telugu  poet, scholar,  and advisor in the court of  Sri Krishnadevaraya  of the  Vijayanagara Empire .'}, {'Question': 'What is the moral of the story Tenali RamaKrishana and Foolish Thieves', 'Answer': 'Tenali Rama a

Approach 2: Generating refined responses using LLM with RAG answers as input



In [47]:
# Store results for RAG + LLM with Context Chunks

# Initialize your LLM
llm_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
rag_llm_context_results = []
for question in questions:
    context_chunks = retrieve_answer(question, index, embeddings, metadata)
    context = " ".join(context_chunks).strip()
    prompt = f"""
    Answer the question in a complete sentence based on the given context:\nQ: {question} \n
    Context: {context} A:\n
    Make sure the answer is well-structured."""

    response = llm_pipeline(prompt, max_length=500, num_return_sequences=1)
    response = response[0]["generated_text"].strip()

    rag_llm_context_results.append({
        'question': question,
        'answer': response
    })
print(rag_llm_context_results[1])

{'question': 'How did Tenali RamaKrishan Died?', 'answer': '5 August 1528'}


In [48]:
#Structure the answers into a list
retrieve_answers_rag = [result['Answer'] for result in pure_rag_results]
retrieve_answers_llm = [result['answer'] for result in rag_llm_context_results]
print(retrieve_answers_rag)
print(retrieve_answers_llm)

['Tenali Rama  Krishna   Tenali Ramakrishna  (22 September 1480 – 5 August 1528) was a  Telugu  poet, scholar,  and advisor in the court of  Sri Krishnadevaraya  of the  Vijayanagara Empire .  Early life:   Tenali Ramakrishna was born in Tenali in a Telugu -speaking Brahmin family. Despite lack of formal education, his thirst for knowledge led him to become a renowned  scholar.', 'At the end of three days,  none  of the ministers could figure  out the difference . The king got worried and went to visit Tenali Rama . He said to him, “Tenali it is a  matter of our honour now .  Tenali Rama  Krishna   Tenali Ramakrishna  (22 September 1480 – 5 August 1528) was a  Telugu  poet, scholar,  and advisor in the court of  Sri Krishnadevaraya  of the  Vijayanagara Empire .', 'Tenali Rama and The Foolish thieves   A gang of thieves was ransacking every house in the Vijayanagara  kingdom. The  city chief was troubled as his guards were unable to catch the thieves.  The story of how the smart Tenali

In [None]:
#Evaluate the metrics
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
semantic_similarities = []


def semantic_similarity_score(reference, generated, model):
    """
    Objective: measure semantic similarity
    Input: model, reference answers and the generated answers(rag/llm)
    Output: semantic score (float)"""
    ref_embedding = model.encode(reference, convert_to_tensor=True)
    gen_embedding = model.encode(generated, convert_to_tensor=True)
    return util.pytorch_cos_sim(ref_embedding, gen_embedding).item()

def calculate_bert_score(reference, generated):
    """
    Objective: measure bert score
    Input: reference answers and the generated answers(rag/llm)
    Output: Dictionary containing precision, recall and f1 score in float"""
    scorer = BERTScorer(model_type='bert-base-uncased')
    P, R, F1 = scorer.score([generated], [reference])
    return {
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }

def evaluate_metrics(questions, reference_answers, generated_answers, topic):
    """
    Objective: Evaluate/Measure all Semantic, Bert and Rouge-l score
    Input: questions, reference answers and the generated answers and topic(rag/llm)
    Output: bert_scores, rouge_l_scores, semantic_similarities score (type: lists)"""
    
    bert_scores = []
    rouge_l_scores = []
    semantic_similarities = []
    # Iterating through each question-answer pair to calculate metrics
    for question, reference, generated in zip(questions, reference_answers, generated_answers):

        # Semantic Similarity
        similarity = semantic_similarity_score(reference, generated, semantic_model)
        semantic_similarities.append(similarity)

        #rouge_l_score
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        l_score = scorer.score(reference, generated)['rougeL']
        rouge_l_scores.append(l_score)

        #bert_score
        score = calculate_bert_score(reference, generated)
        bert_scores.append(score)

    if topic == "rag":
      print("Metrics for RAG:")

    else:
      print("Metrics for RAG + LLM with Context Chunks:")

    print("Semantic Similarities:", semantic_similarities)
    print("ROUGE-L Score:", rouge_l_scores)
    print("BERT Score:", bert_scores)

    return semantic_similarities, rouge_l_scores, bert_scores

semantic_similarities_rag, rouge_l_score_rag, bert_score_rag = evaluate_metrics(questions, reference_answers, retrieve_answers_rag, topic="rag")
semantic_similarities_llm, rouge_l_score_llm, bert_score_llm = evaluate_metrics(questions, reference_answers, retrieve_answers_llm, topic="llm_context")


In [None]:
def semantic_values(semantic_similarities):
    """
    Objective: Calculate Max, Min and average semantic score for analysis"""
    max_semantic_similarity = round(max(semantic_similarities), 4)
    min_semantic_similarity = round(min(semantic_similarities), 4)
    avg_semantic_similarity = round(sum(semantic_similarities) / len(semantic_similarities), 4)
    return max_semantic_similarity, min_semantic_similarity, avg_semantic_similarity

max_semantic_similarity_rag, min_semantic_similarity_rag, avg_semantic_similarity_rag = semantic_values(semantic_similarities_rag)
max_semantic_similarity_llm, min_semantic_similarity_llm, avg_semantic_similarity_llm = semantic_values(semantic_similarities_llm)

def rouge_l_f1_score(rouge_l_score):
    """
    Objective: Calculate Max, Min and average Rouge_l F1 score for analysis"""
    f_measures = [score.fmeasure for score in rouge_l_score]
    max_rouge_l_f1 = round(max(f_measures), 4)
    min_rouge_l_f1 = round(min(f_measures), 4)
    avg_rouge_l_f1 = round(sum(f_measures) / len(f_measures), 4)
    return max_rouge_l_f1, min_rouge_l_f1, avg_rouge_l_f1

max_rouge_l_f1_rag, min_rouge_l_f1_rag, avg_rouge_l_f1_rag = rouge_l_f1_score(rouge_l_score_rag)
max_rouge_l_f1_llm, min_rouge_l_f1_llm, avg_rouge_l_f1_llm = rouge_l_f1_score(rouge_l_score_llm)

def bert_precision_score(bert_scores):
    """
    Objective: Calculate Max, Min and average BERT Precision score for analysis"""
    precisions = [score['precision'] for score in bert_scores]
    max_precision, min_precision, avg_precision = round(max(precisions), 4), round(min(precisions), 4), round(sum(precisions) / len(precisions), 4)
    return max_precision, min_precision, avg_precision

max_bert_precision_rag, min_bert_precision_rag, avg_bert_precision_rag = bert_precision_score(bert_score_rag)
max_bert_precision_llm, min_bert_precision_llm, avg_bert_precision_llm = bert_precision_score(bert_score_llm)

def bert_recall_score(bert_scores):
    """
    Objective: Calculate Max, Min and average BERT Recall score for analysis"""
    recalls = [score['recall'] for score in bert_scores]
    max_recall, min_recall, avg_recall = round(max(recalls), 4), round(min(recalls), 4), round(sum(recalls) / len(recalls), 4)
    return max_recall, min_recall, avg_recall

max_bert_recall_rag, min_bert_recall_rag, avg_bert_recall_rag = bert_recall_score(bert_score_rag)
max_bert_recall_llm, min_bert_recall_llm, avg_bert_recall_llm = bert_recall_score(bert_score_llm)

def bert_f1_score(bert_scores):
    """
    Objective: Calculate Max, Min and average BERT F1 score for analysis"""
    f1s = [score['f1'] for score in bert_scores]
    max_f1, min_f1, avg_f1 = round(max(f1s), 4), round(min(f1s), 4), round(sum(f1s) / len(f1s), 4)
    return max_f1, min_f1, avg_f1
    
max_bert_f1_rag, min_bert_f1_rag, avg_bert_f1_rag = bert_f1_score(bert_score_rag)
max_bert_f1_llm, min_bert_f1_llm, avg_bert_f1_llm = bert_f1_score(bert_score_llm)

data = {
    "Method": ["RAG", "LLM"],
    "Semantic Similarity": [f"{max_semantic_similarity_rag} | {min_semantic_similarity_rag}",
                            f"{max_semantic_similarity_llm} | {min_semantic_similarity_llm}"],
    "Semantic Similarity avg":[f"{avg_semantic_similarity_rag}",
                         f"{avg_semantic_similarity_llm}"],
    "ROUGE-L F1 Score": [f"{max_rouge_l_f1_rag} | {min_rouge_l_f1_rag}",
                         f"{max_rouge_l_f1_llm} | {min_rouge_l_f1_llm}"],
    "ROUGE-L F1 avg":[f"{avg_rouge_l_f1_rag}",
                         f"{avg_rouge_l_f1_llm}"],
    "BERT Precision": [f"{max_bert_precision_rag} | {min_bert_precision_rag}",
                      f"{max_bert_precision_llm} | {min_bert_precision_llm}"],
    "BERT Precision avg":[f"{avg_bert_precision_rag}",
                         f"{avg_bert_precision_llm}"],
    "BERT Recall": [f"{max_bert_recall_rag} | {min_bert_recall_rag}",
                    f"{max_bert_recall_llm} | {min_bert_recall_llm}"],
    "BERT Recall avg":[f"{avg_bert_recall_rag}",
                         f"{avg_bert_recall_llm}"],
    "BERT F1 Score": [f"{max_bert_f1_rag} | {min_bert_f1_rag}",
                      f"{max_bert_f1_llm} | {min_bert_f1_llm}"],
    "BERT F1 avg":[f"{avg_bert_f1_rag}",
                         f"{avg_bert_f1_llm}"]
}

metrics_df = pd.DataFrame(data)


print(metrics_df)

# Save to a CSV file
metrics_df.to_csv("evaluation_metrics.csv", index=False)

In [None]:
#Data Prep for Human Evaluation
#Compare the reference answers and generated answer and label them
#Check for wrong answers and check if there are any patterns.
#analyse and suggestions for improvements.

with open('results/rag_results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Question', 'Reference Answer', 'RAG Answer'])
    for question, ref_answer, rag_answer in zip(questions, reference_answers, retrieve_answers_rag):
        writer.writerow([question, ref_answer, rag_answer])


with open('results/llm_results.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Question', 'Reference Answer', 'LLM Answer'])
    for question, ref_answer, llm_answer in zip(questions, reference_answers, retrieve_answers_llm):
        writer.writerow([question, ref_answer, llm_answer])


In [None]:
#Inferential Statistics -Bootstrap Sampling

def bootstrap_sampling_bert_score(bert_scores,type, num_samples=1000, seed=42):
    """
    Objective: Perform Bootstrap Ananlysis using Bert Scores"""
    np.random.seed(seed)

    precision_scores = []
    recall_scores = []
    f1_scores = []

    num_examples = len(bert_scores)

    # Calculate BERTScore for bootstrap samples
    for _ in range(num_samples):
        indices = np.random.choice(range(num_examples), size=num_examples, replace=True)
        sampled_precision = [bert_scores[i]["precision"] for i in indices]
        sampled_recall = [bert_scores[i]["recall"] for i in indices]
        sampled_f1 = [bert_scores[i]["f1"] for i in indices]

        precision_scores.append(np.mean(sampled_precision))
        recall_scores.append(np.mean(sampled_recall))
        f1_scores.append(np.mean(sampled_f1))

    # Calculate confidence intervals
    lower_percentile = 2.5
    upper_percentile = 97.5

    precision_conf_int = (np.percentile(precision_scores, lower_percentile), np.percentile(precision_scores, upper_percentile))
    recall_conf_int = (np.percentile(recall_scores, lower_percentile), np.percentile(recall_scores, upper_percentile))
    f1_conf_int = (np.percentile(f1_scores, lower_percentile), np.percentile(f1_scores, upper_percentile))

    # Create a DataFrame to save results
    results_df = pd.DataFrame({
        'Metric': ['Precision', 'Recall', 'F1 Score'],
        'Mean Score': [np.mean(precision_scores), np.mean(recall_scores), np.mean(f1_scores)],
        'Confidence Interval Lower': [precision_conf_int[0], recall_conf_int[0], f1_conf_int[0]],
        'Confidence Interval Upper': [precision_conf_int[1], recall_conf_int[1], f1_conf_int[1]],
    })

    if type == 'rag':  
        results_df.to_csv('results/bert_bootstrap_result_rag.csv', index=False)
    else:
        results_df.to_csv('results/bert_bootstrap_result_llm.csv', index=False)
    return precision_scores, recall_scores, f1_scores, precision_conf_int, recall_conf_int, f1_conf_int

precision_scores_rag, recall_scores_rag, f1_scores_rag, precision_conf_int_rag, recall_conf_int_rag, f1_conf_int_rag = bootstrap_sampling_bert_score(bert_score_rag, type='rag')

precision_scores_llm, recall_scores_llm, f1_scores_llm, precision_conf_int_llm, recall_conf_int_llm, f1_conf_int_llm = bootstrap_sampling_bert_score(bert_score_llm, type='llm')

