In [1]:
!pip install transformers torch sentencepiece pdfminer.six numpy pandas tqdm requests sentence-transformers langchain qdrant-client openai


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 1.5 MB/s eta 0:00:01
Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 2.3 MB/s eta 0:00:01
[?25hCollecting qdrant-client
  Using cached qdrant_client-1.14.2-py3-none-any.whl (327 kB)
Collecting langsmith<0.4,>=0.1.17
  Downloading langsmith-0.3.42-py3-none-any.whl (360 kB)
[K     |████████████████████████████████| 360 kB 1.2 MB/s eta 0:00:01
[?25hCollecting SQLAlchemy<3,>=1.4
  Downloading sqlalchemy-2.0.41-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 1.2 MB/s eta 0:00:01
[?25hCollecting langchain-text-splitters<1.0.0,>=0.3.8
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl (32 kB)
Collecting langchain-core<1.0.0,>=0.3.58
  Downloading langchain_core-0.3.60-py3-none-any.whl (437 kB)
[K     |███████

In [2]:
import numpy as np
import os
import pandas as pd
import re
import time
import os
from pdfminer.high_level import extract_text
import os
import pandas as pd
import re
from datetime import datetime
from tqdm import tqdm
from pathlib import Path
import os
import requests
import logging
from datetime import datetime
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-large")


### **Processing Pdfs Text**

In [4]:
def extract_and_split_pdfs(pdf_folder_path, chunk_size=400, chunk_overlap=50):
    """
    Extract text from PDFs using pdfminer and split using LangChain's splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["۔", "\n", ",", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    all_chunks = []
    filenames = []

    pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf")]
    for filename in pdf_files:
        pdf_path = os.path.join(pdf_folder_path, filename)
        filenames.append(filename)

        # Extract text using pdfminer
        try:
            text = extract_text(pdf_path)
        except Exception as e:
            print(f"Error extracting {filename}: {e}")
            continue

        chunks = splitter.split_text(text)
        all_chunks.extend(chunks)

    return all_chunks, filenames

In [5]:
pdf_folder_path = "urdu_pdfs"
chunks, filenames = extract_and_split_pdfs(pdf_folder_path)
print(f"Extracted {len(chunks)} text chunks total from {len(filenames)} PDF(s).")

Extracted 635 text chunks total from 2 PDF(s).


In [6]:
chunks[0]

'2008\n\n2008\n\n2008\n\n ﮐو ، ﺳرﮐﺎری ﻋﮩدﯾداروں ﻧﮯ ﮨزاروں ﮔرے ﮨوﺋﮯ اﺳﮑوﻟوں ﮐﮯ ﮐﮭﻧڈرات ﮐﺎ ﻣﻌﺎﺋﻧہ ﮐرﻧﺎ ﺷروع ﮐﯾﺎ ، اس \n\n ﮐو ﮔﻠوب اﯾﻧڈ ﻣﯾل ڈاٹ ﮐﺎم ﮐﮯ ﺟﯾﻔری ﯾﺎرک ﻧﮯ اطﻼع دی ﮐہ ﺧراب طرﯾﻘﮯ ﺳﮯ ﺗﻌﻣﯾر ﺷده ﻋﻣﺎرﺗوں ﮐو \n\n29 ﻣﺋﯽ \nﺑﺎرے ﻣﯾں اﺷﺎرے ﺗﻼش ﮐرﻧﮯ ﮐﮯ ﻟﺋﮯ ﮐہ وه ﮐﯾوں ﮔر ﮔﺋﮯ'

## **Creating and Storing Embeddings of Documents in Qdrant DB**

In [7]:
document_embeddings = model.encode(chunks).astype("float32")
print("Embeddings shape:", document_embeddings.shape)

Embeddings shape: (635, 1024)


In [8]:

client = QdrantClient(path="new_local_qdrant_vectordb.db")

collection_name = "docus_chunks"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=document_embeddings.shape[1],  # Dimension from the embedding model
        distance="Cosine"                   # 'Cosine' is typical for sentence embeddings
    )
)

print(f"Collection '{collection_name}' has been created or recreated successfully!")


Collection 'docus_chunks' has been created or recreated successfully!


  client.recreate_collection(


In [9]:
points = []
for i, embedding_vector in enumerate(document_embeddings):
    points.append(
        PointStruct(
            id=i,  # unique ID for each chunk
            vector=embedding_vector.tolist(),
            payload={
                # Store the actual chunk text, or any other metadata you need
                "chunk_text": chunks[i]
            }
        )
    )

# Upsert (insert/update) the chunk embeddings into Qdrant
client.upsert(collection_name=collection_name, points=points)

print(f"Upserted {len(points)} chunk embeddings into Qdrant!")

Upserted 635 chunk embeddings into Qdrant!


## **Creating and Storing Embeddings of Q&As in Qdrant DB**

### **Uploading already Generated Q&As**

In [10]:
import pandas as pd

def load_urdu_qa_pairs(csv_path):
    """
    Loads Urdu QA pairs from a CSV file and returns two lists: questions and answers.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        tuple: (questions, answers), where both are lists of strings.
    """
    df = pd.read_csv(csv_path)
    
    # Ensure the required columns exist
    if 'question' not in df.columns or 'answer' not in df.columns:
        raise ValueError("CSV must contain 'question' and 'answer' columns.")

    questions = df['question'].dropna().tolist()
    answers = df['answer'].dropna().tolist()

    return questions, answers


In [11]:
questions, answers = load_urdu_qa_pairs("urdu_qa_pairs_updated_openai_4o.csv")
print(questions[:5])
print(answers[:5])

['29 مئی 2008 کو سرکاری عہدیداروں نے کیا اقدام کیا؟', 'والدین نے مقامی عہدیداروں اور بلڈرز پر کیا الزام لگایا؟', 'والدین نے زلزلے کے بعد دیگر عمارتوں کے بارے میں کیا کہا؟', 'زلزلے کے بعد سرکاری طور پر کیا وعدہ کیا گیا؟', '17 جولائی 2008 تک والدین کی کیا شکایت تھی؟']
['سرکاری عہدیداروں نے ہزاروں گرے ہوئے اسکولوں کے کھنڈرات کا معائنہ کرنا شروع کیا۔', 'والدین نے الزام لگایا کہ اسکول کی تعمیر میں کونوں کونوں کاٹنے کی وجہ سے اسکول گرے۔', 'والدین نے کہا کہ زلزلے کے بعد قریبی دیگر عمارتوں کو بہت کم نقصان پہنچا تھا۔', 'زلزلے کے بعد بہت سی مقامی حکومتوں نے سرکاری طور پر اسکول کے گرنے کی تحقیقات کرنے کا وعدہ کیا۔', 'والدین نے شکایت کی کہ انہیں ابھی تک کوئی رپورٹ موصول نہیں ہوئی۔']


**Creating Collection in same Local_Qdrant DB**

In [12]:
# Function to create a collection in Qdrant
def create_collection(client, collection_name="faq_embeddings"):
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=1024, distance="Cosine")  # For all-MiniLM-L6-v2
    )

    print(f"Collection '{collection_name}' has been created or recreated successfully!")

### **Generating Embeddings and Storing**

In [13]:
# Function to generate embeddings for a list of questions
def generate_embeddings(questions):
    model = SentenceTransformer("intfloat/multilingual-e5-large")
    embeddings = model.encode(questions)
    return embeddings

In [14]:
# Function to store embeddings and metadata (answers) in Qdrant
def store_embeddings_in_qdrant(client, collection_name, questions, answers, embeddings):
    points = [
        PointStruct(
            id=i,
            vector=embeddings[i].tolist(),  # Convert numpy array to list
            payload={"answer": answers[i]}  # Store metadata (only answers)
        ) for i in range(len(questions))
    ]
    client.upsert(collection_name=collection_name, points=points)


In [15]:

collection_name = "faq_embeddings"
create_collection(client, collection_name)

# Generate embeddings for questions
embeddings = generate_embeddings(questions)

# Store embeddings and answers in Qdrant
store_embeddings_in_qdrant(client, collection_name, questions, answers, embeddings)

Collection 'faq_embeddings' has been created or recreated successfully!


  client.recreate_collection(


# **Finding Semantic Similarity of Query with Pdf Text**

In [16]:
def query_similarity_docus_chunks(client, collection_name, query, top_k=3):
    """
    Search for the top_k most similar chunks in a Qdrant collection
    that stores 'chunk_text' as payload.
    
    Args:
        client: QdrantClient instance connected to the relevant DB file.
        collection_name (str): The name of the collection containing chunk vectors.
        query (str): The user query string.
        top_k (int): Number of top similar chunks to retrieve.

    Returns:
        chunks (List[str]): The retrieved chunk texts.
        scores (List[float]): The similarity scores.
    """
    # Use the same embedding model you used to store the chunks


    model = SentenceTransformer("intfloat/multilingual-e5-large")

    
    query_embedding = model.encode([query])[0].tolist()

    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k
    )

    # Assuming you stored your PDF text in payload["chunk_text"]
    chunks = [result.payload["chunk_text"] for result in search_results]
    scores = [result.score for result in search_results]

    return chunks , scores


In [17]:
doc_collection_name = "docus_chunks" 
query = "19 مئی 2008 کو کون سی تقریب کا اعلان کیا گیا؟"

chunks, chunk_scores = query_similarity_docus_chunks(client, doc_collection_name, query, top_k=3)

print("Top 3 Retrieved Chunks:\n")
for i, (chunk, score) in enumerate(zip(chunks, chunk_scores), 1):
    print(f"Chunk {i} (score={score:.4f}):\n{chunk}\n{'-'*50}")


Top 3 Retrieved Chunks:

Chunk 1 (score=0.8394):
۔ 27 ﻣﺋﯽ ، 
ﭘﺎﺋﯽ ﺗﮭﯾں ﺟو درﯾﺎؤں ﮐو ﻣﺳدود اور ڈﯾم ﮐررﮨﯽ ﺗﮭﯾں ، اور ﯾہ اﻧدازه ﻟﮕﺎﯾﺎ ﮔﯾﺎ ﺗﮭﺎ ﮐہ ان ﻣﯾں ﺳﮯ 28 اب ﺑﮭﯽ ﻣﻘﺎﻣﯽ ﻟوﮔوں 
ﮐﮯ ﻟﺋﮯ ﻣﻣﮑﻧہ ﺧطره ﮨﯾں۔ اس ﮐﮯ ﻧﺗﯾﺟﮯ ﻣﯾں ﺳﯾﻼب ﮐﯽ وﺟہ ﺳﮯ ﻣﮑﻣل دﯾﮩﺎت ﮐو ﺧﺎﻟﯽ ﮐرﻧﺎ ﭘڑا۔ 
2008
رﯾﺎﺳﺗﯽ ﮐوﻧﺳل ﻧﮯ زﻟزﻟﮯ ﮐﮯ ﻣﺗﺎﺛرﯾن ﮐﮯ ﻟﺋﮯ 19 ﻣﺋﯽ 
اﻋﻼن ﮐﯾﺎ
--------------------------------------------------
Chunk 2 (score=0.8253):
۔ ﭼﯾن ﮐﺎ ﻗوﻣﯽ ﺟﮭﻧڈا 
ﺳﭨﯾٹ ﮐوﻧﺳل ﻧﮯ 19 ﻣﺋﯽ 
اور ﮨﺎﻧﮓ ﮐﺎﻧﮓ اور ﻣﮑﺎؤ ﮐﮯ ﺧﺻوﺻﯽ اﻧﺗظﺎﻣﯽ ﻋﻼﻗوں ﮐﮯ ﻋﻼﻗﺎﺋﯽ ﺟﮭﻧڈے آدھﮯ ﻣﺳت ﭘر ﻟﮩراﺋﮯ ﮔﺋﮯ۔ ﯾہ ﭘﮩﻠﯽ ﺑﺎر ﺗﮭﺎ 
ﮐہ ﮐﺳﯽ رﯾﺎﺳﺗﯽ رﮨﻧﻣﺎ ﮐﯽ ﻣوت ﮐﮯ ﻋﻼوه ﮐﺳﯽ اور ﭼﯾز ﮐﮯ ﻟﺋﮯ ﻗوﻣﯽ ﺳوگ ﮐﺎ اﻋﻼن ﮐﯾﺎ ﮔﯾﺎ ﺗﮭﺎ ، اور ﺑﮩت ﺳﮯ ﻟوﮔوں ﻧﮯ 
2008
اﺳﮯ ﻣﺎؤ زے ڈوﻧﮓ ﮐﯽ ﻣوت ﮐﮯ ﺑﻌد ﺳوگ ﮐﺎ ﺳب ﺳﮯ ﺑڑا ﻣظﺎﮨره ﻗرار دﯾﺎ ﮨﮯ
--------------------------------------------------
Chunk 3 (score=0.8207):
۔ ﺑﮩت ﺳﮯ ﻟوﮔوں ﻧﮯ ﻣوﺑﺎﺋل ﻓون ﭘر ﭨﯾﮑﺳٹ ﻣﯾﺳﺟﻧﮓ ﮐﮯ ذرﯾﻌﮯ ﭼﯾن ﯾوﻧﯾﮑوم اور ﭼﺎﺋﻧﺎ 
ﻣوﺑﺎﺋل ﮐﮯ ذرﯾﻌہ ﻗﺎﺋم ﮐرده اﮐﺎؤﻧﭨس ﻣﯾں ﻋطﯾہ ﮐﯾﺎ۔ 16 ﻣﺋﯽ ﺗﮏ ، ﭼﯾﻧﯽ ﺣﮑوﻣت ﻧﮯ زﻟزﻟﮯ ﮐﯽ اﻣداد ﮐﮯ ﻟﺋﮯ اب ﺗﮏ 
ﻣﺟﻣوﻋﯽ طور ﭘر 772 ﻣﻠﯾن ڈاﻟ

  search_results = client.search(


# **Finding Semantic Similarity of Query with Q&As**

In [18]:
# Function to query Qdrant for top-k similar embeddings
def query_similar_embeddings(client, collection_name, query, top_k=5):
    model = SentenceTransformer("intfloat/multilingual-e5-large")

    query_embedding = model.encode([query])[0].tolist()  # Generate query embedding

    # Perform the search to get the most similar vectors
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k  # Number of nearest neighbors to retrieve
    )

    # Retrieve answers from the search results
    answers = [result.payload['answer'] for result in search_results]
    scores = [result.score for result in search_results]

    return answers , scores

In [19]:
# Query similar questions to a given query
query = "19 مئی 2008 کو کون سی تقریب کا اعلان کیا گیا؟"

retrieved_answers , scores = query_similar_embeddings(client, collection_name, query)


# Display retrieved answers
print("Retrieved Answers")
for i, answer in enumerate(retrieved_answers, 1):
    print(f"{i}: {answer}")

  search_results = client.search(


Retrieved Answers
1: اسٹیٹ کونسل نے زلزلے کے متاثرین کے لیے تین روزہ قومی سوگ کا اعلان کیا۔
2: 27 مئی، 2008 تک زلزلے کے ملبے کی وجہ سے 34 جھیلیں تشکیل پائی تھیں جو دریاؤں کو مسدود اور ڈیم کر رہی تھیں۔
3: سرکاری عہدیداروں نے ہزاروں گرے ہوئے اسکولوں کے کھنڈرات کا معائنہ کرنا شروع کیا۔
4: 7 اپریل 2008 کو تین کارکنوں نے تبتی جھنڈے لے کر گولڈن گیٹ برج کی معطلی کیبلوں پر چڑھائی کی۔
5: 8 اپریل کو متعدد احتجاجوں کی منصوبہ بندی کی گئی تھی۔


In [20]:
print("Retrieved Scores")
for i, answer in enumerate(scores, 1):
    print(f"{i}: {answer}")

Retrieved Scores
1: 0.934968578690951
2: 0.9125134389921301
3: 0.9104740274318642
4: 0.900152235411595
5: 0.8996248410774526


### **Generating response from Retrived_Answers using Alif 8b**

In [21]:
!pip install openai

Python(9463) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/mac/Documents/LMA-RAG Code/LMA-RAG Thesis Code/OpenAI FAQ/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [22]:
!pip install --upgrade openai


Python(9465) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/mac/Documents/LMA-RAG Code/LMA-RAG Thesis Code/OpenAI FAQ/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
from openai import OpenAI

# Initialize OpenAI client with API key
openai_client = OpenAI(api_key= "sk-")  # or use environment variable

In [24]:
def generate_using_openai(context, query, model_name="gpt-4o"):
    prompt = f"""
You are given a **Urdu question** and a relevant **Urdu context**. Read the context carefully and generate a **correct, concise answer in Urdu**, based **only** on the information in the context.

Instructions:
- Use only the information provided in the context.
- Do **not** add any external or inferred information.
- Answer should be brief, accurate, and in **Urdu only**.

### Example:

**Question:**
19 مئی 2008 کو کون سی تقریب کا اعلان کیا گیا؟

**Context:**
چین کا قومی جھنڈا سٹیٹ کونسل نے 19 مئی کو آدھے مست پر لہرایا۔ یہ پہلی بار تھا کہ کسی ریاستی رہنما کی موت کے علاوہ کسی اور چیز کے لیے قومی سوگ کا اعلان کیا گیا تھا۔ ریاستی کونسل نے زلزلے کے متاثرین کے لیے 19 مئی کو اعلان کیا۔

**Answer:**
19 مئی 2008 کو زلزلے کے متاثرین کے لیے تین روزہ قومی سوگ کا اعلان کیا گیا۔

---

Now answer the following:

**Question:**
{query}

**Context:**
{context}

**Answer (in Urdu):**
"""

    try:
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        print("❌ Error from OpenAI API:", e)
        return ""


In [25]:
retrieved_answers

['اسٹیٹ کونسل نے زلزلے کے متاثرین کے لیے تین روزہ قومی سوگ کا اعلان کیا۔',
 '27 مئی، 2008 تک زلزلے کے ملبے کی وجہ سے 34 جھیلیں تشکیل پائی تھیں جو دریاؤں کو مسدود اور ڈیم کر رہی تھیں۔',
 'سرکاری عہدیداروں نے ہزاروں گرے ہوئے اسکولوں کے کھنڈرات کا معائنہ کرنا شروع کیا۔',
 '7 اپریل 2008 کو تین کارکنوں نے تبتی جھنڈے لے کر گولڈن گیٹ برج کی معطلی کیبلوں پر چڑھائی کی۔',
 '8 اپریل کو متعدد احتجاجوں کی منصوبہ بندی کی گئی تھی۔']

In [26]:
query = "19 مئی 2008 کو کون سی تقریب کا اعلان کیا گیا؟"
if __name__ == "__main__":

    # Generate an answer using the local Ollama server
    final_answer = generate_using_openai(retrieved_answers, query)
    print("Generated Answer:", final_answer)

Generated Answer: 19 مئی 2008 کو زلزلے کے متاثرین کے لیے تین روزہ قومی سوگ کا اعلان کیا گیا۔


# **RAG Pipelines**

In [27]:
def traditional_rag_pipeline(client, trad_collection_name, query):
    """
    Executes the traditional RAG approach.
    Returns results, retrieval time, and generation time.
    """
    start_retrieval = time.time()
    retrieved_context , similarity_scores = query_similarity_docus_chunks(client, trad_collection_name, query, top_k=3)  # Retrieve relevant sentences
    end_retrieval = time.time()
    retrieval_time = end_retrieval - start_retrieval

    start_answer = time.time()
    generated_answer = generate_using_openai(retrieved_context, query)  # Generate answer
    end_answer = time.time()
    generation_time = end_answer - start_answer

    total_time = retrieval_time + generation_time

    result = {
        "retrieved_context": retrieved_context,
        "retrieval_time": retrieval_time,
        "answer": generated_answer,  # Make sure this key is correctly returned
        "generation_time": generation_time,
        "total_time": total_time,
        "similarity score": similarity_scores
    }

    # Debug: Print the result to inspect the return value
    print("Traditional RAG Result Generated")

    return result


In [28]:
def mod_rag_pipeline(query, client, mod_collection_name, top_k=3):
    """
    Executes the second RAG approach using query embeddings.
    Returns results, retrieval time, and other metadata.
    """
    start_retrieval = time.time()
    answers, scores = query_similar_embeddings(client, mod_collection_name, query, top_k)  # Now returns two lists
    end_retrieval = time.time()
    retrieval_time = end_retrieval - start_retrieval

    if answers:
        # Combine answers with their corresponding similarity scores for context
        retrieved_documents = [f"Answer: {answers[i]}\nScore: {scores[i]:.2f}" for i in range(len(answers))]

        # Generate answer based on retrieved answers
        start_answer = time.time()
        generated_answer = generate_using_openai(answers, query)
        end_answer = time.time()
        answer_time = end_answer - start_answer

        total_time = retrieval_time + answer_time

        return {
            "retrieved_documents": retrieved_documents,
            "similarity_scores": scores,
            "retrieval_time": retrieval_time,
            "generated_answer": generated_answer,
            "generation_time": answer_time,
            "total_time": total_time
        }
    else:
        return {
            "retrieved_documents": [],
            "similarity_scores": [],
            "retrieval_time": retrieval_time,
            "generated_answer": None,
            "generation_time": 0,
            "total_time": retrieval_time
        }

In [29]:
def rag_pipeline(query, client, trad_collection_name, mod_collection_name, similarity_threshold=0.8, top_k=3):
    """
    Main pipeline that first tries query embeddings (approach 2).
    Falls back to traditional RAG if similarity scores are below the threshold.
    """
    # Step 1: Try query embeddings (Approach 2)
    query_similar_results = mod_rag_pipeline(query, client, mod_collection_name, top_k)

    # Check similarity scores
    if query_similar_results["similarity_scores"]:
        max_similarity = max(query_similar_results["similarity_scores"])

        if max_similarity >= similarity_threshold:
            # Return results from the query embeddings approach
            return {
                "case": "Q-A Index",
                "answer": query_similar_results["generated_answer"],
                "retrieved_context": "\n".join(query_similar_results["retrieved_documents"]),
                "retrieval_time": query_similar_results["retrieval_time"],
                "generated_context": query_similar_results["generated_answer"],
                "generation_time": query_similar_results["generation_time"],
                "total_time": query_similar_results["total_time"],
            }

    # Step 2: Fallback to traditional RAG if similarity is low or no results
    traditional_results = traditional_rag_pipeline(client, trad_collection_name, query)

    return {
        "case": "Traditional RAG",
        "answer": traditional_results["answer"],
        "retrieved_context": "\n".join(traditional_results["retrieved_context"]),
        "retrieval_time": traditional_results["retrieval_time"],
        "generation_time": traditional_results["generation_time"],
        "total_time": traditional_results["total_time"],
    }


In [30]:
# Execute the RAG pipeline
results = rag_pipeline(query="والدین نے 17 جولائی 2008 تک کس چیز کی شکایت کی؟", client=client, trad_collection_name="docus_chunks", mod_collection_name="faq_embeddings", similarity_threshold=0.8, top_k=3)

# Print Results
print(f"Case: {results['case']}\n")
print(f"Answer: {results['answer']}\n")
print(f"Retrieved Context: {results['retrieved_context']}\n")
print(f"Retrieval Time: {results['retrieval_time']:.2f}s\n")
print(f"Generation Time: {results['generation_time']:.2f}s\n")
print(f"Total Time: {results['total_time']:.2f}s")


  search_results = client.search(


Case: Q-A Index

Answer: والدین نے شکایت کی کہ انہیں ابھی تک کوئی رپورٹ موصول نہیں ہوئی۔

Retrieved Context: Answer: والدین نے شکایت کی کہ انہیں ابھی تک کوئی رپورٹ موصول نہیں ہوئی۔
Score: 0.99
Answer: احتجاج کو محدود کرنے کے لئے عہدیداروں نے والدین کو ایک دستاویز پر دستخط کرنے پر مجبور کیا جس میں انہیں احتجاج کرنے سے منع کیا گیا۔
Score: 0.87
Answer: عہدیداروں نے والدین کو ایک دستاویز پر دستخط کرنے پر مجبور کیا جس میں انہیں احتجاج کرنے سے منع کیا گیا۔
Score: 0.87

Retrieval Time: 79.28s

Generation Time: 2.44s

Total Time: 81.72s


### Uploading Ground Truth CSVs and merging results into 1 Dataframe

In [37]:

# def convert_xlsx_to_csv(xlsx_path, csv_path):
#     """
#     Convert an Excel .xlsx file to a .csv file.

#     Args:
#         xlsx_path (str): Path to the input .xlsx file.
#         csv_path (str): Path where the output .csv will be saved.
#     """
#     df = pd.read_excel(xlsx_path)
#     df.to_csv(csv_path, index=False, encoding='utf-8-sig')

In [38]:
# convert_xlsx_to_csv("Urdu_qa_groundtruths.xlsx", "Urdu_qa_groundtruths.csv")


In [31]:
import pandas as pd

csv_path = "Urdu_qa_groundtruths.csv"
merged_df = pd.read_csv(csv_path)



In [32]:
def get_question_answer_columns(filtered_df):
    # Select only 'question' and 'answers' columns from the merged DataFrame
    if 'question' in merged_df.columns and 'answer' in merged_df.columns:
        return merged_df[['question', 'answer']]
    else:
        raise ValueError("The columns 'question' and/or 'answer' do not exist in the DataFrame.")

In [33]:
new_df=get_question_answer_columns(merged_df)

In [34]:
new_df

Unnamed: 0,question,answer
0,3 اپریل کو اولمپک مشعل کس شہر میں تھی؟,استنبول
1,چینی حکومت نے بائیکاٹ کی صورتحال کو کم کرنے کے...,سینسرشپ
2,2008 کے اولمپکس کے لئے پہلے مشعل بردار کا نام ...,الیکسینڈروس نکولائڈس
3,کون سی حکومت مشعل کے راستے کی وضاحت کرنے کے لئ...,تائیوان
4,قازقستان میں روٹ کے لئے کلومیٹر میں فاصلہ کیا ...,20
...,...,...
894,زلزلے کی فوکل گہرائی کتنی تھی؟,19 کلومیٹر
895,سیچوان زلزلہ کس سال ہوا تھا؟,2008
896,14 مئی تک کتنی رقم عطیہ کی گئی تھی؟,10.7 بلین یوآن
897,زلزلہ دن کے کس وقت ہوا؟,02:28:01 چین اسٹینڈرڈ ٹائم


In [35]:
# Drop rows with NaN in either 'question' or 'answers' column
new_df = new_df.dropna(subset=['question', 'answer']).reset_index(drop=True)

# View the cleaned DataFrame
print(new_df)


                                              question  \
0               3 اپریل کو اولمپک مشعل کس شہر میں تھی؟   
1    چینی حکومت نے بائیکاٹ کی صورتحال کو کم کرنے کے...   
2    2008 کے اولمپکس کے لئے پہلے مشعل بردار کا نام ...   
3    کون سی حکومت مشعل کے راستے کی وضاحت کرنے کے لئ...   
4    قازقستان میں روٹ کے لئے کلومیٹر میں فاصلہ کیا ...   
..                                                 ...   
894                     زلزلے کی فوکل گہرائی کتنی تھی؟   
895                       سیچوان زلزلہ کس سال ہوا تھا؟   
896                14 مئی تک کتنی رقم عطیہ کی گئی تھی؟   
897                            زلزلہ دن کے کس وقت ہوا؟   
898                 تائیوان سے چارٹرڈ فلائٹ کہاں اتری؟   

                         answer  
0                       استنبول  
1                       سینسرشپ  
2          الیکسینڈروس نکولائڈس  
3                       تائیوان  
4                            20  
..                          ...  
894                  19 کلومیٹر  
895                        2008

## **Generating results for all Ground Truth Q&As**

In [36]:
import pandas as pd
import time

In [45]:
# import torch

# if torch.cuda.is_available():
#     print("GPU is available:", torch.cuda.get_device_name(0))
# else:
#     print("CUDA not available, running on CPU")


In [46]:
# import torch
# print(torch.__version__)
# print(torch.version.cuda)  # Shows which CUDA version PyTorch was built with (None if CPU-only)


In [47]:
# import torch
# print("PyTorch CUDA available?", torch.cuda.is_available())
# if torch.cuda.is_available():
#     print("GPU name:", torch.cuda.get_device_name(0))


### **Final Answers Generate** ###

In [None]:
new_df['modified_rag_refined_answer'] = ""
new_df['modified_rag_case'] = ""
new_df['modified_rag_retrieval_time'] = ""
new_df['modified_rag_generation_time'] = ""
new_df['modified_rag_total_time'] = ""

new_df['traditional_rag_retrieved_context'] = ""
new_df['traditional_rag_refined_answer'] = ""
new_df['traditional_rag_retrieval_time'] = ""
new_df['traditional_rag_generation_time'] = ""
new_df['traditional_rag_total_time'] = ""

output_file = 'Final_Answers_Generated_OpenAI.csv'

it = 0
for index, row in new_df.iterrows():
    query = row['question']  # Extract the question from the CSV

    # Pass the query through the RAG pipeline
    rag_result = rag_pipeline(
        query, 
        client=client, 
        trad_collection_name="docus_chunks", 
        mod_collection_name="faq_embeddings", 
        similarity_threshold=0.8, 
        top_k=3
    )

    # Extract RAG results
    retrival_time = rag_result["retrieval_time"]
    generation_time = rag_result["generation_time"]
    total_time = rag_result["total_time"]
    retrieved_context = rag_result["retrieved_context"]
    refined_answer = rag_result["answer"]
    case = rag_result["case"]

    # Update the DataFrame directly for the current row
    new_df.at[index, 'modified_rag_retrieved_context'] = retrieved_context
    new_df.at[index, 'modified_rag_refined_answer'] = refined_answer
    new_df.at[index, 'modified_rag_case'] = case
    new_df.at[index, 'modified_rag_retrieval_time'] = f"{retrival_time:.2f}"
    new_df.at[index, 'modified_rag_generation_time'] = f"{generation_time:.2f}"
    new_df.at[index, 'modified_rag_total_time'] = f"{total_time:.2f}"

    # Traditional RAG approach
    traditional_results = traditional_rag_pipeline(client, "docus_chunks", query)
    trad_rag_retrieved_context_str = "\n\n".join(traditional_results["retrieved_context"])
    trad_refined_answer = traditional_results["answer"]
    trad_retrieval_time = traditional_results["retrieval_time"]
    trad_generation_time = traditional_results["generation_time"]
    trad_total_time = traditional_results["total_time"]

    # Update DataFrame columns for traditional RAG
    new_df.at[index, 'traditional_rag_retrieved_context'] = trad_rag_retrieved_context_str
    new_df.at[index, 'traditional_rag_refined_answer'] = trad_refined_answer
    new_df.at[index, 'traditional_rag_retrieval_time'] = f"{trad_retrieval_time:.2f}"
    new_df.at[index, 'traditional_rag_generation_time'] = f"{trad_generation_time:.2f}"
    new_df.at[index, 'traditional_rag_total_time'] = f"{trad_total_time:.2f}"


    new_df.to_csv(output_file, index=False,encoding="utf-8-sig")

    it += 1
    print(f"Processed {it} queries. Saved partial results to {output_file}")

print("All queries processed. Final results saved to:", output_file)

  search_results = client.search(
  search_results = client.search(


Traditional RAG Result Generated
Processed 1 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv


  search_results = client.search(


Traditional RAG Result Generated
Processed 2 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv
Traditional RAG Result Generated
Processed 3 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv
Traditional RAG Result Generated
Processed 4 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv
Traditional RAG Result Generated
Processed 5 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv
Traditional RAG Result Generated
Processed 6 queries. Saved partial results to Final_Answers_Generated_OpenAI.csv
