## **Generating Q&As from Documents**

In [1]:
!pip install pypdf openai tqdm pdfminer.six
!pip install PyMuPDF sentence-transformers pandas transformers accelerate
!pip install ollama
!pip install openai
!pip install tiktoken


Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 241 kB/s eta 0:00:01
[?25hCollecting openai
  Downloading openai-1.79.0-py3-none-any.whl (683 kB)
[K     |████████████████████████████████| 683 kB 146 kB/s eta 0:00:01
[?25hCollecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 57 kB/s eta 0:00:011
Collecting httpx<1,>=0.23.0
  Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Collecting anyio<5,>=3.5.0
  Using cached anyio-4.9.0-py3-none-any.whl (100 kB)
Collecting sniffio
  Using cached sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting pydantic<3,>=1.9.0
  Using cached pydantic-2.11.4-py3-none-any.whl (443 kB)
Collecting distro<2,>=1.7.0
  Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.10.0-cp39-cp39-macosx_11_0_ar

In [2]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 539 kB/s eta 0:00:01
Collecting aiohttp
  Using cached aiohttp-3.11.18-cp39-cp39-macosx_11_0_arm64.whl (457 kB)
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.2-py2.py3-none-any.whl (7.6 kB)
Collecting yarl<2.0,>=1.17.0
  Using cached yarl-1.20.0-cp39-cp39-macosx_11_0_arm64.whl (95 kB)
Collecting propcache>=0.2.0
  Using cached propcache-0.3.1-cp39-cp39-macosx_11_0_arm64.whl (46 kB)
Collecting async-timeout<6.0,>=4.0
  Using cached async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Collecting aiohappyeyeballs>=2.3.0
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl (15 kB)
Collecting multidict<7.0,>=4.5
  Using cached multidict-6.4.4-cp39-cp39-macosx_11_0_arm64.whl (38 kB)
Collecting attrs>=17.3.0
  Using cached attrs-25.3.0-py3-none-any.whl (63 kB)
Collecting frozenlist>=1.1.1
  Using cached frozenlist-1.6.0-cp39-cp39-macosx_11_0_arm64.whl (122 kB)
In

In [3]:
!pip install --upgrade openai


Collecting openai
  Using cached openai-1.79.0-py3-none-any.whl (683 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.79.0
You should consider upgrading via the '/Users/mac/Documents/LMA-RAG Code/LMA-RAG Thesis Code/OpenAI FAQ/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [4]:
!python.exe -m pip install --upgrade pip

zsh:1: command not found: python.exe


In [6]:
import os
import re
from typing import List
import fitz  # PyMuPDF
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import ollama
from openai import OpenAI


  from .autonotebook import tqdm as notebook_tqdm


Read the text of every PDF file in a folder

In [7]:
def read_urdu_pdfs_from_folder(folder_path):
    """Extract and concatenate the text of every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (sub-folders are not visited). A file is
            treated as a PDF when its name ends in ".pdf", case-insensitively.

    Returns:
        One string containing the text of each page followed by a newline,
        or "" when the folder holds no PDF files.
    """
    page_texts = []
    # os.listdir returns names in arbitrary order; sorting keeps the combined
    # text (and therefore the downstream chunk numbering) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with fitz.open(file_path) as doc:
            for page in doc:
                page_texts.append(page.get_text() + "\n")
    # Join once instead of growing a string page by page.
    return "".join(page_texts)


In [8]:
def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences at Urdu and English sentence-ending punctuation.

    A boundary is any run of whitespace that follows one of `.`, `!`, `?`
    (English) or `؟`, `۔` (Urdu question mark / full stop). The punctuation
    stays attached to the sentence it ends.

    Args:
        text: The text to split.

    Returns:
        The sentences in order; empty or whitespace-only pieces are dropped,
        so an empty input yields an empty list.
    """
    # The look-behind keeps the terminator with its sentence. The English "?"
    # is included alongside the Urdu "؟" so English questions are split too.
    pieces = re.split(r'(?<=[.!?؟۔])\s+', text)
    return [piece for piece in pieces if piece.strip()]

def split_into_chunks(text: str, chunk_size: int = 800, min_chunks: int = 10) -> List[str]:
    """Group sentences into chunks of at most `chunk_size` characters.

    Sentences (from `split_into_sentences`) are joined with single spaces and
    packed greedily. A sentence that is longer than `chunk_size` on its own is
    kept whole as a single over-long chunk rather than being cut.

    Args:
        text: The full text to split.
        chunk_size: Target maximum number of characters per chunk.
        min_chunks: If sentence-based packing yields fewer chunks than this,
            the result is discarded and the text is cut into fixed-size
            slices of `chunk_size` characters instead.

    Returns:
        The list of chunks; an empty list for empty text.
    """
    chunks = []
    current_parts = []   # sentences collected for the chunk being built
    current_len = 0      # length of " ".join(current_parts)

    for sentence in split_into_sentences(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Count the joining space too, so a chunk never overshoots by one.
        needed = len(sentence) + (1 if current_parts else 0)
        if current_parts and current_len + needed > chunk_size:
            chunks.append(" ".join(current_parts))
            current_parts = [sentence]
            current_len = len(sentence)
        else:
            # Also taken when the chunk is still empty, so an over-long first
            # sentence no longer causes an empty chunk to be emitted.
            current_parts.append(sentence)
            current_len += needed

    if current_parts:
        chunks.append(" ".join(current_parts))

    if len(chunks) < min_chunks:
        print("⚠️ Too few chunks generated; falling back to fixed-size splitting.")
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    return chunks


#### Function to generate Q&A pairs using OpenAI ####

In [9]:
# Never store the API key in the notebook itself. Use the OPENAI_API_KEY
# environment variable when it is already set; otherwise prompt for the key
# without echoing it, so the secret is not saved with the file or its outputs.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [10]:
import openai
import tiktoken

# Rebinds `client` (also created in the previous cell). Called with no
# arguments, so the key is expected to come from the OPENAI_API_KEY
# environment variable.
client = openai.OpenAI()

# tiktoken encoding for the "gpt-4o" model; count_tokens uses it to measure prompts.
tokenizer = tiktoken.encoding_for_model("gpt-4o")

def count_tokens(text):
    """Return the number of tokens the module-level `tokenizer` yields for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def generate_qa_pairs(text, chunk_index=None, verbose=False):
    """Ask the gpt-4o chat model for Urdu question-answer pairs about a paragraph.

    Args:
        text: The Urdu paragraph to generate questions from.
        chunk_index: Optional identifier of the chunk, used only in messages.
        verbose: When True, print a note whenever a chunk is skipped.

    Returns:
        The model's reply with surrounding whitespace removed, or "" when the
        chunk is blank, the prompt is too large for the token budget, the
        model returns no text, or the request raises an exception (the
        exception is printed, not re-raised).
    """
    # A blank paragraph has nothing to ask about; sending it would cost a
    # request and invite invented content, so skip it before building a prompt.
    if not text or not text.strip():
        if verbose:
            print(f"⚠️ Skipping chunk {chunk_index} — empty text")
        return ""

    base_prompt = f"""
    Read the following Urdu paragraph carefully. Then generate 5 to 10 question-answer pairs based *only* on the information in the paragraph.

    Instructions:
    - Output must be in **Urdu**.
    - Each question must start with "سوال:" and each answer with "جواب:".
    - Do **not** add information not present in the paragraph.
    - If the paragraph contains limited information, generate fewer questions.
    - Avoid any hallucination.

    ### Example:

    Paragraph:
    \"\"\"
    2005 کے زلزلے کے بعد حکومت نے متاثرہ علاقوں کا سروے کیا۔ کئی اسکول تباہ ہو گئے تھے۔ عوام نے الزام لگایا کہ عمارتیں ناقص مواد سے بنائی گئی تھیں۔ کچھ رپورٹس میں بتایا گیا کہ چند اسکولوں میں حفاظتی اقدامات مکمل نہیں تھے۔
    \"\"\"

    Output:

    سوال 1: 2005 کے زلزلے کے بعد حکومت نے کیا اقدام کیا؟
    جواب 1: حکومت نے متاثرہ علاقوں کا سروے کیا۔

    سوال 2: عوام نے عمارتوں کے بارے میں کیا الزام لگایا؟
    جواب 2: عوام نے الزام لگایا کہ عمارتیں ناقص مواد سے بنائی گئی تھیں۔

    سوال 3: رپورٹس میں کیا بات سامنے آئی؟
    جواب 3: رپورٹس میں بتایا گیا کہ چند اسکولوں میں حفاظتی اقدامات مکمل نہیں تھے۔

    ---

    Now apply the same approach to the following paragraph:

    Paragraph:
    \"\"\"
    {text}
    \"\"\"

    Now generate the questions and answers in Urdu:
    """

    # Token budget: the prompt plus the reserved completion must fit in max_total.
    prompt_tokens = count_tokens(base_prompt)
    max_total = 128000
    max_completion = 1500

    if prompt_tokens + max_completion > max_total:
        if verbose:
            print(f"⚠️ Skipping chunk {chunk_index} — exceeds token limit ({prompt_tokens})")
        return ""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates question-answer pairs based on Urdu text."},
                {"role": "user", "content": base_prompt}
            ],
            max_tokens=max_completion,
            temperature=0.5
        )
        # The message content can be None; treat that as an empty reply
        # instead of relying on the blanket handler below to catch the
        # resulting AttributeError.
        content = response.choices[0].message.content
        return (content or "").strip()

    except Exception as e:
        # Best effort by design: one failed chunk must not abort the whole run.
        print(f"⚠️ Error generating Q&A for chunk {chunk_index}: {e}")
        return ""


Parse Q&As

In [11]:
def parse_qa_output(output_text):
    """Extract (question, answer) pairs from the model's Urdu reply.

    A line whose text starts with "سوال" opens a new question and a line that
    starts with "جواب" supplies the answer to the current question. Leading
    markdown markers (`*`, `#`) around the labels are tolerated. All other
    lines are ignored.

    Args:
        output_text: Raw model output; None or "" yields an empty list.

    Returns:
        A list of (question, answer) tuples in order of appearance. A question
        is only kept when it has a non-empty answer.
    """
    question_label = "سوال"
    answer_label = "جواب"

    def _text_after_label(line, label):
        # Normal case: "<label> <n>: text" -> everything after the first colon.
        _, colon, rest = line.partition(":")
        if not colon:
            # No colon: drop the label plus an optional short number and
            # delimiter, so the label itself does not leak into the text.
            rest = re.sub(r'^\s*(?:\d{1,3}(?!\d))?\s*[.\-۔)]?\s*', '', line[len(label):])
        return rest.strip().strip("*").strip()

    qa_pairs = []
    question, answer = "", ""

    for raw_line in (output_text or "").strip().split('\n'):
        line = raw_line.strip().lstrip("*#").strip()
        if line.startswith(question_label):
            if question and answer:
                qa_pairs.append((question, answer))
            question = _text_after_label(line, question_label)
            # Always start the new question with no answer, so a leftover
            # answer can never be attached to the wrong question.
            answer = ""
        elif line.startswith(answer_label):
            answer = _text_after_label(line, answer_label)

    if question and answer:
        qa_pairs.append((question, answer))

    return qa_pairs


Processing Pipeline

In [12]:
def process_pdfs_and_generate_qa(folder_path, output_csv_path):
    """Run the full pipeline: read PDFs, chunk the text, generate and save Q&A pairs.

    Each chunk is sent to `generate_qa_pairs`; the reply is parsed with
    `parse_qa_output` and every pair is recorded with its 1-based chunk number.
    A failure on one chunk is printed and the run continues with the next.

    Args:
        folder_path: Folder containing the source PDF files.
        output_csv_path: Where to write the CSV (UTF-8 with BOM) with the
            columns `chunk`, `question`, `answer`.

    Returns:
        The pandas DataFrame that was written to `output_csv_path`.
    """
    full_text = read_urdu_pdfs_from_folder(folder_path)
    chunks = split_into_chunks(full_text)
    total_chunks = len(chunks)

    all_qa_pairs = []

    for chunk_number, chunk in enumerate(chunks, start=1):
        print(f"\n🔹 Processing chunk {chunk_number}/{total_chunks}")
        print(f"🧩 Chunk content:\n{chunk[:800]}...\n")  # Print preview

        # A blank chunk has nothing to ask about; do not spend a request on it.
        if not chunk.strip():
            continue

        try:
            # Pass the chunk number so messages from generate_qa_pairs name
            # the chunk instead of reporting "chunk None".
            output = generate_qa_pairs(chunk, chunk_index=chunk_number)
            print(f"📥 Model output:\n{output}\n")

            for q, a in parse_qa_output(output):
                all_qa_pairs.append({'chunk': chunk_number, 'question': q, 'answer': a})
        except Exception as e:
            print(f"❌ Error processing chunk {chunk_number}: {e}")

    # Explicit columns keep the CSV header present even when no pairs were produced.
    df = pd.DataFrame(all_qa_pairs, columns=['chunk', 'question', 'answer'])
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"\n✅ All Q&A pairs saved to: {output_csv_path}")
    return df


 Main script

In [13]:
# Folder scanned for input PDFs (a relative path).
pdf_folder_path = 'urdu_pdfs' 
# Destination CSV for the generated question/answer pairs.
output_csv = 'urdu_qa_pairs_updated_openai_4o.csv'

# Runs the whole pipeline; generate_qa_pairs is called once per chunk.
process_pdfs_and_generate_qa(pdf_folder_path, output_csv)



🔹 Processing chunk 1/274
🧩 Chunk content:
29
 ﻣﺋﯽ 2008
 ﮐو ، ﺳرﮐﺎری ﻋﮩدﯾداروں ﻧﮯ ﮨزاروں ﮔرے ﮨوﺋﮯ اﺳﮑوﻟوں ﮐﮯ ﮐﮭﻧڈرات ﮐﺎ ﻣﻌﺎﺋﻧہ ﮐرﻧﺎ ﺷروع ﮐﯾﺎ ، اس 
ﺑﺎرے ﻣﯾں اﺷﺎرے ﺗﻼش ﮐرﻧﮯ ﮐﮯ ﻟﺋﮯ ﮐہ وه ﮐﯾوں ﮔر ﮔﺋﮯ۔ ﺻوﺑﮯ ﮐﮯ آس ﭘﺎس ﮨزاروں واﻟدﯾن ﻧﮯ ﻣﻘﺎﻣﯽ ﻋﮩدﯾداروں اور 
ﺑﻠڈرز ﭘر اﺳﮑول ﮐﯽ ﺗﻌﻣﯾر ﻣﯾں ﮐوﻧﮯ ﮐوﻧﮯ ﮐﺎﭨﻧﮯ ﮐﺎ اﻟزام ﻋﺎﺋد ﮐﯾﺎ ﮨﮯ ، اس ﮐﺎ ﺣواﻟہ دﯾﺗﮯ ﮨوﺋﮯ ﮐہ زﻟزﻟﮯ ﮐﮯ ﺑﻌد ﻗرﯾﺑﯽ 
دﯾﮕر ﻋﻣﺎرﺗوں ﮐو ﺑﮩت ﮐم ﻧﻘﺻﺎن ﭘﮩﻧﭼﺎ ﺗﮭﺎ۔ زﻟزﻟﮯ ﮐﮯ ﺑﻌد ، ﺑﮩت ﺳﯽ ﻣﻘﺎﻣﯽ ﺣﮑوﻣﺗوں ﻧﮯ ﺳرﮐﺎری طور ﭘر اﺳﮑول ﮐﮯ 
ﮔرﻧﮯ ﮐﯽ ﺗﺣﻘﯾﻘﺎت ﮐرﻧﮯ ﮐﺎ وﻋده ﮐﯾﺎ ، ﻟﯾﮑن 17
 ﺟوﻻﺋﯽ 2008
 ﺗﮏ ﭘورے ﺳﯾﭼوان ﻣﯾں ، ﮔرے ﮨوﺋﮯ اﺳﮑوﻟوں ﻣﯾں ﮔﻣﺷده 
ﺑﭼوں ﮐﮯ واﻟدﯾن ﻧﮯ ﺷﮑﺎﯾت ﮐﯽ ﮐہ اﻧﮩﯾں اﺑﮭﯽ ﺗﮏ ﮐوﺋﯽ رﭘورٹ ﻣوﺻول ﻧﮩﯾں ﮨوﺋﯽ۔ ﻣﻘﺎﻣﯽ ﻋﮩدﯾداروں ﻧﮯ اﻧﮩﯾں اﺣﺗﺟﺎج ﻧہ 
ﮐرﻧﮯ ﮐﯽ ﺗﺎﮐﯾد ﮐﯽ ﻟﯾﮑن واﻟدﯾن ﻧﮯ ﻣظﺎﮨره ﮐﯾﺎ اور ﺗﺣﻘﯾﻘﺎت ﮐﺎ ﻣطﺎﻟﺑہ ﮐﯾﺎ۔...

📥 Model output:
سوال 1: 29 مئی 2008 کو سرکاری عہدیداروں نے کیا اقدام کیا؟
جواب 1: سرکاری عہدیداروں نے ہزاروں گرے ہوئے اسکولوں کے کھنڈرات کا معائنہ کرنا شروع کیا۔

سوال 2: والدین نے مقامی عہدیداروں اور بلڈرز پر کیا الزام لگایا؟
جواب 2: والدین نے الز