In [None]:
!pip install pytesseract pdf2image PyPDF2 faiss-cpu numpy openai sentence-transformers langchain

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from 

In [None]:
# System-level package install. NOTE(review): presumably the backend that pdf2image
# (imported further down) needs to rasterise PDF pages — confirm.
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (297 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Refresh the apt package lists, then install the Tesseract OCR engine together with
# the 'ben' language package; the OCR step in this notebook requests lang='ben'.
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-ben

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 0 B/3,632 B 0%] [Wa0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,853 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http:/

In [None]:
# Open the Colab upload dialog and keep its return value in `uploaded`.
# The execution cell later opens the PDF by a fixed file name (PDF_PATH),
# so the file chosen here has to carry that name.
from google.colab import files
uploaded = files.upload()  # Choose your_file.pdf

Saving HSC26-Bangla1st-Paper.pdf to HSC26-Bangla1st-Paper.pdf


In [None]:
import os
import time
import re
import numpy as np
import faiss
import pytesseract
from pdf2image import convert_from_path
from openai import OpenAI

# === Step 1: Extract Bangla text from a PDF using OCR ===
def extract_text_from_pdf(pdf_path):
    """Run Bengali OCR over every page of a PDF and return the combined text.

    Each page image produced by ``convert_from_path`` is passed to Tesseract
    (``lang='ben'``, ``--psm 6``). Form-feed characters are removed, the page
    text is stripped, and every page contributes one trailing newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the OCR text of all pages, in page order.
    """
    page_texts = []
    for page_image in convert_from_path(pdf_path):
        ocr_output = pytesseract.image_to_string(page_image, lang='ben', config='--psm 6')
        page_texts.append(ocr_output.replace('\x0c', '').strip() + "\n")
    return "".join(page_texts)

# === Step 2: Split long OCR text into manageable overlapping chunks ===
def split_into_chunks(text, chunk_size=1000, chunk_overlap=500):
    """Split text into sentence-aligned chunks that overlap by a character tail.

    The text is cut at the Bengali danda (।), '?', '!' and newlines. Each
    sentence keeps its terminating punctuation (newlines are stripped away).
    Sentences are appended to the current chunk while it stays within
    ``chunk_size`` characters; otherwise the chunk is emitted and the next one
    starts with the last ``chunk_overlap`` characters of the previous buffer.

    Args:
        text: The full text to split.
        chunk_size: Soft upper bound for the buffer length before a new chunk
            is started. A single sentence longer than this is kept whole.
        chunk_overlap: Number of trailing characters carried into the next
            chunk. ``0`` disables the overlap.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).
    """
    # re.split with a capturing group yields [segment, delim, segment, delim, ..., segment].
    parts = re.split(r'(।|\?|!|\n)', text)

    merged = []
    # Step over the segments (even indices) and include the final one, which has
    # no delimiter after it; stopping one pair early would silently drop it.
    for i in range(0, len(parts), 2):
        sentence = parts[i].strip()
        if i + 1 < len(parts):
            sentence += parts[i + 1].strip()
        if sentence:
            merged.append(sentence)

    chunks, curr = [], ""
    for sentence in merged:
        if len(curr) + len(sentence) <= chunk_size:
            curr += sentence + " "
            continue
        # Never emit an empty chunk (happens when the very first sentence is oversized).
        if curr.strip():
            chunks.append(curr.strip())
        # curr[-0:] would be the whole buffer, so a zero overlap needs an explicit branch.
        tail = curr[-chunk_overlap:] if chunk_overlap > 0 else ""
        curr = tail + sentence + " "
    if curr.strip():
        chunks.append(curr.strip())
    return chunks

# === Step 3: Extract multiple-choice question answers from inline and tabular patterns ===
def extract_mcq_answers(text):
    """Collect MCQ answer keys from the text as ``{question_number: letter}``.

    Two layouts are recognised:

    * inline: ``<number>. <question text> উত্তর: গ`` (or the short marker
      ``উ:``) on a single line;
    * tabular: ``<number> | গ``.

    The inline pattern requires the answer marker to be present and the answer
    letter to stand alone (not be the first letter of a longer Bengali word).
    Without those two conditions any word beginning with ক/খ/গ/ঘ inside the
    question text would be picked up as the "answer".

    Args:
        text: The text to scan.

    Returns:
        Dict mapping the question number (int) to its answer letter. When a
        number appears in both layouts the tabular entry wins because it is
        applied last.
    """
    inline_answers = re.findall(
        # Inside [...] the '|' is a literal, so '.', '|' and '।' all terminate the number.
        r'(\d+)[.|।] .*?(?:উত্তর|উ\s*:)[:：\-\s]*([কখগঘ])(?![\u0980-\u09FF])',
        text,
    )
    table_answers = re.findall(r'(\d+)\s*\|\s*([কখগঘ])', text)
    return {int(num): opt for num, opt in inline_answers + table_answers}

# === Step 4: Given a question's options and correct letter, return the full answer text ===
def resolve_answer_text(question, options, answer_key):
    """Return the option text that corresponds to an answer letter.

    Args:
        question: The question text. Not used by the lookup; kept so the
            signature stays stable for callers.
        options: Sequence of option strings ordered ক, খ, গ, ঘ.
        answer_key: One of 'ক', 'খ', 'গ', 'ঘ'.

    Returns:
        The stripped option text, or a Bangla "answer not found" message when
        the key is unknown, the option is missing, or the inputs have the
        wrong type.
    """
    options_map = {'ক': 0, 'খ': 1, 'গ': 2, 'ঘ': 3}
    try:
        return options[options_map[answer_key]].strip()
    # Only lookup/type failures are expected here; a bare except would also
    # swallow KeyboardInterrupt and SystemExit.
    except (KeyError, IndexError, TypeError, AttributeError):
        return "উত্তর খুঁজে পাওয়া যায়নি।"

# === Step 5: Generate vector embeddings for each text chunk using OpenAI API ===
def embed_chunks_openai(chunks, openai_api_key, model_name="text-embedding-3-small"):
    """Embed every chunk with the OpenAI embeddings endpoint.

    One request is sent per chunk, with a short pause after each successful
    call. A chunk whose request fails is reported on stdout and represented by
    an all-zero vector so that positions stay aligned with ``chunks``.

    Args:
        chunks: List of text chunks to embed.
        openai_api_key: API key passed to the OpenAI client.
        model_name: Name of the embedding model.

    Returns:
        Tuple ``(embeddings, model_name)`` where ``embeddings`` is a list of
        float lists, one per chunk, all of the same length.
    """
    client = OpenAI(api_key=openai_api_key)
    embeddings = []
    failed_positions = []
    for i, chunk in enumerate(chunks):
        try:
            res = client.embeddings.create(input=chunk, model=model_name)
            embeddings.append(res.data[0].embedding)
            time.sleep(0.1)
        except Exception as e:
            print(f"Embedding error for chunk {i}: {e}")
            # Placeholder; filled once the real embedding width is known.
            embeddings.append(None)
            failed_positions.append(i)

    # Size the zero vectors after the vectors the model actually returned, so the
    # result is rectangular for any model. 1536 is only the last-resort default
    # (used when every request failed), matching the previous behaviour.
    dim = next((len(vec) for vec in embeddings if vec is not None), 1536)
    for i in failed_positions:
        embeddings[i] = [0.0] * dim
    return embeddings, model_name

# === Step 6: Build FAISS index for efficient similarity-based retrieval ===
def build_faiss_index(embeddings, metric='l2'):
    """Create a flat FAISS index over the given embeddings.

    Args:
        embeddings: Non-empty sequence of equal-length float vectors.
        metric: ``'l2'`` for an ``IndexFlatL2``; ``'dot'`` for an
            ``IndexFlatIP``; ``'cosine'`` for an ``IndexFlatIP`` built on
            L2-normalised copies of the vectors.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If ``metric`` is not one of the three supported names.
    """
    dim = len(embeddings[0])
    vectors = np.array(embeddings).astype('float32')

    if metric not in ('l2', 'dot', 'cosine'):
        raise ValueError("Unknown metric.")

    # Cosine similarity is the inner product of unit-length vectors.
    if metric == 'cosine':
        faiss.normalize_L2(vectors)

    index = faiss.IndexFlatL2(dim) if metric == 'l2' else faiss.IndexFlatIP(dim)
    index.add(vectors)
    return index

# === Step 7: Generate query embedding ===
def get_embedding(query, api_key, model):
    """Embed a single query string with the OpenAI embeddings endpoint.

    Args:
        query: Text to embed.
        api_key: API key passed to the OpenAI client.
        model: Name of the embedding model.

    Returns:
        The embedding as a one-dimensional ``float32`` NumPy array.
    """
    response = OpenAI(api_key=api_key).embeddings.create(input=query, model=model)
    vector = response.data[0].embedding
    return np.array(vector, dtype=np.float32)



In [None]:
# === Step 8: Retrieve top-k most similar chunks from FAISS index ===
def retrieve(query, index, chunks, api_key, model, k=5, metric='l2'):
    """Return the chunks most similar to ``query`` together with their scores.

    Args:
        query: Question text to embed and search for.
        index: FAISS index built over the embeddings of ``chunks``.
        chunks: The text chunks, in the same order as the indexed vectors.
        api_key: OpenAI API key used to embed the query.
        model: Embedding model name; must match the one used for the index.
        k: Maximum number of neighbours to return.
        metric: ``'cosine'`` L2-normalises the query before searching; any
            other value searches with the raw embedding.

    Returns:
        Tuple ``(matched_chunks, scores)``; both have the same length, which
        can be smaller than ``k`` when the index holds fewer vectors.
    """
    q = get_embedding(query, api_key, model)
    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    q = np.ascontiguousarray(q, dtype=np.float32).reshape(1, -1)
    if metric == 'cosine':
        # Normalise the exact array that is searched (in place).
        faiss.normalize_L2(q)
    D, I = index.search(q, k)
    # FAISS pads missing neighbours with label -1; indexing chunks[-1] would
    # wrongly return the last chunk, so drop those slots from both outputs.
    valid = I[0] >= 0
    return [chunks[i] for i in I[0][valid]], D[0][valid]

# === Step 9: Generate answer using OpenAI LLM based on context chunks ===
def generate_answer(context_chunks, query, api_key):
    """Ask the chat model to answer ``query`` using only the retrieved chunks.

    Args:
        context_chunks: Retrieved text chunks; joined with newlines to form
            the context block of the prompt.
        query: The user's question.
        api_key: API key passed to the OpenAI client.

    Returns:
        The model's reply with surrounding whitespace removed, or a fixed
        Bangla "no relevant information" message when there is no context
        (in which case no request is made).
    """
    if not context_chunks:
        return "প্রাসঙ্গিক কোনো তথ্য পাওয়া যায়নি।"

    context = "\n".join(context_chunks)

    # The instruction text is sent verbatim as the system message.
    system_prompt = '''You are a helpful assistant for solving Bangla textbook MCQs. Given:

- A user-submitted question in Bangla
- A set of retrieved MCQs with options and answers from textbook pages

You must find the best matching MCQ from the context and return only the correct full answer text (e.g., "দুইটি").

Step-by-step reasoning:
1. Match user query to most relevant question in the retrieved context.
2. Identify its answer key either from inline format (উত্তর: গ) or from tabular form (e.g., 6 | গ).
3. Find the correct option using the key and return its associated answer text only.

Constraints:
- Do not guess or infer from external knowledge.
- Return only the answer text, not the letter or explanation.
- Maintain Bangla output format.'''
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"প্রসঙ্গ:\n{context}\n\nপ্রশ্ন: {query}"},
    ]

    completion = OpenAI(api_key=api_key).chat.completions.create(
        model="gpt-4o",
        messages=conversation,
    )
    reply = completion.choices[0].message.content
    return reply.strip()

# === Step 10: Initialize all processing (OCR, chunking, embeddings, indices, MCQ answers) ===
def initialize_rag(pdf_path, api_key):
    """Run the full preparation pipeline for one PDF.

    Steps, in order: OCR the PDF, split the text into chunks, embed the
    chunks, build one FAISS index per metric (L2, dot product, cosine) and
    extract the MCQ answer keys from the OCR text. Progress is printed.

    Args:
        pdf_path: Path of the PDF to process.
        api_key: OpenAI API key used for the embeddings.

    Returns:
        Tuple ``(chunks, model, indices, mcq_answers, text)`` where
        ``indices`` maps ``"L2"``, ``"Dot"`` and ``"Cosine"`` to their index.
    """
    print("Extracting text using OCR...")
    ocr_text = extract_text_from_pdf(pdf_path)

    print("Splitting text into chunks...")
    text_chunks = split_into_chunks(ocr_text)
    print(f"Prepared {len(text_chunks)} chunks.")

    print("Generating embeddings for chunks...")
    vectors, embedding_model = embed_chunks_openai(text_chunks, api_key)

    print("Building FAISS indices...")
    # Label -> metric name; insertion order fixes the order used by the query loop.
    metric_by_label = (("L2", 'l2'), ("Dot", 'dot'), ("Cosine", 'cosine'))
    indices = {
        label: build_faiss_index(vectors, metric_name)
        for label, metric_name in metric_by_label
    }

    answer_keys = extract_mcq_answers(ocr_text)
    return text_chunks, embedding_model, indices, answer_keys, ocr_text

# === Step 11: Match a user query with existing MCQ patterns ===
def find_best_mcq_match(query, text):
    """Find an MCQ in ``text`` whose question contains the start of ``query``.

    An MCQ is recognised as ``<number>. <question> (ক) ... উত্তর: <letter>``.
    The first 20 characters of the stripped query are looked up as a substring
    of each recognised question, in document order.

    Args:
        query: The user's question.
        text: Full text to search.

    Returns:
        ``(question_number, answer_letter)`` for the first match, or
        ``(None, None)`` when nothing matches or the query is blank.
    """
    prefix = query.strip()[:20]
    # An empty string is a substring of everything, so a blank query would
    # otherwise "match" the first MCQ in the document.
    if not prefix:
        return None, None

    pattern = re.compile(r'(\d+)[.|।]\s*(.*?)\((ক\)|খ\)|গ\)|ঘ\)).*?উত্তর[:：\-\s]*([কখগঘ])', re.DOTALL)
    for sl, question, _, correct_option in pattern.findall(text):
        if prefix in question.strip():
            return int(sl), correct_option
    return None, None

# === Step 12: Query interaction loop: try table match, regex fallback, and finally retrieval ===
def comparative_query_loop(chunks, indices, model, api_key, mcq_answers, full_text):
    """Interactive loop that answers questions typed on stdin.

    For every question the following strategies are tried in order:

    1. If the question contains digits and the first run of digits is a key
       of ``mcq_answers``, print that stored answer letter.
    2. Otherwise look for a textual MCQ match via ``find_best_mcq_match``.
    3. Otherwise retrieve context from every index in ``indices`` and let the
       chat model answer, printing one answer per index for comparison.

    Typing ``exit``, ``quit`` or ``q`` ends the loop. Blank input is ignored.

    Args:
        chunks: Text chunks aligned with the indexed vectors.
        indices: Mapping of display name to FAISS index; the lower-cased name
            is passed to ``retrieve`` as the metric.
        model: Embedding model name used for the query embedding.
        api_key: OpenAI API key.
        mcq_answers: Mapping of question number to answer letter.
        full_text: Full OCR text used for the textual MCQ match.
    """
    print("প্রশ্ন লিখুন (বন্ধ করতে 'exit' লিখুন):")
    while True:
        query = input("প্রশ্ন: ").strip()
        if query.lower() in ['exit', 'quit', 'q']:
            print("বিদায়!")
            break
        # Nothing to answer; avoid matching and paid API calls for an empty prompt.
        if not query:
            continue

        match = re.search(r'\d+', query)
        if match:
            q_num = int(match.group())
            if q_num in mcq_answers:
                ans_letter = mcq_answers[q_num]
                print(f"উত্তর (টেবিল থেকে পাওয়া): প্রশ্ন {q_num} এর উত্তর: {ans_letter}")
                print("-" * 40)
                continue

        sl, answer_letter = find_best_mcq_match(query, full_text)
        # Compare against None so that a question numbered 0 still counts as a match.
        if sl is not None and answer_letter:
            print(f"উত্তর (নমুনা মিল থেকে পাওয়া): প্রশ্ন {sl} এর উত্তর: {answer_letter}")
            print("-" * 40)
            continue

        for name, index in indices.items():
            print(f"Retrieving with {name} index")
            results, scores = retrieve(query, index, chunks, api_key, model, k=5, metric=name.lower())
            # An empty result has no top score to report.
            if len(scores) > 0:
                print(f"Top similarity score: {scores[0]:.4f}")
            answer = generate_answer(results, query, api_key)
            print(f"উত্তর ({name} retrieval): {answer}")
            print("-" * 40)



In [None]:
# === Execution ===
import getpass

PDF_PATH = "HSC26-Bangla1st-Paper.pdf"

# Take the OpenAI API key from the environment and fall back to a hidden prompt,
# so the secret is never written into the notebook source or its saved outputs.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Build the chunks, embeddings and FAISS indices once, then start the interactive loop.
chunks, emb_model, all_indices, mcq_answers, full_text = initialize_rag(PDF_PATH, OPENAI_API_KEY)
comparative_query_loop(chunks, all_indices, emb_model, OPENAI_API_KEY, mcq_answers, full_text)


Extracting text using OCR...
Splitting text into chunks...
Prepared 174 chunks.
Generating embeddings for chunks...
Building FAISS indices...
প্রশ্ন লিখুন (বন্ধ করতে 'exit' লিখুন):
প্রশ্ন: অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?
Retrieving with L2 index
Top similarity score: 1.1656
উত্তর (L2 retrieval): শস্তুনাথ
----------------------------------------
Retrieving with Dot index
Top similarity score: 0.4172
উত্তর (Dot retrieval): শস্তুনাথ বাবু
----------------------------------------
Retrieving with Cosine index
Top similarity score: 0.4172
উত্তর (Cosine retrieval): শস্তুনাথ
----------------------------------------
প্রশ্ন: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?
Retrieving with L2 index
Top similarity score: 1.0670
উত্তর (L2 retrieval): বিনু।
----------------------------------------
Retrieving with Dot index
Top similarity score: 0.4699
উত্তর (Dot retrieval): বিনু।
----------------------------------------
Retrieving with Cosine index
Top similarity score: 0.4699
উত্তর (Cosine re