# Create a Question Answering model based on Textbook


### step 1: Extract text and images page wise


In [1]:
import fitz  # PyMuPDF
from PIL import Image
import os
import io
import numpy as np

In [2]:
def is_blank_image(pil_img, threshold=10):
    """Returns True if the image is almost blank (white or very low variance)."""
    gray_img = pil_img.convert('L')
    np_img = np.array(gray_img)
    stddev = np.std(np_img)
    return stddev < threshold

In [3]:
def extract_text_and_images(pdf_path, output_dir):
    """Extract text and images
    """
    os.makedirs(output_dir, exist_ok=True)
    text_dir = os.path.join(output_dir, "text")
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    doc = fitz.open(pdf_path)

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        # Save text
        text = page.get_text()
        with open(os.path.join(text_dir, f"page_{page_num+1}.txt"), "w", encoding="utf-8") as f:
            f.write(text)

        # Save non-blank images
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_ext = base_image["ext"]
            image = Image.open(io.BytesIO(image_bytes))

            if not is_blank_image(image):
                image.save(os.path.join(image_dir, f"page_{page_num+1}_img_{img_index+1}.{img_ext}"))
        
        
    print(f"Extraction complete. Text in '{text_dir}', Images in '{image_dir}'")

In [4]:
extract_text_and_images("data/ncert/6/social/sixth_social_chapter1.pdf", "data/ncert/6/social/extracted")

MuPDF error: library error: FT_New_Memory_Face(LNWSSX+NotoSerif-Italic): unknown file format

Extraction complete. Text in 'data/ncert/6/social/extracted\text', Images in 'data/ncert/6/social/extracted\images'


In [5]:
# Lets starting chunking text
import os
import re
import glob
import json

In [6]:
def chunk_text(text, max_length=500, overlap=50):
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = min(start + max_length, text_length)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += max_length - overlap
    return chunks

In [7]:
def extract_page_number(filename):
    """
    Extract the page number from a filename formatted as 'page_{page_number}.txt'
    """
    base = os.path.basename(filename)
    match = re.match(r"page_(\d+)", base)
    if match:
        return int(match.group(1))
    return None

In [85]:
# Lets perform embedding and store the metadata and embedding in a json file

In [8]:
def process_text_files_with_metadata(text_folder, model, output_file="text_chunks_with_embeddings.json"):
    data = []  # List to hold the chunks with metadata
    
    # Process all text files in the folder
    txt_files = glob.glob(os.path.join(text_folder, "*.txt"))
    for txt_file in txt_files:
        # Extract the page number from the file name
        page_number = extract_page_number(txt_file)
        
        with open(txt_file, "r", encoding="utf-8") as f:
            text = f.read()
        
        # Chunk the text from the file
        chunks = chunk_text(text, max_length=500, overlap=50)
        
        # Compute embeddings for each chunk
        embeddings = model.encode(chunks, show_progress_bar=True).tolist()
        
        # Add each chunk with its metadata to the data list
        for idx, chunk in enumerate(chunks):
            data.append({
                "source_file": os.path.basename(txt_file),
                "page_number": page_number,
                "chunk_id": idx,
                "text_chunk": chunk,
                "embedding": embeddings[idx]
            })
    
    # Save the output to a JSON file
    with open(output_file, "w", encoding="utf-8") as out_f:
        json.dump(data, out_f, indent=2)
    
    print(f"Processed {len(txt_files)} files. Data saved to '{output_file}'")

In [9]:
# Lets use model to compute embedding
from sentence_transformers import SentenceTransformer
model_text = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
process_text_files_with_metadata("data/ncert/6/social/extracted/text",model_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 26 files. Data saved to 'text_chunks_with_embeddings.json'


In [11]:
# Now lets start processing images

import os
import glob
import json
import re
from PIL import Image

In [12]:
# extract page and index number for adding to metadata
def extract_page_and_index(filename):
    """
    Parses the filename to extract page number and image index.
    
    Expected filename format: page_{page_number}_img_{image_index}.{ext}
    
    Returns:
      (page_number, image_index) if available, otherwise (None, None).
    """
    base = os.path.basename(filename)
    # Use regex to extract page and image numbers
    match = re.match(r"page_(\d+)_img_(\d+)", base)
    if match:
        page_number = int(match.group(1))
        image_index = int(match.group(2))
        return page_number, image_index
    return None, None

In [13]:
def process_images_with_metadata(image_folder, model, output_file="image_embeddings_with_metadata.json"):
    """
    Processes images by:
      - Loading each image.
      - Computing its embedding using the provided model.
      - Extracting metadata (page number and image index) from the filename.
      - Saving the results in a JSON file.
    
    Args:
      image_folder (str): Folder containing the images.
      model: A SentenceTransformer model (e.g., CLIP-based) for image embeddings.
      output_file (str): Filename for saving the JSON output.
    """
    data = []
    # Adjust file extensions as needed
    image_files = glob.glob(os.path.join(image_folder, "*.*"))
    
    for image_file in image_files:
        try:
            # Open image and convert to RGB (required for models like CLIP)
            img = Image.open(image_file).convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_file}: {e}")
            continue
        
        # Generate image embedding using the model (expects list input)
        embedding = model.encode([img], convert_to_tensor=False)[0].tolist()
        
        # Extract page and image index from filename (if available)
        page_number, image_index = extract_page_and_index(image_file)
        
        # Create the metadata dictionary
        meta = {
            "image_file": os.path.basename(image_file),
            "embedding": embedding,
            "page_number": page_number,
            "image_index": image_index
        }
        
        data.append(meta)
    
    # Save the enriched image metadata to a JSON file
    with open(output_file, "w", encoding="utf-8") as out_f:
        json.dump(data, out_f, indent=2)
    
    print(f"Processed {len(image_files)} images and saved metadata to '{output_file}'")

In [14]:
model_image = SentenceTransformer('clip-ViT-B-32')

modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

0_CLIPModel/pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [15]:
process_images_with_metadata("data/ncert/6/social/extracted/images", model_image)

Processed 22 images and saved metadata to 'image_embeddings_with_metadata.json'


In [16]:
# Now lets link image and text
def link_text_and_images(text_json_file, image_json_file):
    """
    Loads text chunk and image embedding data from JSON files,
    groups them by page_number, and returns a combined dictionary.
    """
    with open(text_json_file, "r", encoding="utf-8") as f:
        text_data = json.load(f)
    with open(image_json_file, "r", encoding="utf-8") as f:
        image_data = json.load(f)
    
    # Group text chunks by page number
    text_by_page = {}
    for item in text_data:
        page = item.get("page_number")
        if page is not None:
            text_by_page.setdefault(page, []).append(item)
    
    # Group images by page number
    images_by_page = {}
    for item in image_data:
        page = item.get("page_number")
        if page is not None:
            images_by_page.setdefault(page, []).append(item)
    
    # Combine the groups: For every page, include both text chunks and images.
    linked_data = {}
    all_pages = set(text_by_page.keys()).union(images_by_page.keys())
    for page in all_pages:
        linked_data[page] = {
            "text_chunks": text_by_page.get(page, []),
            "images": images_by_page.get(page, [])
        }
    
    return linked_data

In [17]:
linked = link_text_and_images("text_chunks_with_embeddings.json", "image_embeddings_with_metadata.json")
print(json.dumps(linked, indent=2))

{
  "1": {
    "text_chunks": [
      {
        "source_file": "page_1.txt",
        "page_number": 1,
        "chunk_id": 0,
        "text_chunk": "1\nIntroduction \u2013 Why Social Science?\nIntroduction\nWhy Social Science?\nFamily \nand \nCommunity\nGovernance\nLocal Government \nin Rural Areas\nSocial \nScience\nThe Value \nof Work\nLocal Government \nin Urban Areas\nLocating \nPlaces on the \nEarth\nOceans and \nContinents\nTHEME E\nEconomic  \nLife Around  \nUs\nThe Beginnings of \nIndian Civilisation\nTHEME B\nTapestry \nof the Past\nEconomic \nActivities \nAround Us\nUnity in \nDiversity, \nor \u2018Many in \nthe One\u2019\nTHEME C\nOur Cultural \nHeritage and \nKnowledge \nTraditions\nLa",
        "embedding": [
          0.023390542715787888,
          0.006278308108448982,
          -0.06834099441766739,
          0.06300639361143112,
          0.033190466463565826,
          0.025606531649827957,
          -0.024194134399294853,
          -0.07373285293579102,
          -0

# Vector Database

In [19]:
import faiss
import json
import numpy as np

In [20]:
def build_faiss_index(json_file, embedding_field="embedding"):
    """
    Loads data from a JSON file where each item includes an embedding,
    builds a FAISS index, and returns both the index and the loaded data.

    Args:
      json_file (str): Path to the JSON file.
      embedding_field (str): The field key in the JSON items that contains the embedding.

    Returns:
      index (faiss.Index): The FAISS index.
      data (list): The JSON data as a list of dictionaries.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Extract embeddings from each item
    embeddings = [item[embedding_field] for item in data]
    embeddings_np = np.array(embeddings).astype("float32")
    
    # Determine the dimension of the embeddings
    dimension = embeddings_np.shape[1]
    
    # Create a flat (brute-force) L2 distance index
    index = faiss.IndexFlatL2(dimension)
    
    # Add all embeddings into the index
    index.add(embeddings_np)
    
    print(f"✅ Indexed {index.ntotal} text items with dimension {dimension}")
    return index, data

In [21]:
def build_faiss_index_for_images(json_file, embedding_field="embedding"):
    """
    Loads image data from a JSON file and builds a FAISS index for image embeddings.
    
    Args:
      json_file (str): Path to the JSON file containing image embeddings.
      embedding_field (str): The key that contains the embedding in each JSON object.
      
    Returns:
      index (faiss.Index): The FAISS index.
      data (list): The JSON data as a list of dictionaries.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    embeddings = [item[embedding_field] for item in data]
    embeddings_np = np.array(embeddings).astype("float32")
    dimension = embeddings_np.shape[1]
    
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)
    
    print(f"✅ Indexed {index.ntotal} image items with dimension {dimension}")
    return index, data

In [22]:
def search_text_index(query, index, metadata, model, top_k=5):
    """
    Searches the FAISS text index using a query string.
    
    Args:
      query (str): The user query.
      index (faiss.Index): The FAISS index for text embeddings.
      metadata (list): The list of metadata corresponding to the indexed items.
      model: The embedding model (SentenceTransformer) used to encode the query.
      top_k (int): The number of results to return.
      
    Returns:
      results (list): Top matching metadata items.
    """
    # Compute the query embedding
    query_embedding = model.encode([query]).astype("float32")
    
    # Search the FAISS index
    distances, indices = index.search(query_embedding, top_k)
    
    results = []
    for i in indices[0]:
        # Verify index validity and append corresponding metadata
        if i < len(metadata):
            results.append(metadata[i])
    return results

In [23]:
# Build the FAISS index for text embeddings.
text_index, text_metadata = build_faiss_index("text_chunks_with_embeddings.json")

✅ Indexed 83 text items with dimension 384


In [24]:
# Build the FAISS index for image embeddings.
image_index, image_metadata = build_faiss_index_for_images("image_embeddings_with_metadata.json")

✅ Indexed 22 image items with dimension 512


In [26]:
# Example usage (assuming model_text from Step 2 is already loaded):
query = "What is the Latitude ?"
results = search_text_index(query, text_index, text_metadata, model_text)
print("Top text results:")
for res in results:
    print(f"Page: {res.get('page_number')} | Chunk ID: {res.get('chunk_id')}")
    print(res.get("text_chunk")[:150] + "...\n")

Top text results:
Page: 16 | Chunk ID: 2
f the distance from the 
Equator if you travel towards one of the poles, longitude is 
a measure of the distance from the Prime Meridian if you 
trave...

Page: 20 | Chunk ID: 2
meridian.   
15°
9 AM
45° W
30° W
15° W
0°
15° E
30°E
45° E
10 AM
11 AM
12 PM
1 PM
2 PM
3 PM
Reprint 2025-26...

Page: 16 | Chunk ID: 1
easuring the time at that place. Let us see how. 
To measure longitudes, we need to define a reference point 
called the Prime Meridian (Fig. 1.3 on p...

Page: 16 | Chunk ID: 4
f latitude and the 
meridians of longitude together on the globe as blue lines. 
Reprint 2025-26...

Page: 15 | Chunk ID: 0
15
1 – Locating Places on the Earth
also why we experience different seasons in the course of 
a year.
 Fig. 1.3. This globe shows both parallels of l...



In [27]:
def link_text_with_images(text_result, image_metadata):
    """
    For a given text result, find all images on the same page.
    
    Args:
      text_result (dict): A single text chunk metadata item.
      image_metadata (list): List of image metadata items.
      
    Returns:
      List of image metadata items that have the same page_number.
    """
    page = text_result.get("page_number")
    linked_images = [img for img in image_metadata if img.get("page_number") == page]
    return linked_images

In [28]:
# For the first text result, get linked images:
if results:
    first_result = results[0]
    linked_images = link_text_with_images(first_result, image_metadata)
    print(f"Linked images on page {first_result.get('page_number')}:")
    for img in linked_images:
        print(img.get("image_file"))

Linked images on page 16:
page_16_img_1.png


In [29]:
from transformers import pipeline

In [30]:
# Initialize the text generation pipeline (adjust max_length as needed)
generator = pipeline("text-generation", model="gpt2", max_length=500)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [31]:
def load_text_model():
    return SentenceTransformer('all-MiniLM-L6-v2')

In [32]:
model_text = load_text_model()

In [33]:
def load_json(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        return json.load(f)

In [34]:
text_json_path = "text_chunks_with_embeddings.json"
text_data = load_json(text_json_path)

In [35]:
image_json_path = "image_embeddings_with_metadata.json"
image_data = load_json(image_json_path)

In [36]:
query = "What is the Latitude ?"
#input("Enter your query ?: ")


In [37]:
results = search_text_index(query, text_index, text_data, model_text, top_k=2)

In [38]:
def aggregate_context(results, image_data):
    context_text = ""
    context_images = ""
    for result in results:
        page = result.get("page_number")
        context_text += f"Page {page} - {result.get('text_chunk')}\n\n"
        linked_images = [img for img in image_data if img.get("page_number") == page]
        if linked_images:
            # For now, use image filenames as reference; later, replace with generated captions.
            context_images += f"Page {page} Images: " + ", ".join([img.get("image_file") for img in linked_images]) + "\n\n"
    return context_text.strip(), context_images.strip()

In [39]:
# Aggregate context from results
text_context, image_context = aggregate_context(results, image_data)

In [40]:
text_context

'Page 16 - f the distance from the \nEquator if you travel towards one of the poles, longitude is \na measure of the distance from the Prime Meridian if you \ntravel along the Equator. Longitude, too, is measured in \ndegrees. Westward or eastward, it increases in value from \n0° to 180°, with the letter ‘W’ or ‘E’ added. For instance, \nusing round figures, New York’s longitude is 74°W, while \nDelhi’s is 77°E and Tokyo’s is 140°E.\nDON’T MISS OUT\nAs you can see on the globe of meridians of longitudes, 180°W \nand\n\nPage 20 - meridian.   \n15°\n9 AM\n45° W\n30° W\n15° W\n0°\n15° E\n30°E\n45° E\n10 AM\n11 AM\n12 PM\n1 PM\n2 PM\n3 PM\nReprint 2025-26'

In [41]:
image_context

'Page 16 Images: page_16_img_1.png'

In [42]:
def construct_prompt(query, text_context, image_context):
    prompt = f"""
You are an expert tutor. Use the following textbook context to answer the query thoroughly.

Text Context:
{text_context}

Image Context:
{image_context}

Question: {query}

Answer:
    """
    return prompt

In [43]:
prompt = construct_prompt(query, text_context, image_context)

In [44]:
generator = pipeline("text-generation", model="gpt2", max_length=500)

Device set to use cpu


In [45]:
def generate_answer(prompt):
    response = generator(prompt, max_length=500, do_sample=True, temperature=0.1)
    # Extract the generated text from the response.
    return response[0]['generated_text']

In [46]:
answer = generate_answer(prompt)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [47]:
print(answer)


You are an expert tutor. Use the following textbook context to answer the query thoroughly.

Text Context:
Page 16 - f the distance from the 
Equator if you travel towards one of the poles, longitude is 
a measure of the distance from the Prime Meridian if you 
travel along the Equator. Longitude, too, is measured in 
degrees. Westward or eastward, it increases in value from 
0° to 180°, with the letter ‘W’ or ‘E’ added. For instance, 
using round figures, New York’s longitude is 74°W, while 
Delhi’s is 77°E and Tokyo’s is 140°E.
DON’T MISS OUT
As you can see on the globe of meridians of longitudes, 180°W 
and

Page 20 - meridian.   
15°
9 AM
45° W
30° W
15° W
0°
15° E
30°E
45° E
10 AM
11 AM
12 PM
1 PM
2 PM
3 PM
Reprint 2025-26

Image Context:
Page 16 Images: page_16_img_1.png

Question: What is the Latitude ?

Answer:
                                  

The Latitude is the distance from the 

Prime Meridian to the 

Equator. Longitude, too, is measured in 

degrees. Westward or eastw