In [2]:
import os
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
import google.generativeai as genai
import json
from dotenv import load_dotenv

# Load environment variables from a .env file; GEMINI_API_KEY is read from the
# environment when the Gemini client is configured.
load_dotenv()


True

In [10]:
# Path configuration. Each location may be overridden through an environment
# variable (or the .env file); the defaults are the original locations.
PDF_DIR = os.getenv("QUESTGEN_PDF_DIR", "/home/buma04/ai-questgen/data/pdf_files/sgk/")  # input PDF files
OUTPUT_IMG_DIR = os.getenv("QUESTGEN_IMG_DIR", "/home/buma04/ai-questgen/data/images_files/sgk/")  # rendered page images
RESULT_DIR = os.getenv("QUESTGEN_RESULT_DIR", "/home/buma04/ai-questgen/data/json_format")  # extracted JSON results

# Create the directories if they do not exist yet
for dir_path in [PDF_DIR, OUTPUT_IMG_DIR, RESULT_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# Gemini model used for page extraction
MODEL_NAME = 'gemini-2.0-flash-thinking-exp-01-21'

# Initialise Gemini. Fail fast with a clear message when no key is available
# instead of failing later at the first API call. GOOGLE_API_KEY is accepted
# too because the client library falls back to it when api_key is None.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
if not (GEMINI_API_KEY or os.getenv('GOOGLE_API_KEY')):
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to the environment or to the .env file "
        "before running this notebook."
    )
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

In [4]:
# System prompt sent with every page image. It is a raw string on purpose:
# the `\n` / `\t` sequences below must reach the model as literal backslash
# escapes. In a normal string they turn into real newline/tab characters, which
# garbles the guidelines and makes the JSON examples invalid (raw control
# characters inside JSON strings). The prompt ends with a newline so the
# previous section title can be appended directly after it.
system_instruction_prompt = r"""
You are an expert in extracting structured data from book page images and converting it into a machine-readable JSON format optimized for retrieval-augmented generation (RAG).  

# **Task Description**  
Extract structured text from a scanned book page while maintaining logical organization, readability, and chunking optimized for vector search. Ensure each chunk contains a complete and meaningful unit of information.  

# **Extraction Guidelines**
  
## 1 **Preserve Text Structure & Chunking**  
- Extract all visible text while maintaining **headings, subheadings, paragraphs, and dialogues**.  
- **Chunk text logically**:  
  - Each chunk must be a self-contained unit (e.g., a paragraph, a list, or a dialogue turn).  
  - Avoid overly large chunks that contain multiple ideas.  
  - Preserve the **natural reading flow**.  
  - Group all contents in structured elements (tables, side notes, boxes, examples, etc.) into a single chunk, preserve the structure as text while maintaining column and row alignment. The text should reflect the table's structure, with rows separated by newline (`\n`) and columns separated by a delimiter (e.g., tab `\t` or pipe `|`).
  - If a task instruction (exercise prompt) directly refers to a text, keep both in the same chunk.

## 2 **Identify & Classify Page Type**  
- `cover_page`: Only contains the book title.  
- `table_of_contents`: Contains unit titles and corresponding page numbers.  
- `unit_start`: Introduces a new unit (extract `unit_title`).  
- `unit_content`: Belongs to a unit but does not introduce it (exclude `unit_title`). 

## 3 **Section Title Assignment**  
- Each unit follows a fixed structure:  
  `["I. GETTING STARTED", "II. LANGUAGE", "III. READING", "IV. SPEAKING", "V. LISTENING", "VI. WRITING", "VII. COMMUNICATION AND CULTURE/CLIL", "VIII. LOOKING BACK", "PROJECT"]`.  
- If a page does not explicitly contain a section title:  
  - **Inherit** `previous_section_title` as `section_title`.  
  - **Infer** the most relevant title based on context.  

## 4 **Chunking Rules & Metadata**  
Each extracted unit of text (chunk) must be stored as:  
```json
{
    "id": "Unique identifier for each chunk",
    "unit": "Unit title",
    "section": "Section title",
    "type": "text | dialogue | image | list",
    "content": "Extracted text (escaped properly for JSON)",
    "metadata": {
        "page": "Page number",
        "chunk_type": "overview | paragraph | dialogue | list",
        "speaker": "Speaker name (if applicable)",
        "related_chunks": "Comma-separated string of related chunk IDs"
    }
}
```

## 5 **Dialogue Formatting**  
- Extract full dialogues as a single chunk instead of splitting into individual turns.
- Include all speaker names and their lines in the same chunk, formatted clearly for readability.
```json
{
    "type": "dialogue",
    "speaker": "Mark",
    "content": "Mark: Hi, Nam. Your book must be very interesting. What are you reading?\nNam: I’m reading a book called *The Diary of Dang Thuy Tram*.\nMark: Dang Thuy Tram? Who is she?\nNam: She was born in Hue in 1942.",
    "metadata": {
        "page": 8,
        "chunk_type": "dialogue",
        "related_chunks": "unit_1_page_8_3"
    }
}
```
## 6 **Handling Related Chunks**  
Each chunk should include "related_chunks" in metadata to ensure proper retrieval in multi-turn queries:

- For dialogues, store the entire conversation as a single chunk.
- Only link related non-dialogue chunks that provide background information.
- For paragraphs, related chunks should:
  - Reference other chunks discussing the same topic or entity.
  - Link supporting details to summary sentences.

## 7 **Handling Special Text Features**  
- Lists, bullet points, and bold text must be retained.
- Newline and tab characters must be escaped (\n, \t).
- Images or captions should be extracted with a description.

## 8 **Language & Multilingual Support**  
Maintain the original English and Vietnamese text without translation.

# **JSON Output Structure**
[
  {
    "id": "unit_1_page_8_1",
    "unit": "Life stories we admire",
    "section": "Getting Started",
    "type": "text",
    "content": "This unit includes: LANGUAGE - Pronunciation: Diphthongs /eɪ/ and /aʊ/, Vocabulary: Phrases related to life stories...",
    "metadata": {
      "page": 8,
      "chunk_type": "overview"
    }
  },
  {
    "id": "unit_1_page_8_5",
    "unit": "Life stories we admire",
    "section": "Getting Started",
    "type": "dialogue",
    "content": "Mark: Hi, Nam. Your book must be very interesting. What are you reading?\nNam: I’m reading a book called *The Diary of Dang Thuy Tram*.\nMark: Dang Thuy Tram? Who is she?\nNam: She was born in Hue in 1942.",
    "metadata": {
      "page": 8,
      "chunk_type": "dialogue"
    }
  }
]

Additional Requirements
- Accurate text extraction (English & Vietnamese).
- Chunking follows a logical, retrievable structure.
- No summarization or paraphrasing—text must be extracted exactly as shown.
- Consistent metadata tagging to improve searchability.
- Maintain reading order and logical flow.

Section from Previous Page:
"""

In [5]:
# Convert PDF files to page images
def convert_pdf_to_images(pdf_directory, output_dir, max_size=(768, 768), thread_count=15):
    """Render every page of every PDF in ``pdf_directory`` to a PNG file.

    Each PDF gets its own sub-directory of ``output_dir`` named after the PDF
    (without extension); pages are saved there as ``page-001.png``,
    ``page-002.png``, ...

    Args:
        pdf_directory: Directory that is scanned (non-recursively) for PDFs.
        output_dir: Directory under which the per-PDF image folders are created.
        max_size: ``(width, height)`` bounding box each page is shrunk to fit,
            preserving aspect ratio.
        thread_count: Number of threads passed to ``convert_from_path``.

    Returns:
        List of the written image paths, ordered by PDF file name and then by
        page number.
    """
    pages_png = []

    # os.listdir() returns entries in arbitrary order; sort so the result is
    # reproducible from run to run.
    for pdf_file in sorted(os.listdir(pdf_directory)):
        # Match the extension case-insensitively so "BOOK.PDF" is not skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_directory, pdf_file)
        if not os.path.isfile(pdf_path):
            continue

        # Render all pages of the PDF
        images = convert_from_path(pdf_path, use_pdftocairo=False, thread_count=thread_count)

        # One output folder per PDF
        pdf_output_dir = os.path.join(output_dir, os.path.splitext(pdf_file)[0])
        os.makedirs(pdf_output_dir, exist_ok=True)

        # Save each page as an image file (page numbers are 1-based, zero-padded)
        for page_num, image in enumerate(images, start=1):
            full_path = os.path.join(pdf_output_dir, f"page-{page_num:03d}.png")
            image.thumbnail(max_size, Image.Resampling.LANCZOS)
            image.save(full_path)
            pages_png.append(full_path)

    return pages_png

In [6]:
# Extract structured page content from an image using Gemini
previous_section_title = None  # section title of the last parsed page, carried over between calls


def _strip_code_fence(text):
    """Return the payload of a Markdown code fence, or ``text`` stripped if there is none."""
    cleaned = text.strip()
    first = cleaned.find("```")
    if first == -1:
        return cleaned
    last = cleaned.rfind("```")
    if last > first:
        # Keep only what sits between the opening and the closing fence; this
        # also drops any prose the model wrote around the fenced block.
        body = cleaned[first + 3:last]
    else:
        # Only one fence (e.g. truncated output): just remove it.
        body = cleaned.replace("```", "", 1)
    body = body.strip()
    # Drop the optional language tag; valid JSON never starts with "json".
    if body[:4].lower() == "json":
        body = body[4:]
    return body.strip()


def extract_page_content(image_path, model, system_prompt):
    """Send one page image to Gemini, parse the JSON answer and save it.

    The prompt is ``system_prompt`` followed by the section title remembered
    from the previous call (module-level ``previous_section_title``), or by
    "No previous page available" when there is none. After a successful parse
    the ``section`` of the last returned chunk is remembered for the next call.

    Args:
        image_path: Path of the page image to process.
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        system_prompt: Instruction text placed before the previous section title.

    Returns:
        The parsed JSON value returned by the model.

    Raises:
        json.JSONDecodeError: If the model answer is not valid JSON.
    """
    global previous_section_title  # shared between calls to carry page context

    # Build the prompt with the context from the previous page
    context = previous_section_title or "No previous page available"
    current_prompt = system_prompt + context

    # Open the image in a context manager so the file handle is released
    with Image.open(image_path) as image:
        image.thumbnail((768, 768), Image.Resampling.LANCZOS)
        # Call the Gemini API
        response = model.generate_content([
            current_prompt,
            image
        ])

    # str.strip("```json") strips a *set of characters*, not a prefix, and left
    # the closing fence in place whenever the answer ended with a newline or
    # was surrounded by prose; remove the fence explicitly instead.
    clean_output = _strip_code_fence(response.text)

    # Parse JSON response
    result = json.loads(clean_output)

    # Remember this page's last section for the next page; keep the old value
    # when the answer is empty or the last chunk has no usable section.
    if isinstance(result, list) and result and isinstance(result[-1], dict):
        section = result[-1].get('section')
        if isinstance(section, str) and section:
            previous_section_title = section

    # Save the result next to the other pages.
    # NOTE(review): the file name is derived from the image base name only, so
    # pages of different PDFs (each has its own page-001.png) overwrite each
    # other in RESULT_DIR — confirm whether per-PDF sub-folders are wanted.
    output_file = os.path.join(
        RESULT_DIR,
        os.path.splitext(os.path.basename(image_path))[0] + '.json'
    )

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result

In [11]:
# Cell 6: Run the whole pipeline
# 1. Convert PDFs to images
# print("Converting PDFs to images...")
# image_paths = convert_pdf_to_images(PDF_DIR, OUTPUT_IMG_DIR)
# print(f"Generated {len(image_paths)} images")
def get_existing_images(output_dir):
    """Return the sorted paths of every ``.png`` file found under ``output_dir``."""
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(output_dir)
        for name in names
        if name.endswith('.png')
    ]
    found.sort()
    return found

# Reuse page images left by a previous run; render the PDFs only when none exist
image_paths = get_existing_images(OUTPUT_IMG_DIR)

if image_paths:
    print(f"Found {len(image_paths)} existing images")
else:
    # No images yet: convert the PDFs
    print("No existing images found. Converting PDFs to images...")
    image_paths = convert_pdf_to_images(PDF_DIR, OUTPUT_IMG_DIR)
    print(f"Generated {len(image_paths)} images")


# # 2. Extract content from each image
# print("\nExtracting content from images...")
# results = []
# import time

# for image_path in image_paths:
#     while True:
#         print(f"Processing {image_path}...")
#         result = extract_page_content(image_path, model, system_instruction_prompt)
#         if result:
#             results.append(result)
#         break  # Break the while loop if successful


# print(f"\nProcessed {len(results)} pages successfully")

No existing images found. Converting PDFs to images...
Generated 33 images
