# PDF Builder

In [None]:
import os
import json
from PyPDF2 import PdfReader, PdfWriter, PageObject
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from io import BytesIO


# Font face name -> TrueType file registered with reportlab under that name.
_FONT_FILES = (
    ("NotoSansJP-Regular", "./fonts/NotoSansJP-Regular.ttf"),
    ("NotoSansJP-Bold", "./fonts/NotoSansJP-Bold.ttf"),
)

for _face_name, _font_path in _FONT_FILES:
    pdfmetrics.registerFont(TTFont(_face_name, _font_path))

# Directory that the per-item PDF file names are resolved against.
pdf_dir = "./pdf"

# Load the parsed JSON description that drives the rest of the notebook.
with open("parsed_data.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [None]:
def create_text_page(
    text: str,
    description: str = None,
    page_size: tuple[float, float] = A4,
) -> PageObject:
    """Render a one-page divider PDF in memory and return its single page.

    The whole page is filled with a dark colour. ``text`` is drawn in white,
    horizontally centred, two thirds of the way up the page using the bold
    face at size 36. If ``description`` is truthy it is drawn centred below
    the title using the regular face at size 16.

    Args:
        text: Title drawn on the page.
        description: Optional subtitle; nothing is drawn when it is falsy.
        page_size: ``(width, height)`` of the page; defaults to A4.

    Returns:
        The first (and only) page of the generated PDF.
    """
    buffer = BytesIO()
    pdf_canvas = canvas.Canvas(buffer, pagesize=page_size)
    page_width, page_height = page_size

    # Anchor coordinates shared by the title and the optional subtitle.
    center_x = page_width / 2
    title_y = 2 * page_height / 3

    # Background fill covering the full page.
    pdf_canvas.setFillColorRGB(8/255, 38/255, 48/255)
    pdf_canvas.rect(0, 0, page_width, page_height, fill=True)

    # Title in white.
    pdf_canvas.setFillColorRGB(1, 1, 1)
    pdf_canvas.setFont("NotoSansJP-Bold", 36)
    pdf_canvas.drawCentredString(center_x, title_y, text)

    if description:
        # Subtitle sits 1.5 title font sizes below the title baseline.
        pdf_canvas.setFont("NotoSansJP-Regular", 16)
        pdf_canvas.drawCentredString(center_x, title_y - 36 * 1.5, description)

    pdf_canvas.save()

    # Rewind so the reader parses the PDF from its first byte.
    buffer.seek(0)
    return PdfReader(buffer).pages[0]


# Assemble the combined document: the title page, the structure page, then
# for every deck a divider page, for every lesson an optional divider page,
# and finally the pages of each item's own PDF.
pdf_writer = PdfWriter()
pdf_reader = PdfReader("./title-page.pdf")
pdf_writer.add_page(pdf_reader.pages[0])

pdf_reader = PdfReader("./structure.pdf")
structure_page = pdf_reader.pages[0]

pdf_writer.add_page(structure_page)
pdf_writer.add_outline_item("Structure", 1)

# Index of the next page to be appended (title page is 0, structure page is 1).
# It must be advanced exactly once per page actually added to the writer.
page_counter = 2
bookmarks = []
# Item URL -> index of the first page of that item inside the combined PDF.
item_page_by_url: dict[str, int] = {}

# Decks whose lessons get no divider page; their items are listed directly
# under the deck's outline entry.
decks_to_skip_lessons_pages = {"Non-JLPT"}

for entry in data["data"]:
    deck_name = entry["deck"]
    deck_page = create_text_page(f"{deck_name} Deck")
    pdf_writer.add_page(deck_page)
    deck_outline = pdf_writer.add_outline_item(f"{deck_name} Deck", page_counter)
    skip_lesson_pages = deck_name in decks_to_skip_lessons_pages
    page_counter += 1

    for lesson in entry["lessons"]:
        title = lesson["title"]
        # Items hang off the deck unless this lesson gets its own outline entry.
        item_parent_outline = deck_outline

        if not skip_lesson_pages:
            lesson_number = lesson["lesson_number"]
            lesson_page = create_text_page(f"Lesson {lesson_number}", title)
            pdf_writer.add_page(lesson_page)
            lesson_outline = pdf_writer.add_outline_item(
                f"{lesson_number}. {title}" if title else str(lesson_number),
                page_counter,
                deck_outline,
            )
            item_parent_outline = lesson_outline
            # Advance only when a divider page was really appended; counting
            # a skipped page would shift every later bookmark and link target.
            page_counter += 1

        for item in lesson["items"]:
            pdf_filename = item["filename"]
            pdf_filepath = os.path.join(pdf_dir, pdf_filename)

            item_name = item["name"]
            description = item["description"]
            item_url = item["url"]

            if not os.path.exists(pdf_filepath):
                print(f"Pdf file {pdf_filename} not found for item: {item_url}")
                continue

            pdf_reader = PdfReader(pdf_filepath)

            # Register the URL only for items that are present in the output,
            # so links to missing items are not redirected to an unrelated page.
            item_page_by_url[item_url] = page_counter

            for page in pdf_reader.pages:
                pdf_writer.add_page(page)

            pdf_writer.add_outline_item(
                f"{item_name}: {description}", page_counter, item_parent_outline
            )

            page_counter += len(pdf_reader.pages)

In [None]:
# Serialize everything collected in the writer to a single file on disk.
output_filename = "combined.pdf"

with open(output_filename, mode="wb") as combined_stream:
    pdf_writer.write(combined_stream)

print(f"PDF saved as {output_filename}")

In [None]:
import gc

# Unbind the objects left over from building the combined PDF so the garbage
# collector can reclaim them. Names are popped rather than `del`-ed so this
# cell does not raise NameError when one of them is not bound (for example
# `page` when no item PDF was found, or when the cell is run a second time).
for _stale_name in ("pdf_writer", "pdf_reader", "structure_page", "page"):
    globals().pop(_stale_name, None)
del _stale_name

gc.collect()

## Replace Bunpro links with internal PDF links

In [None]:
import requests
from bs4 import BeautifulSoup

def parse_grammar_point_ids(url: str, timeout: float = 30) -> dict[str, str] | None:
    """Scrape a grammar index page into an id -> link-slug mapping.

    Every ``<li class="search-tile_index">`` element contributes one entry.
    The key is the element's ``id`` attribute with a leading
    ``grammar-point-id-`` removed; the value is the part of the first
    ``href`` inside the element that follows the last ``/``. Elements
    without a link are reported on stdout and skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The mapping, or ``None`` when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    id_prefix = "grammar-point-id-"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    slug_by_id: dict[str, str] = {}

    for item in soup.find_all("li", class_="search-tile_index"):
        # Ids that lack the prefix are kept unchanged.
        item_id = item.get("id", "").removeprefix(id_prefix)

        link_tag = item.find("a", href=True)
        if link_tag is None:
            print(f"Missing url for item with id: {item_id}")
            continue

        # Keep only the last path segment of the link.
        slug_by_id[item_id] = link_tag["href"].rsplit("/", 1)[-1]

    return slug_by_id

# Mapping built from the grammar index page; it is None when the request
# does not return HTTP 200.
grammar_points_ids = parse_grammar_point_ids("https://bunpro.jp/grammar_points")

In [None]:
import fitz # PyMuPDF

# Rewrite external grammar-point links in the combined PDF into internal
# "go to page" links wherever the target item is part of the document.
output_filename_2 = "replaced_links.pdf"
grammar_points_prefix = "https://bunpro.jp/grammar_points/"

# The id lookup is None when the index page could not be downloaded; fall
# back to an empty mapping so the other lookup strategies still run.
grammar_point_slugs = grammar_points_ids or {}

pdf_document = fitz.open(output_filename)

try:
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        links = page.get_links()

        for link in links:
            uri = link.get("uri")
            if not uri or not uri.startswith(grammar_points_prefix):
                continue

            # 1) Exact match on the URL recorded while building the PDF.
            target_page = item_page_by_url.get(uri)

            # 2) Replace '[' and ']' with %5B and %5D and retry.
            if target_page is None and ("[" in uri or "]" in uri):
                modified_uri = uri.replace("[", "%5B").replace("]", "%5D")
                target_page = item_page_by_url.get(modified_uri)

            # 3) Treat the last path segment as a key of the id lookup and
            #    retry with the slug it maps to.
            if target_page is None:
                item_path = uri[uri.rfind("/") + 1 :]
                item_path_from_id = grammar_point_slugs.get(item_path)
                if item_path_from_id:
                    uri = grammar_points_prefix + item_path_from_id
                    target_page = item_page_by_url.get(uri)

            if target_page is None:
                print(f"Can not resolve link to item: {uri}")
                continue

            # Links that already point at their own page are left untouched.
            if target_page != page_num:
                page.delete_link(link)
                page.insert_link(
                    {
                        "kind": fitz.LINK_GOTO,
                        "page": target_page,
                        "from": link["from"],
                    }
                )

    pdf_document.save(output_filename_2)
finally:
    # Release the document even if rewriting or saving fails.
    pdf_document.close()

print(f"PDF saved as {output_filename_2}")

## Linearize a PDF to enable faster loading (Optional)

In [None]:
import pikepdf

# Re-save the link-rewritten PDF in linearized form under its final name.
output_filename_3 = "Bunpro-Grammar-Book.pdf"

with pikepdf.open(output_filename_2) as source_pdf:
    source_pdf.save(output_filename_3, linearize=True)

print(f"PDF saved as {output_filename_3}")