# Bunpro deck parser

In [None]:
import requests
from bs4 import BeautifulSoup

def parse_grammar_points(url: str, timeout: float = 30) -> list | None:
    """Scrape the deck / lesson / grammar-point hierarchy from an index page.

    Args:
        url: Address of the grammar point index page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list with one dict per deck, shaped as
        ``{"deck": <name>, "lessons": [{"lesson": <1-based position>,
        "title": <lesson title>, "items": [<last path segment of each tile
        link>, ...]}, ...]}``, or ``None`` when the response status is not 200.

    Raises:
        ValueError: If a deck contains a different number of lesson title
            blocks and lesson tile lists, so they cannot be paired up.
        requests.exceptions.RequestException: If the request itself fails
            (including exceeding ``timeout``).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    data = []
    for deck in soup.find_all("li", class_="search-container_level"):
        deck_name = deck.find("h2").get_text(strip=True)
        title_divs = deck.find_all("div", class_="index-lesson-data")
        tile_sets = deck.find_all("ul", class_="search-container_tiles")

        # Titles and tile lists are matched purely by position, so a count
        # mismatch would silently mislabel lessons. Raise explicitly instead
        # of using ``assert``, which is stripped when Python runs with -O.
        if len(title_divs) != len(tile_sets):
            raise ValueError(
                f"Deck {deck_name!r}: found {len(title_divs)} lesson titles "
                f"but {len(tile_sets)} tile lists"
            )

        lessons_data = []
        for lesson_number, (title_div, tile_set) in enumerate(
            zip(title_divs, tile_sets), start=1
        ):
            tiles = tile_set.find_all("li", class_="search-tile_index")
            lessons_data.append({
                "lesson": lesson_number,
                "title": title_div.find("span").get_text(strip=True),
                # Keep only the text after the last "/" of each tile link.
                "items": [
                    tile.find("a")["href"].rsplit("/", 1)[-1] for tile in tiles
                ],
            })

        data.append({
            "deck": deck_name,
            "lessons": lessons_data,
        })

    return data

# Download and parse the grammar point index once for the whole notebook.
# NOTE: `data` is None when the page did not come back with HTTP 200, and the
# later cells that iterate over `data` will then fail.
data = parse_grammar_points("https://bunpro.jp/grammar_points")

In [None]:
import json
import os


# Save the scraped deck structure to grammar_points.json.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("grammar_points.json", "w", encoding="utf-8") as f:
    json.dump({"data": data}, f, indent=4, sort_keys=True)

# JavaScript source that is later injected into every grammar page.
# Read it as UTF-8 explicitly: with the locale default (e.g. cp1252 on
# Windows) any non-ASCII character in the script could fail to decode or be
# silently mangled.
with open("./script.js", "r", encoding="utf-8") as f:
    script = f.read()

# Output directory for the generated PDFs; created on first run.
pdf_dir = "./pdf"
os.makedirs(pdf_dir, exist_ok=True)

In [None]:
import time
import base64
from tqdm import tqdm
from urllib.parse import unquote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

# Configure a headless Chrome session with a fixed 1920x1080 window and start
# the WebDriver that every later cell shares.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

# First argument passed to convertToPrintVersion() when the injected script
# call is assembled in the login cell.
scale = 0.75

# Parameters for the Chrome DevTools "Page.printToPDF" command.
# Paper size and margins are in inches per the DevTools protocol;
# 8.27 x 11.69 corresponds to an A4 sheet in portrait orientation.
print_options = dict(
    landscape=False,
    paperWidth=8.27,
    paperHeight=11.69,
    marginTop=0.25,
    marginBottom=0.25,
    marginLeft=0,
    marginRight=0,
    printBackground=False,
)

# Number of attempts process_item() makes per grammar point before giving up.
retries = 3


def remove_furigana(html_content):
    """Return the plain text of an HTML fragment with all <rt> elements removed.

    Every ``<rt>`` tag is deleted from the parsed fragment before its text is
    extracted, so the content of those tags does not appear in the result.
    """
    fragment = BeautifulSoup(html_content, "html.parser")

    # Calling a parsed tree directly is BeautifulSoup shorthand for find_all().
    for reading in fragment("rt"):
        reading.decompose()

    return fragment.get_text()


def _wait_or_fail(driver, condition, failure_message, timeout=30):
    """Wait until ``condition(driver)`` is truthy or raise ``Exception(failure_message)``."""
    try:
        WebDriverWait(driver, timeout).until(condition)
    except TimeoutException as err:
        raise Exception(failure_message) from err


def _spinner_is_fading_out(driver):
    """Tell whether the fullscreen loading spinner carries the fade-out class."""
    spinner = driver.find_element(By.CLASS_NAME, "LoadingSpinnerFullscreen")
    return "bp-fade-out-anim" in spinner.get_attribute("class")


def process_item(driver, item, retries, pdf_dir, script_to_execute, print_options):
    """Open one grammar point page, print it to PDF and return its metadata.

    Args:
        driver: Selenium WebDriver used for navigation and printing.
        item: Last URL path segment identifying the grammar point.
        retries: Maximum number of attempts for this item.
        pdf_dir: Directory the PDF file is written into.
        script_to_execute: JavaScript run on the page before printing.
        print_options: Parameters forwarded to the ``Page.printToPDF``
            DevTools command.

    Returns:
        A dict with the keys ``"name"``, ``"description"``, ``"filename"`` and
        ``"url"`` on success, or ``None`` when every attempt failed (also when
        ``retries`` is less than 1, in which case nothing is attempted).
    """
    url = "https://bunpro.jp/grammar_points/" + item
    for attempt in range(1, retries + 1):
        try:
            driver.get(url)

            _wait_or_fail(
                driver,
                lambda d: d.execute_script("return document.readyState")
                == "complete",
                "Page did not fully load within 30 seconds",
            )
            _wait_or_fail(
                driver,
                _spinner_is_fading_out,
                "Loading spinner did not disappear within 30 seconds",
            )

            driver.execute_script(script_to_execute)
            # Presumably gives the injected script time to finish reshaping
            # the page before it is read and printed.
            time.sleep(1)

            header = "#js-rev-header h1 "
            item_name = remove_furigana(
                driver.find_element(
                    By.CSS_SELECTOR, header + "span.text-primary-accent"
                ).get_attribute("outerHTML")
            )
            item_description = remove_furigana(
                driver.find_element(
                    By.CSS_SELECTOR, header + "span.mt-4"
                ).get_attribute("outerHTML")
            )

            # The item is percent-decoded so the file name is human readable.
            filename = f"{unquote(item)}.pdf"
            pdf = driver.execute_cdp_cmd("Page.printToPDF", print_options)

            # The DevTools command returns the document base64-encoded.
            with open(os.path.join(pdf_dir, filename), "wb") as fh:
                fh.write(base64.b64decode(pdf["data"]))

            return {
                "name": item_name,
                "description": item_description,
                "filename": filename,
                "url": url,
            }
        except Exception as e:
            # Report only the first line of the error. An exception with an
            # empty message has no lines at all, so fall back to its class
            # name instead of raising IndexError from inside this handler.
            message_lines = str(e).splitlines()
            reason = message_lines[0] if message_lines else type(e).__name__
            print(f"Failed to parse for: {url} ({attempt}/{retries}). {reason}")
            if attempt == retries:
                return None

            # Go back to the home page and pause before the next attempt.
            driver.get("https://bunpro.jp")
            time.sleep(5)

    return None

In [None]:
# OPTIONAL: sign in with a subscription account to get more example sentences.
# Leave both values empty to skip the login step entirely.
auth_email = ""
auth_password = ""

is_login_successfull = False

if not (auth_email and auth_password):
    print("Login skipped")
else:
    driver.get("https://bunpro.jp/login")
    time.sleep(10)

    # Fill the form in the same order as before: password first, then email.
    for field_id, field_value in (
        ("user_password", auth_password),
        ("user_email", auth_email),
    ):
        driver.find_element(By.ID, field_id).send_keys(field_value)
        time.sleep(0.2)

    driver.find_element(By.CSS_SELECTOR, "input[value='Log in']").click()
    time.sleep(10)

    # Landing exactly on the dashboard URL is treated as a successful login.
    is_login_successfull = driver.current_url == "https://bunpro.jp/dashboard"
    if is_login_successfull:
        print("Login successful")
    else:
        print("Login failed. Try to run this cell again")
        driver.get("https://bunpro.jp")

# Append the call that runs the injected script; its second argument is the
# JavaScript literal for whether the session is logged in.
logged_in_literal = "true" if is_login_successfull else "false"
script_to_execute = script + f"convertToPrintVersion({scale}, {logged_in_literal});"

In [None]:
# Walk every deck -> lesson -> item, print each item to PDF and collect the
# metadata returned by process_item(). Items that could not be processed are
# recorded in `failed_items` as (deck name, lesson number, item) tuples.
failed_items = []
data_extended = []

for deck in data:
    deck_name, deck_lessons = deck["deck"], deck["lessons"]
    total_items = sum(len(lesson["items"]) for lesson in deck_lessons)
    extended_lessons_data = []

    # One progress bar per deck, advanced once per item regardless of outcome.
    with tqdm(total=total_items, desc=f"Processing {deck_name}") as progress_bar:
        for lesson in deck_lessons:
            lesson_number = lesson["lesson"]
            items_extended = []

            for item in lesson["items"]:
                result = process_item(
                    driver,
                    item,
                    retries,
                    pdf_dir,
                    script_to_execute,
                    print_options,
                )

                if not result:
                    failed_items.append((deck_name, lesson_number, item))
                else:
                    items_extended.append(result)

                progress_bar.update(1)

            extended_lessons_data.append({
                "lesson_number": lesson_number,
                "title": lesson["title"],
                "items": items_extended
            })

    data_extended.append({
        "deck": deck_name,
        "lessons": extended_lessons_data
    })

In [None]:
# Write the collected per-item metadata to parsed_data.json
# (4-space indentation, keys sorted alphabetically).
with open("parsed_data.json", "w") as f:
    f.write(json.dumps({"data": data_extended}, indent=4, sort_keys=True))

In [None]:
# Shut down the WebDriver session started in the setup cell.
driver.quit()