### Imports

In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import json
import os

### Load and parse data

In [2]:
# Read the previously saved page from disk and parse it with the
# standard-library-backed 'html.parser' backend.
with open('../scraped_page.html', mode='r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(markup=html_content, features='html.parser')

In [3]:
# One <li> container per level section; the selector requires every listed
# CSS class to be present on the element.
li_elements = soup.select(
    'li.search-container_level.u-flex.u-flex_column.u-gap_16.u-border_rim.u-rounded.u-bg_bg-secondary'
)

# Level name -> list of grammar points, each level starting with its own
# empty list (insertion order: N5 ... N1, then Non-JLPT).
n_levels = {
    level_name: []
    for level_name in ('N5', 'N4', 'N3', 'N2', 'N1', 'Non-JLPT')
}


In [4]:
for element in li_elements:
    # The level name is read from the container's <h2> heading.
    # select_one() returns None when nothing matches, so skip containers
    # without a heading instead of failing on `.text`.
    heading = element.select_one('h2')
    if heading is None:
        continue
    level = heading.text.strip()

    # Only the predefined levels are collected; any other heading is ignored.
    if level not in n_levels:
        continue

    # Build one {grammar point name: relative href} mapping per list item
    # that carries a 'data-grammar-point' attribute.
    grammar_points = []
    for item in element.select('li[data-grammar-point]'):
        link = item.select_one('a')
        href = link.get('href') if link is not None else None
        if href is None:
            # No usable link inside this item; nothing to scrape for it.
            continue
        grammar_points.append({item['data-grammar-point']: href})

    # Assign (not append) so that re-running this cell does not duplicate
    # entries for a level.
    n_levels[level] = grammar_points

# Summarise how many grammar points were collected for each level.
for level, grammar_points in n_levels.items():
    print(f"Level {level} has {len(grammar_points)} grammar points")

Level N5 has 126 grammar points
Level N4 has 177 grammar points
Level N3 has 219 grammar points
Level N2 has 213 grammar points
Level N1 has 180 grammar points
Level Non-JLPT has 12 grammar points


In [5]:
# Absolute path of the directory holding the already-downloaded pages.
grammar_pages_path = os.path.abspath('../grammar_pages')

# `done` holds the part of each file name before its first '.'.
# The directory check guards os.listdir(): when the directory does not
# exist, nothing has been downloaded yet, so the list is simply empty
# instead of raising FileNotFoundError.
if os.path.isdir(grammar_pages_path):
    done = [s.split('.')[0] for s in os.listdir(grammar_pages_path)]
else:
    done = []

In [6]:
# NOTE(review): `to_pop` is never filled or read in the visible cells —
# confirm whether a later cell still needs it before removing it.
to_pop = []

# Print every collected grammar point together with its relative URL.
# Each entry of a level's list is a dict, so iterate its items directly;
# no positional counters are needed.
for level, grammar_points in n_levels.items():
    for grammar_point in grammar_points:
        for key, value in grammar_point.items():
            print(f"Level {level}: {key} - {value}")


                

Level N5: だ - /grammar_points/%E3%81%A0
Level N5: です - /grammar_points/%E3%81%A7%E3%81%99
Level N5: は - /grammar_points/%E3%81%AF
Level N5: も - /grammar_points/%E3%82%82
Level N5: これ - /grammar_points/%E3%81%93%E3%82%8C
Level N5: それ - /grammar_points/%E3%81%9D%E3%82%8C
Level N5: あれ - /grammar_points/%E3%81%82%E3%82%8C
Level N5: の - /grammar_points/%E3%81%AE
Level N5: いい - /grammar_points/%E3%81%84%E3%81%84
Level N5: い-Adjectives - /grammar_points/%E3%81%84-adjectives
Level N5: な-Adjectives - /grammar_points/%E3%81%AA-adjectives
Level N5: か - /grammar_points/%E3%81%8B
Level N5: が - /grammar_points/%E3%81%8C
Level N5: よ - /grammar_points/%E3%82%88
Level N5: ね - /grammar_points/%E3%81%AD
Level N5: る-Verb (Dictionary) - /grammar_points/%E3%82%8B-Verbs
Level N5: う-Verb (Dictionary) - /grammar_points/%E3%81%86-Verbs
Level N5: を - /grammar_points/%E3%82%92
Level N5: る-Verb (Negative) - /grammar_points/%E3%82%8Bverb-%E3%81%AA%E3%81%84
Level N5: う-Verb (Negative) - /grammar_points/%E3%81%86verb

In [7]:
# Persist the level -> grammar points mapping as indented JSON.
# ensure_ascii=False writes non-ASCII keys as literal characters rather
# than \uXXXX escapes.
with open('grammar_points.json', mode='w', encoding='utf-8') as file:
    file.write(json.dumps(n_levels, ensure_ascii=False, indent=4))

In [16]:
def get_scrape_urls(json_path: str, n_level: str, base_url: str = "https://bunpro.jp") -> list:
    """Build the absolute URLs to scrape for one level.

    Args:
        json_path: Path to a JSON file whose top-level object maps a level
            name to a list of ``{grammar_point: relative_url}`` objects.
        n_level: Key of the level to look up (e.g. ``"N5"``).
        base_url: Site root that is prepended to every relative URL. A
            trailing ``/`` is dropped so the result has no double slash.

    Returns:
        One absolute URL per mapping value, in file order. An empty list is
        returned when the file is missing, is not valid JSON, does not hold
        a JSON object at the top level, or has no entry for ``n_level``.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # Best-effort lookup: an unreadable source means "nothing to scrape".
        return []

    # Valid JSON may still be a list or scalar, which has no .get();
    # treat it like the other unusable-input cases.
    if not isinstance(data, dict):
        return []

    root = base_url.rstrip("/")
    return [
        root + url
        for grammar_point in data.get(n_level, [])
        for url in grammar_point.values()
    ]

In [17]:
# Read the saved mapping back into `data`.
with open('grammar_points.json', mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Preview the first five N5 URLs (get_scrape_urls reads the file itself).
get_scrape_urls('grammar_points.json', 'N5')[:5]

['https://bunpro.jp/grammar_points/%E3%81%A0',
 'https://bunpro.jp/grammar_points/%E3%81%A7%E3%81%99',
 'https://bunpro.jp/grammar_points/%E3%81%AF',
 'https://bunpro.jp/grammar_points/%E3%82%82',
 'https://bunpro.jp/grammar_points/%E3%81%93%E3%82%8C']