### Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [45]:
import json
import re
from bs4 import BeautifulSoup
import os
import pandas as pd
from dictionary_construction.const import FIX_POS

### Path to HTML files

In [46]:
# Directory holding the saved Bunpro grammar pages (relative to the notebook).
path_to_html = r'grammar_pages'
# os.listdir returns entries in arbitrary order; sort so that positions in
# pages_list (and the row order of frames built from it) are reproducible.
pages_list = sorted(os.listdir(path_to_html))

### Functions

In [47]:
def bunpro_adj_options(page_list: list) -> list:
    """Collect the distinct "Part of Speech" labels used across saved pages.

    Args:
        page_list: File names (relative to the module-level ``path_to_html``)
            of the HTML pages to scan.

    Returns:
        The unique label texts, sorted alphabetically.
    """
    adj_options = set()
    for page in page_list:
        # Close the file as soon as it has been parsed instead of leaking the handle.
        with open(os.path.join(path_to_html, page), 'r', encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
        title = soup.select('ul h4')
        labels = soup.select('ul > li > p')
        # Headings and values are paired purely by position in the two result lists.
        for t, l in zip(title, labels):
            if t.get_text(strip=True) == "Part of Speech":
                adj_options.add(l.get_text(strip=True))
    return sorted(adj_options)

In [48]:
def determaine_reading(soup: BeautifulSoup) -> str:
    """Return the reading shown in the page's main heading.

    Uses the text of the first ``<rt>`` element inside the ``<h1>`` when one
    exists; otherwise falls back to the full ``<h1>`` text.

    Args:
        soup: Parsed grammar page.

    Returns:
        The whitespace-stripped reading text.

    Raises:
        AttributeError: If the page has no ``<h1>`` element at all.
    """
    # Explicit None check instead of a bare `except:`, so unrelated errors
    # are no longer silently swallowed.
    furigana = soup.select_one('h1 rt')
    if furigana is not None:
        # NOTE(review): only the first <rt> is used; headings with several
        # annotated kanji would yield a partial reading - confirm.
        return furigana.get_text(strip=True)
    return soup.select_one('h1').get_text(strip=True)

In [49]:
def extract_explination(soup: BeautifulSoup) -> str:
    """Return the write-up text of a grammar page without its example sentences.

    The write-up is the ``div`` whose class attribute is exactly
    ``'bp-ddw bp-writeup-body prose'``. Elements carrying one of the example
    classes are removed from that div in place, so the passed ``soup`` is
    modified by this call.

    Returns:
        The remaining text joined with single spaces, or ``None`` when the
        write-up div is not present.
    """
    writeup = soup.find('div', class_='bp-ddw bp-writeup-body prose')
    if writeup is None:
        return None

    # Strip the Japanese/English example blocks so only the explanation is left.
    example_classes = ['writeup-example--japanese', 'writeup-example--english']
    for node in writeup.find_all(class_=example_classes):
        node.decompose()

    return writeup.get_text(separator=' ', strip=True)

In [50]:
def determine_pos(soup: BeautifulSoup) -> str:
    """Return the part-of-speech tag for a grammar page.

    Finds the "Part of Speech" heading on the page, reads the label paired
    with it and translates that label through ``FIX_POS``.

    Args:
        soup: Parsed grammar page.

    Returns:
        The mapped tag, or ``"Unknown"`` when the page has no
        "Part of Speech" heading.

    Raises:
        ValueError: If the page's label has no (non-empty) entry in ``FIX_POS``.
    """
    headings = soup.select('ul h4')
    values = soup.select('ul > li > p')
    # Headings and values are paired purely by position in the two result lists.
    for heading, value in zip(headings, values):
        if heading.get_text(strip=True) != "Part of Speech":
            continue
        bunpro_pos = value.get_text(strip=True)
        mapped = FIX_POS.get(bunpro_pos)
        if not mapped:
            # Report the label found on the page, not the failed lookup result
            # (previously the message always read "Unknown POS: None").
            raise ValueError(f"Unknown POS: {bunpro_pos}")
        return mapped
    return "Unknown"

In [51]:
def JLPT_level(soup: BeautifulSoup) -> str:
    """Extract the JLPT level (``'N5'`` ... ``'N0'``) from the page ``<title>``.

    Args:
        soup: Parsed grammar page whose title contains e.g. ``"JLPT N4"``.

    Returns:
        One of ``'N5'``, ``'N4'``, ``'N3'``, ``'N2'``, ``'N1'``, ``'N0'``.

    Raises:
        ValueError: If the page has no title or no recognisable level in it.
    """
    title_tag = soup.find('title')
    if title_tag is None:
        raise ValueError("Unknown JLPT level: page has no <title>")
    title = title_tag.get_text(strip=True)
    # Match the level token directly. The previous str.strip(') | Bunpro')
    # removed a *set of characters* rather than a suffix, and a title without
    # "JLPT" raised IndexError instead of a meaningful error.
    match = re.search(r'JLPT\W*(N[0-5])(?!\d)', title)
    if match is None:
        raise ValueError(f"Unknown JLPT level: could not parse {title!r}")
    return match.group(1)

In [52]:
def remove_latin_chars(text):
    """Return ``text`` with every ASCII letter (a-z, A-Z) and comma removed.

    All other characters, including spaces and full-width letters, are kept.
    """
    ascii_letters_and_commas = re.compile(r'[a-zA-Z,]')
    return ascii_letters_and_commas.sub('', text)

In [53]:
def contains_kanji(text):
    """Tell whether ``text`` has at least one character in U+4E00-U+9FFF.

    The argument is converted with ``str()`` first, so non-string values such
    as NaN are accepted (and simply contain no kanji).
    """
    return re.search(r'[\u4E00-\u9FFF]', str(text)) is not None

In [54]:
def compose_entry(subject, reading, part_of_speech, definition, explanation, matchup=10, jlpt="N5") -> list:
    """Assemble one dictionary entry as a positional list.

    NOTE(review): the 8-slot layout looks like a Yomichan/Yomitan term-bank
    row (term, reading, tags, rules, score, glossary, sequence, term tags) -
    confirm against the schema before relying on the slot meanings.

    Args:
        subject: Headword placed in slot 0.
        reading: Reading placed in slot 1.
        part_of_speech: Tag written to both slot 2 and slot 3.
        definition: Content of the "Meaning" section.
        explanation: Content of the "Explanation" section.
        matchup: Numeric value placed in slot 4.
        jlpt: JLPT level placed in the last slot.

    Returns:
        The entry as a list (not a string), ready for JSON serialisation.
    """
    # The example-sentence list is still hard-coded placeholder content.
    example_sentences = {
        "tag": "ol",
        "content": [
            {"tag": "li", "style": {"listStyleType": "'①'"}, "content": "例文 1\nSentence 1"},
            {"tag": "li", "style": {"listStyleType": "'②'"}, "content": "Sentence 2"}
        ]
    }
    glossary = [
        {"type": "structured-content", "content": [
            "【 Meaning 】",
            {
                "tag": "div",
                "style": {"marginLeft": 1},
                "content": definition
            },
            # Heading spelling fixed (was "Explination") since it is shown to users.
            "【 Explanation 】",
            {
                "tag": "div",
                "style": {"marginLeft": 1},
                "content": explanation
            },
            "【 Example sentences 】",
            example_sentences
        ]}
    ]
    data = [
        subject,           # Kanji
        reading,           # Kana
        part_of_speech,    # Part of speech 1
        part_of_speech,    # Part of speech 2
        matchup,
        glossary,
        1,                 # Some boolean flag
        jlpt               # JLPT Level
    ]
    return data

In [55]:
def split_and_duplicate_rows(row):
    """Expand one row into a frame with one row per '・'-separated subject.

    Every piece of ``row['subject']`` becomes its own row; the other five
    fields are repeated unchanged for each piece.
    """
    variants = row['subject'].split('・')
    n_variants = len(variants)

    columns = {'subject': variants}
    for name in ('part_of_speech', 'definition', 'explanation', 'JLPT', 'contains_kanji'):
        columns[name] = [row[name]] * n_variants
    return pd.DataFrame(columns)

Used to create a mapping from Bunpro to Yomichan POS. Accessing all the pages takes a while, so the results are saved in a dictionary in const.py.

In [56]:
# # Used to create a mapping from bunpro to yomichan POS
# # results are saved in a dictionary in const.py
# bun_opt = bunpro_adj_options(pages_list)
# yomi_pos = ['adj-na','adj-na', 'adv', 'aux-v', 'prt', 'exp', 'adj-na', 'n' , 'prt', 'pn', 'v-unspec']
# fix_pos = {key: value for key, value in zip(bun_opt, yomi_pos)}
# print(fix_pos)
# assert len(bun_opt) == len(yomi_pos), "Lengths of bunpro and yomichan POS lists are not equal"

Pick an example page to find the location of information with bs4 (BeautifulSoup).

In [None]:
page_of_interest = '-%E3%82%93%E3%81%A7%E3%81%99-%E3%81%AE%E3%81%A7%E3%81%99.html'
# Position of the example page inside pages_list; stays 0 when it is not listed.
index = pages_list.index(page_of_interest) if page_of_interest in pages_list else 0
print(index)

In [58]:
# Read page with BeautifulSoup; the context manager closes the file once parsed
with open(os.path.join(path_to_html, page_of_interest), 'r', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

In [None]:
# Canonical URL from the page <head>, used to link an entry back to its source page
canonical_link = soup.select_one('head > link[rel="canonical"]')
url = canonical_link['href']
url

In [None]:
# The <h1> text is taken as the grammar topic; there may be a better source for it
heading = soup.find('h1')
subject = heading.get_text(strip=True)
print(subject)

In [None]:
# Short definition: text of the first <p> with class "line-clamp-1"
definition_tag = soup.select_one('p.line-clamp-1')
simple_def = definition_tag.get_text(strip=True)
print(f"A simple definition of the grammar point is: {simple_def}")

# Long-form write-up with the example sentences stripped out
explination = extract_explination(soup)
print(f"The explination of the grammar point is: {explination}")

Creating an example entry in a dict. This dict can be passed straight to a JSON processing function or into a pandas DataFrame for data analysis.

In [None]:
# Gather every field of the example page into one record, key by key
entry_contents = {}
entry_contents["subject"] = soup.find('h1').get_text(strip=True)
entry_contents["reading"] = determaine_reading(soup)
entry_contents["part_of_speech"] = determine_pos(soup)
entry_contents["definition"] = soup.select_one('p.line-clamp-1').get_text(strip=True)
entry_contents["explanation"] = extract_explination(soup)
entry_contents["link"] = soup.select_one('head > link[rel="canonical"]')['href']
print(entry_contents)

In [None]:
# Extract the JLPT level of the example page from its <title> text
jlpt = JLPT_level(soup)
print(jlpt)

Build a DataFrame from the HTML files. This should be a temporary measure for examining the data. The future extraction flow should go straight from HTML to JSON.

In [None]:
# Parse every saved page once and build the frame from a list of records.
# - Rows are collected in a list and turned into a DataFrame in one go;
#   pd.concat inside the loop re-copied the whole frame on every page.
# - The frame is no longer seeded with the example entry: that page is part of
#   pages_list anyway, so the seed produced a duplicate row (with an unprocessed
#   subject and no JLPT value).
# - Loop-local names are used so the example `soup` / `entry_contents` from the
#   cells above are not overwritten, and each file is closed after parsing.
df_columns = ["subject", "part_of_speech", "definition", "explanation", "JLPT"]
records = []
for page in pages_list:
    with open(os.path.join(path_to_html, page), 'r', encoding='utf-8') as html_file:
        page_soup = BeautifulSoup(html_file, 'html.parser')
    records.append({
        # Keep only the first space-separated token of the heading, minus ASCII letters/commas
        "subject": remove_latin_chars(page_soup.find('h1').get_text(strip=True).split(' ')[0]),
        "part_of_speech": determine_pos(page_soup),
        "definition": page_soup.select_one('p.line-clamp-1').get_text(strip=True),
        "explanation": extract_explination(page_soup),
        "JLPT": JLPT_level(page_soup)
    })
# Explicit columns keep the frame usable downstream even when no pages were found.
df = pd.DataFrame(records, columns=df_columns)

Having a hard time getting the readings. Possibly have to hand-write the kanji readings. The function below identifies which entries to look at.

In [35]:
# Flag subjects that contain kanji, i.e. the entries whose reading needs attention
df['contains_kanji'] = df['subject'].map(contains_kanji)

In [36]:
# Give every '・'-separated subject variant its own row, repeating the other columns
row_frames = [split_and_duplicate_rows(row) for _, row in df.iterrows()]
expanded_df = pd.concat(row_frames, ignore_index=True)
expanded_df.shape

In [93]:
# Write the expanded frame to CSV for inspection; the index column is left out
expanded_df.to_csv('example_extration.csv', index=False)

# Inspect dataframe

In [97]:
dict = pd.read_csv('bunpro_entries.csv')

In [None]:
# drop columns with subject = ~
dict = dict[dict['subject'] != '~']
dict[dict['definition'].duplicated(keep=False)].sort_values('definition')

### testing dict class

In [64]:
from dictionary_construction.Entry import Dictionary_Entry

In [67]:
# Build an entry object from whichever `soup` is currently in scope
test = Dictionary_Entry(soup)

In [68]:
# Show the entry's `subject` attribute
print(test.subject)

んだ・んです
