### Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [45]:
import json
import re
from bs4 import BeautifulSoup
import os
import pandas as pd
from dictionary_construction.const import FIX_POS

### Path to HTML files

In [46]:
# Directory that holds the saved HTML pages parsed throughout this notebook.
path_to_html = 'grammar_pages'
# Bare file names (not full paths) of everything inside that directory.
pages_list = os.listdir(path_to_html)

### Functions

In [13]:
def bunpro_adj_options(page_list: list) -> list:
    """Collect every distinct "Part of Speech" value found in the given pages.

    Args:
        page_list: File names of HTML pages located inside ``path_to_html``.

    Returns:
        The distinct values as a sorted list (empty when ``page_list`` is empty).
    """
    adj_options = set()
    for page in page_list:
        # The context manager closes the file as soon as parsing is finished.
        with open(os.path.join(path_to_html, page), 'r', encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
        # Headings and values are paired purely by position in the two result lists.
        title = soup.select('ul h4')
        labels = soup.select('ul > li > p')
        for t, l in zip(title, labels):
            if t.get_text(strip=True) == "Part of Speech":
                adj_options.add(l.get_text(strip=True))
    return sorted(adj_options)

In [14]:
def determaine_reading(soup: BeautifulSoup) -> str:
    """Return the reading shown in the page heading.

    Uses the text of the first ``rt`` element inside ``h1`` (presumably the
    furigana annotation - confirm against the page markup). When the heading
    has no ``rt`` element, the whole ``h1`` text is returned instead.

    Args:
        soup: Parsed page.

    Returns:
        The stripped text of ``h1 rt`` or, failing that, of ``h1``.

    Raises:
        AttributeError: If the page has no ``h1`` element at all.
    """
    ruby_text = soup.select_one('h1 rt')
    # Explicit None check instead of a bare ``except``, so unrelated errors
    # (including KeyboardInterrupt) are no longer swallowed.
    if ruby_text is not None:
        return ruby_text.get_text(strip=True)
    return soup.select_one('h1').get_text(strip=True)

In [33]:
def extract_explination(soup: BeautifulSoup) -> str:
    """Return the write-up text of a page with the example sentences removed.

    Looks up the ``div`` whose class is ``bp-ddw bp-writeup-body prose``,
    removes every descendant carrying one of the example-sentence classes and
    joins the remaining text with single spaces.

    Note: the removed elements are deleted from ``soup`` itself.

    Args:
        soup: Parsed page.

    Returns:
        The remaining text, or ``None`` when the write-up ``div`` is missing.
    """
    writeup = soup.find('div', class_='bp-ddw bp-writeup-body prose')
    if writeup is None:
        return None

    # Classes that mark the example sentences we do not want in the write-up.
    skipped_classes = ['writeup-example--japanese', 'writeup-example--english']
    for node in writeup.find_all(class_=skipped_classes):
        node.decompose()  # drops the element from the parsed tree

    return writeup.get_text(separator=' ', strip=True)

In [16]:
def determine_pos(soup: BeautifulSoup) -> str:
    """Return the mapped part-of-speech tag for a page.

    Pairs the ``ul h4`` headings with the ``ul > li > p`` values by position,
    finds the value that belongs to the "Part of Speech" heading and
    translates it through ``FIX_POS``.

    Args:
        soup: Parsed page.

    Returns:
        The ``FIX_POS`` value for the page's label, or ``"Unknown"`` when the
        page has no "Part of Speech" heading.

    Raises:
        ValueError: If the label has no (non-empty) entry in ``FIX_POS``.
    """
    title = soup.select('ul h4')
    labels = soup.select('ul > li > p')
    for t, l in zip(title, labels):
        if t.get_text(strip=True) != "Part of Speech":
            continue
        # Keep the raw label in its own variable so the error message can name
        # it; previously it was overwritten by the lookup and always read "None".
        raw_label = l.get_text(strip=True)
        mapped = FIX_POS.get(raw_label)
        if mapped:
            return mapped
        raise ValueError(f"Unknown POS: {raw_label}")
    return "Unknown"

In [17]:
def JLPT_level(soup: BeautifulSoup) -> str:
    """Read the JLPT level out of the page ``<title>``.

    The level is the run of letters/digits that follows the word "JLPT" in the
    title text (e.g. ``"... (JLPT N5) | Bunpro"`` gives ``"N5"``).

    Args:
        soup: Parsed page.

    Returns:
        One of ``'N5'``, ``'N4'``, ``'N3'``, ``'N2'``, ``'N1'``, ``'N0'``.

    Raises:
        ValueError: If the page has no title, the title has no level after
            "JLPT", or the level is not one of the accepted values.
    """
    title_tag = soup.find('title')
    if title_tag is None:
        raise ValueError("Page has no <title> element to read the JLPT level from")
    title = title_tag.get_text(strip=True)

    # A regex replaces the former ``str.strip(') | Bunpro')``, which strips a
    # *set of characters* rather than a suffix and only worked by coincidence.
    match = re.search(r'JLPT\s*([A-Za-z0-9]+)', title)
    if match is None:
        raise ValueError(f"No JLPT level found in title: {title!r}")

    jlpt = match.group(1)
    if jlpt not in ['N5', 'N4', 'N3', 'N2', 'N1', 'N0']:
        raise ValueError(f"Unknown JLPT level: {jlpt}")
    return jlpt

In [18]:
def remove_latin_chars(text):
    """Return ``text`` with every ASCII letter (a-z, A-Z) and every comma removed.

    All other characters, including spaces and digits, are kept as they are.
    """
    latin_or_comma = re.compile(r'[a-zA-Z,]')
    return latin_or_comma.sub('', text)

In [19]:
def contains_kanji(text):
    """Tell whether ``text`` holds at least one character in U+4E00-U+9FFF.

    The value is passed through ``str()`` first, so non-string inputs such as
    NaN are accepted and simply yield ``False``.
    """
    as_text = str(text)
    return re.search(r'[\u4E00-\u9FFF]', as_text) is not None

In [20]:
def compose_entry(subject, reading, part_of_speech, definition, explanation, matchup=10, jlpt="N5") -> list:
    """Assemble one dictionary entry as a positional list.

    NOTE(review): the eight positions look like a Yomichan term-bank row
    (term, reading, definition tags, rule identifiers, score, glossary,
    sequence number, term tags) - confirm against the target schema.

    Args:
        subject: Headword, stored at position 0.
        reading: Reading, stored at position 1.
        part_of_speech: Stored at positions 2 and 3.
        definition: Content of the "Meaning" section.
        explanation: Content of the "Explination" section.
        matchup: Number stored at position 4.
        jlpt: Level label stored at position 7.

    Returns:
        The entry as a list (not a string).
    """
    # Placeholder sentences: hard-coded, they do not depend on any argument.
    example_items = [
        {"tag": "li", "style": {"listStyleType": "'①'"}, "content": "例文 1\nSentence 1"},
        {"tag": "li", "style": {"listStyleType": "'②'"}, "content": "Sentence 2"}
    ]
    # NOTE(review): the "Explination" heading is misspelled but is emitted data,
    # so it is left untouched here.
    sections = [
        "【 Meaning 】",
        {"tag": "div", "style": {"marginLeft": 1}, "content": definition},
        "【 Explination 】",
        {"tag": "div", "style": {"marginLeft": 1}, "content": explanation},
        "【 Example sentences 】",
        {"tag": "ol", "content": example_items}
    ]
    glossary = [{"type": "structured-content", "content": sections}]
    return [
        subject,
        reading,
        part_of_speech,
        part_of_speech,
        matchup,
        glossary,
        1,  # constant; purpose not visible here (possibly a sequence number)
        jlpt
    ]

In [21]:
def split_and_duplicate_rows(row):
    """Expand one row into one row per '・'-separated variant of its subject.

    Args:
        row: Mapping-like row with the keys ``subject``, ``part_of_speech``,
            ``definition``, ``explanation``, ``JLPT`` and ``contains_kanji``.

    Returns:
        A DataFrame with one row per variant; every column other than
        ``subject`` repeats the value of the input row.
    """
    variants = row['subject'].split('・')
    repeat = len(variants)

    columns = {'subject': variants}
    for name in ('part_of_speech', 'definition', 'explanation', 'JLPT', 'contains_kanji'):
        columns[name] = [row[name]] * repeat
    return pd.DataFrame(columns)

Used to create a mapping from Bunpro to Yomichan POS. Accessing all the pages takes a while, so the results are saved in a dictionary in const.py.

In [24]:
# # Used to create a mapping from bunpro to yomichan POS
# # results are saved in a dictionary in const.py
# # NOTE(review): zip() stops at the shorter input, so the length assert on the
# # last line only protects the mapping if it runs before fix_pos is built.
# bun_opt = bunpro_adj_options(pages_list)
# yomi_pos = ['adj-na','adj-na', 'adv', 'aux-v', 'prt', 'exp', 'adj-na', 'n' , 'prt', 'pn', 'v-unspec']
# fix_pos = {key: value for key, value in zip(bun_opt, yomi_pos)}
# print(fix_pos)
# assert len(bun_opt) == len(yomi_pos), "Lengths of bunpro and yomichan POS lists are not equal"

### Pick an example page to find the location of information with bs4 (BeautifulSoup)

In [25]:
# File name of the page used as the worked example in the cells below.
page_of_interest = '%E3%81%8B%E3%81%AE%E3%82%88%E3%81%86%E3%81%A0.html'
# Position of the first matching file name; stays 0 when the page is not listed.
index = next(
    (position for position, name in enumerate(pages_list) if name == page_of_interest),
    0,
)
print(index)

876


In [26]:
# Read page with BeautifulSoup; the context manager closes the file once it is parsed.
with open(os.path.join(path_to_html, page_of_interest), 'r', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

In [27]:
# Canonical link from the page head, used to link back to the original page.
canonical_link = soup.select_one('head > link[rel="canonical"]')
url = canonical_link['href']
url

'https://bunpro.jp/grammar_points/-%25E3%2582%2593%25E3%2581%25A7%25E3%2581%2599-%25E3%2581%25AE%25E3%2581%25A7%25E3%2581%2599'

In [28]:
# Grammar topic taken from the first <h1>; there may be a more reliable source.
heading = soup.find('h1')
subject = heading.get_text(strip=True)
print(subject)

んだ・んです


In [29]:
# Short definition: text of the first <p> carrying the `line-clamp-1` class.
definition_tag = soup.select_one('p.line-clamp-1')
simple_def = definition_tag.get_text(strip=True)
print(f"A simple definition of the grammar point is: {simple_def}")
# Long write-up; note that extract_explination() strips the example elements from `soup`.
explination = extract_explination(soup)
print(f"The explination of the grammar point is: {explination}")

A simple definition of the grammar point is: Explanatory, Emphasis, The fact is
The explination of the grammar point is: のです is a construction with several different forms, each meaning roughly the same thing. This expression behaves similarly to だ and です by themselves (asserting that something is true). However, the addition of の strengthens the relationship being highlighted. Due to this, it will come across a bit like 'it is a fact that (A)' in English. In these sentences, we can think of the ん , or の as simply adding emphasis to the statement. Due to の adding emphasis, this expression will be heard very frequently in explanations. Caution There is no real difference between ん and の . Only の is considered 'correct Japanese', but ん evolved to make things easier to say. ん is the most common form in spoken Japanese, and the use of の could be considered a bit formal, or 'stiff'. While both the だ and です forms are equally as natural, only the です form will be used if adding か to the end ～ん

Find example sentences

In [77]:
# Every element whose id has the form "study-question-<digits>".
study_question_id = re.compile(r'^study-question-\d+$')
example_sentances = soup.find_all(id=study_question_id)

In [84]:
# Show the raw matched elements to inspect their markup (the output is long).
example_sentances

[<div class="not-prose relative my-0 block overflow-hidden rounded border bg-primary-bg p-16 align-top sm:flex sm:items-center sm:justify-between sm:gap-4 sm:p-24 border-primary-bg" id="study-question-7492">
 <ul class="relative z-1 hidden sm:flex sm:items-center sm:gap-4">
 <button class="block transition-opacity text-primary-accent" title="Play audio">
 <div class="bp-hover-bg__child rounded p-6">
 <div class="relative">
 <span aria-hidden="true" class="bp-skeleton block undefined" style="width:24px;height:24px">
 </span>
 </div>
 </div>
 </button>
 </ul>
 <div class="relative z-1 flex flex-grow flex-col items-center justify-center gap-4 text-center">
 <p class="bp-ddw prose text-large md:text-subtitle">
               その
               <ruby>
                作品
                <rp>
                 (
                </rp>
 <rt>
                 さくひん
                </rt>
 <rp>
                 )
                </rp>
 </ruby>
               は、あの
               <ruby>
               

In [96]:
for s in example_sentances:
    text = s.get_text(strip=True)
    # Leading run of characters up to (not including) the first ASCII letter.
    japanese_text = re.match(r'^[^A-Za-z]+', text)
    if japanese_text is None:
        # The text starts with an ASCII letter, so there is no such prefix;
        # skip it instead of failing on `None.group()`.
        continue
    print(japanese_text.group())
# jp_sentances = [s.get_text(strip=True).split('。',)[0] + '。' for s in example_sentances]

その作品は、あの有名な作品を真似たかのようですね。
彼はまるでお腹に風船を入れているかのようだ。
あの会社は営業中も休業しているかのようだ。
彼は相当目がいい。望遠鏡を使っているかのようだ。
あのシャツの色と柄じゃあ、包帯をしているかのようですね。


In [83]:
# Build the list in this cell: no earlier cell defines `jp_sentances`, so a
# fresh Restart & Run All would otherwise stop here with a NameError.
# Only the text up to the first '。' is kept, so an example made of several
# sentences is cut after its first one.
jp_sentances = [s.get_text(strip=True).split('。')[0] + '。' for s in example_sentances]
for s in jp_sentances:
    print(s)

その作品は、あの有名な作品を真似たかのようですね。
彼はまるでお腹に風船を入れているかのようだ。
あの会社は営業中も休業しているかのようだ。
彼は相当目がいい。
あのシャツの色と柄じゃあ、包帯をしているかのようですね。


In [76]:

# Collect the text of each example's `p.bp-sdw.undefined` element, with every
# run of whitespace (newlines included) collapsed into a single space.
english_sentances = []
jp_sentances = []
for block in example_sentances:
    raw_text = block.select_one('p.bp-sdw.undefined').get_text(strip=False)
    english_sentances.append(re.sub(r'\s+', ' ', raw_text))


 That item looks as if it is imitating that other famous item. 
 It is just as if he has a balloon in his belly. 
 That company looks as if they are closed even during operation hours. 
 He has extremely good eyesight. It is as if he is using a telescope. 
 That shirt's color and patterns makes it look as if you are wearing a bandage. 


In [52]:
# Collapse each run of whitespace (spaces, tabs, newlines) into one space.
whitespace_run = re.compile(r'\s+')
cleaned_text = whitespace_run.sub(' ', text)
print(cleaned_text)

 It is just as if he has a balloon in his belly. 


### Creating an example entry in a dict. This dict can be passed straight to a JSON processing function or into a pandas dataframe for data analysis

In [None]:
# Gather every field of the example page into a single dict, one key at a time.
entry_contents = {}
entry_contents["subject"] = soup.find('h1').get_text(strip=True)
entry_contents["reading"] = determaine_reading(soup)
entry_contents["part_of_speech"] = determine_pos(soup)
entry_contents["definition"] = soup.select_one('p.line-clamp-1').get_text(strip=True)
entry_contents["explanation"] = extract_explination(soup)
entry_contents["link"] = soup.select_one('head > link[rel="canonical"]')['href']
print(entry_contents)

In [None]:
# Extract the JLPT level of the example page (read from its <title> by JLPT_level)
jlpt = JLPT_level(soup)
print(jlpt)

Build a dataframe from the HTML files. This should be a temporary measure for examining the data. The future extraction flow should go straight from HTML to JSON.

In [None]:
# Parse every saved page into one record and build the frame in a single step.
# Growing a DataFrame with pd.concat inside the loop copies it on every pass.
# The frame is no longer seeded with `entry_contents` from the example cell:
# that row duplicated one page (it is also in `pages_list`), carried different
# columns, and made this cell depend on an earlier cell having been run.
records = []
for page in pages_list:
    # The context manager closes each file after parsing. A separate name keeps
    # the example `soup` used by other cells from being overwritten.
    with open(os.path.join(path_to_html, page), 'r', encoding='utf-8') as html_file:
        page_soup = BeautifulSoup(html_file, 'html.parser')
    records.append({
        # First space-separated token of the heading, without ASCII letters/commas.
        "subject": remove_latin_chars(page_soup.find('h1').get_text(strip=True).split(' ')[0]),
        "part_of_speech": determine_pos(page_soup),
        "definition": page_soup.select_one('p.line-clamp-1').get_text(strip=True),
        "explanation": extract_explination(page_soup),
        "JLPT": JLPT_level(page_soup),
    })
# Naming the columns keeps them present even if no page was parsed.
df = pd.DataFrame(records, columns=["subject", "part_of_speech", "definition", "explanation", "JLPT"])

Having a hard time getting the readings. Possibly have to hand-write the kanji readings. The function below identifies which entries to look at.

In [35]:
# Flag the rows whose subject has at least one kanji; adds a column to `df` in place.
df['contains_kanji'] = df['subject'].apply(contains_kanji)

In [36]:
# One small frame per original row (a row per '・'-separated subject variant),
# stacked into a single frame with a fresh 0..n-1 index.
per_row_frames = [split_and_duplicate_rows(row) for _, row in df.iterrows()]
expanded_df = pd.concat(per_row_frames, ignore_index=True)
expanded_df.shape

In [93]:
# Save the expanded rows for inspection (the row index is not written).
# NOTE(review): the file name is spelled 'extration' and differs from the CSV read in the next section - confirm both are intended.
expanded_df.to_csv('example_extration.csv', index=False)

# Inspect dataframe

In [97]:
# NOTE(review): this name shadows the builtin `dict` for the rest of the kernel
# session; renaming it requires changing the following cell at the same time.
dict = pd.read_csv('bunpro_entries.csv')

In [None]:
# Drop the rows (not columns) whose subject is exactly '~', then show every row
# whose definition occurs more than once, sorted so duplicates sit together.
dict = dict[dict['subject'] != '~']
dict[dict['definition'].duplicated(keep=False)].sort_values('definition')

### testing dict class

In [69]:
from dictionary_construction.Entry import Dictionary_Entry

In [70]:
# Build an entry from whatever `soup` currently holds.
# NOTE(review): the dataframe loop above also assigns `soup`, so the page used here depends on which cells ran last.
test = Dictionary_Entry(soup)

In [71]:
# Quick check that the entry exposes the subject it parsed.
print(test.subject)

んだ・んです
