# Scrape data for Japanese Translation Activity project

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re



In [2]:
# Site root; the relative grammar-point links scraped later are appended to it.
BASE_URL = "https://bunpro.jp"
# Page whose links are collected into the list of grammar points.
GRAMMAR_POINTS_URL = f"{BASE_URL}/grammar_points"
# Extra HTTP request headers; empty, so no custom headers are set.
headers = dict()



In [3]:
# Download the grammar-point index page. The timeout keeps a stalled
# connection from hanging the notebook indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the index.
grammar_points_page = requests.get(GRAMMAR_POINTS_URL, headers=headers, timeout=30)
grammar_points_page.raise_for_status()

soup = BeautifulSoup(grammar_points_page.text, 'html.parser')

# Collect the href of every element whose class attribute is exactly this
# string. Matching elements without an href are skipped rather than raising
# KeyError. (find_all searches recursively by default.)
grammar_points_list = [
    child["href"]
    for child in soup.find_all(attrs={"class": "u-position_absolute u-inset_0"})
    if child.has_attr("href")
]

In [None]:
# Show the scraped relative links as a quick sanity check.
print(grammar_points_list)

# Empty table that receives one row per grammar point.
grammar_data_df = pd.DataFrame(
    columns=[
        "level",
        "grammar_point_jp",
        "meaning_en",
        "description",
    ],
)
# Empty table for example sentences.
examples_df = pd.DataFrame(
    columns=[
        "grammar_id",
        "text",
        "grammar_char_locations",
    ],
)

# "text" field format: [["word", "furigana"], ["word", "furigana"], ...]
# "grammar_char_locations": indices where the grammar point appears in the text, for later highlighting

['/grammar_points/%E3%81%A0', '/grammar_points/%E3%81%A7%E3%81%99', '/grammar_points/%E3%81%AF', '/grammar_points/%E3%82%82', '/grammar_points/%E3%81%93%E3%82%8C', '/grammar_points/%E3%81%9D%E3%82%8C', '/grammar_points/%E3%81%82%E3%82%8C', '/grammar_points/%E3%81%AE', '/grammar_points/%E3%81%84%E3%81%84', '/grammar_points/%E3%81%84-adjectives', '/grammar_points/%E3%81%AA-adjectives', '/grammar_points/%E3%81%8B', '/grammar_points/%E3%81%8C', '/grammar_points/%E3%82%88', '/grammar_points/%E3%81%AD', '/grammar_points/%E3%82%8B-Verbs', '/grammar_points/%E3%81%86-Verbs', '/grammar_points/%E3%82%92', '/grammar_points/polite-verb-endings', '/grammar_points/%E3%82%8Bverb-%E3%81%AA%E3%81%84', '/grammar_points/%E3%81%86verb--%E3%81%AA%E3%81%84', '/grammar_points/%E3%81%A8-and', '/grammar_points/%E3%81%93%E3%81%93', '/grammar_points/%E3%81%9D%E3%81%93', '/grammar_points/%E3%81%82%E3%81%9D%E3%81%93', '/grammar_points/%E3%81%A7', '/grammar_points/%E3%81%AB', '/grammar_points/%E3%81%8B-or', '/gramma

In [39]:
def extract_text(ex: str) -> list:
    """Split an HTML snippet into kanji/furigana and plain-text segments.

    The snippet is scanned tag by tag, and the text between each tag and the
    next one is classified by the name of the tag that precedes it:

    * ``<ruby>``: the text is a kanji word whose reading follows.
    * ``<rt>``: the text is the furigana for the pending kanji word; the
      pair is emitted as ``[kanji, furigana, None]``.
    * ``<rp>``: the text is a fallback parenthesis and is discarded.
    * any other tag (including closing tags): the text is emitted as
      ``[None, None, text]`` unless it is empty or a lone newline.

    Example input:
    ``<span class="bp-ddw undefined"><ruby>聞<rp>(</rp><rt>き</rt><rp>)</rp></ruby>こえる</span>``

    Args:
        ex: HTML markup as a string, e.g. ``str(tag)`` of a BeautifulSoup
            element. Text before the first tag and after the last tag is
            not read, so the content should be wrapped in an element.

    Returns:
        A list of ``[kanji, furigana, hira]`` lists in document order.
        ``hira`` is ``None`` for kanji-with-furigana segments; ``kanji`` and
        ``furigana`` are both ``None`` for plain-text segments. An input with
        fewer than two tags yields an empty list.
    """
    import html  # local import: only needed for entity decoding in this function

    # One match per tag; group(1) is everything between '<' and '>'.
    tags = list(re.finditer(r'<([^<>]*)>', ex))

    text = []
    word = ''
    for tag, next_tag in zip(tags, tags[1:]):
        # Compare on the tag *name* only so that attributes
        # (e.g. <ruby lang="ja">) do not defeat the match.
        parts = tag.group(1).split()
        name = parts[0] if parts else ''

        # Serialized markup escapes '&', '<' and '>' in text; decode them so
        # the stored text is the real text rather than '&amp;' etc.
        segment = html.unescape(ex[tag.end():next_tag.start()])

        if name == 'ruby':  # upcoming word has furigana attached
            word = segment
        elif name == 'rt':
            # Emit the kanji together with its reading once the reading appears.
            text.append([word, segment, None])
            word = ''  # reset for the next kanji word
        elif name != 'rp':  # rp only wraps fallback parentheses; not needed
            # Every tag yields a (possibly empty) following segment; drop the
            # empty ones and bare line breaks.
            if segment != '' and segment != '\n':
                text.append([None, None, segment])

    return text

# Manual check of extract_text on a sample example-sentence <p> element containing
# ruby/furigana, nested spans, <strong> and <br/>; the notebook displays the returned list.
extract_text('<p class="bp-ddw prose text-large md:text-subtitle">「<ruby>今<rp>(</rp><rt>いま</rt><rp>)</rp></ruby><span class="gp-popout" data-gp-id="6">の</span>ノック<span class="study-area-input"><span class="text-primary-accent"><ruby>聞<rp>(</rp><rt>き</rt><rp>)</rp></ruby>こえ</span></span><strong>た</strong>？」<br/>「ううん、<span class="study-area-input"><span class="text-primary-accent"><ruby>聞<rp>(</rp><rt>き</rt><rp>)</rp></ruby>こえ</span></span><strong><span class="gp-popout" data-gp-id="896">なかった</span></strong><span class="gp-popout" data-gp-id="78">よ</span>。<ruby>空耳<rp>(</rp><rt>そらみみ</rt><rp>)</rp></ruby><span class="gp-popout" data-gp-id="49">じゃない</span>？」</p>')

[[None, None, '「'],
 ['今', 'いま', None],
 [None, None, 'の'],
 [None, None, 'ノック'],
 ['聞', 'き', None],
 [None, None, 'こえ'],
 [None, None, 'た'],
 [None, None, '？」'],
 [None, None, '「ううん、'],
 ['聞', 'き', None],
 [None, None, 'こえ'],
 [None, None, 'なかった'],
 [None, None, 'よ'],
 [None, None, '。'],
 ['空耳', 'そらみみ', None],
 [None, None, 'じゃない'],
 [None, None, '？」']]

In [None]:
# Visit every grammar-point page, pull out its summary fields and append one
# row per page to grammar_data_df.
for url in grammar_points_list:

    # The scraped links are site-relative, so prefix the host.
    full_url = BASE_URL + url
    # Single-page overrides for debugging:
    # full_url = "https://bunpro.jp/grammar_points/%E3%82%92"
    # full_url = "https://bunpro.jp/grammar_points/%E8%81%9E%E3%81%93%E3%81%88%E3%82%8B"

    # The timeout prevents one stalled request from hanging the whole crawl;
    # raise_for_status() fails loudly on an HTTP error instead of parsing
    # the error page.
    page_data = requests.get(full_url, timeout=30)
    page_data.raise_for_status()

    soup = BeautifulSoup(page_data.text, "html.parser")

    # TODO: FILL IN GRAMMAR POINT INFORMATION AND ADD TO grammar_data_df
    # USE THAT ID FOR examples_df ENTRIES

    # soup.find returns None when nothing matches; record None for that field
    # rather than crashing the crawl with AttributeError.
    level = soup.find("p", {"class": "text-primary-contrast md:text-primary-fg"})
    # The level is stored as the second character of the element's text.
    level = str(level.text)[1:2] if level is not None else None

    # extract_text receives str(None) == 'None' for a missing element and
    # returns an empty list for it, so no guard is needed here.
    grammar_point_jp = soup.find("span", {"class": "bp-ddw undefined"})
    grammar_point_jp = extract_text(str(grammar_point_jp))

    meaning_en = soup.find("span", {"class": "mt-4 block text-body text-primary-contrast md:text-subtitle md:text-primary-fg"})
    meaning_en = meaning_en.text if meaning_en is not None else None

    description = soup.find("div", {"class": "bp-ddw bp-writeup-body prose"})
    description = extract_text(str(description))
    print(description)

    grammar_data = {"level": level, "grammar_point_jp": grammar_point_jp, "meaning_en": meaning_en, "description": description}

    # Append as a new row at the next integer index.
    grammar_data_df.loc[len(grammar_data_df)] = grammar_data

    # Example sentences are parsed but not stored yet (see the TODO above).
    for ex in soup.find_all("p", {"class": "bp-ddw prose text-large md:text-subtitle"}):
        text = extract_text(str(ex))

        

    

[['聞', 'き', None], [None, None, 'こえる'], [None, None, " is a verb that is often used to describe things that can be heard, or the way in which something is heard. Because of this, the most common translations of this verb are 'to be audible', or 'to sound like (A)'. "], ['聞', 'き', None], [None, None, 'こえる'], [None, None, " is an intransitive verb, and literally means 'to give off sound'."], [None, None, 'When using '], ['聞', 'き', None], [None, None, 'こえる'], [None, None, ', (A) will always be followed by '], [None, None, 'が'], [None, None, ". (A) is considered to be the 'source' of the sound (a noun). However, the 'way that something sounds' will be marked adverbially. This means that an "], [None, None, 'い-Adjective'], [None, None, ' will be changed to its く form, and '], [None, None, 'な-Adjectives'], [None, None, '/nouns will be followed by '], [None, None, 'に'], [None, None, '.'], [None, None, 'Fun Fact'], [None, None, 'The primary difference between '], ['聞', 'き', None], [None, None,

In [42]:
# Display the grammar-point table that the scraping loop appends rows to.
grammar_data_df

Unnamed: 0,level,grammar_point_jp,meaning_en,description
0,4,"[[聞, き, None], [None, None, こえる]]","To be audible, To sound (like), (Can) hear, To...","[[聞, き, None], [None, None, こえる], [None, None,..."
