In [68]:
import os

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Machine-specific settings can be overridden through environment variables of the
# same name; the previous literals stay as fallbacks so existing setups keep working.
CHROME_DRIVER_PATH = os.environ.get(
    "CHROME_DRIVER_PATH", "/home/laurinbrechter/Documents/chromedriver"
)
# NOTE(review): the fallback below is a credential committed to the notebook. Rotate it
# and drop the literal once API_KEY is supplied through the environment.
API_KEY = os.environ.get("API_KEY", "f6b2c0d8ac8bf83973a8cea901768bbd")

In [47]:
def get_dynamic_html(URL):
    """
    Load ``URL`` in a Chrome instance and return the rendered page as BeautifulSoup.

    A new browser is started for every call using the driver binary at
    ``CHROME_DRIVER_PATH`` and is always shut down again, even when loading or
    parsing the page raises.

    Parameters
    ----------
    URL : str
        Address of the page to load.

    Returns
    -------
    BeautifulSoup
        Parsed ``driver.page_source`` of the loaded page.
    """
    driver = webdriver.Chrome(executable_path = CHROME_DRIVER_PATH)
    try:
        driver.get(URL)
        # No parser is named, exactly as before, so BeautifulSoup keeps choosing the
        # same parser it did for any previously saved pages.
        return BeautifulSoup(driver.page_source)
    finally:
        # quit() ends the whole driver session; close() only closes the current
        # window and can leave the chromedriver process running.
        driver.quit()

In [91]:
def parse_word_container(word_container):
    """
    Extract the fields of one word row.

    Parameters
    ----------
    word_container
        Element exposing the BeautifulSoup ``find`` / ``find_all`` API for one row.

    Returns
    -------
    tuple
        ``(characters, meanings, word_type, hsk_level, word_frequency_stars)``:

        * characters: text of every ``span`` inside the first ``a`` of the
          ``"col-md-3 word-container"`` element.
        * meanings: text of the ``a`` children of the first ``p`` inside the
          ``"col-md-7"`` element, ignoring that paragraph's first two children.
        * word_type: text of the first ``span`` inside the ``"pull-right"`` element.
        * hsk_level: text of the first of those spans containing ``"HSK"``,
          or ``np.nan`` when none does.
        * word_frequency_stars: number of ``i`` tags in the second-to-last span.
    """
    char_cell = word_container.find(class_="col-md-3 word-container")
    characters = [span.text for span in char_cell.find("a").find_all("span")]

    definition_nodes = list(word_container.find(class_="col-md-7").find("p").children)
    # Only link children after the first two nodes count as meanings.
    meanings = [node.text for node in definition_nodes[2:] if node.name == "a"]

    badges = word_container.find(class_="pull-right").find_all("span")
    word_type = badges[0].text

    # First badge mentioning HSK wins; NaN marks "no HSK badge".
    hsk_level = next((badge.text for badge in badges if "HSK" in badge.text), np.nan)

    word_frequency_stars = len(badges[-2].find_all("i"))

    return characters, meanings, word_type, hsk_level, word_frequency_stars

In [99]:
def parse_word(soup: "BeautifulSoup"):
    """
    Parse a character page into character-level fields plus a table of its words.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page (not a string); it must contain an element with id ``charDef``.

    Returns
    -------
    tuple or str
        ``(pinyin, radical, hsk_level, usage, meaning, parsed_word_information)``
        where

        * pinyin: text of the ``arch-pinyin-font`` element,
        * radical: text of the node following the "Radical:" label span,
        * hsk_level: text following the "HSK" link, or ``np.nan`` when absent,
        * usage: number of children of the ``color:#CCCCCC;`` span,
        * meaning: stripped texts of the leading run of ``a`` children of
          ``charDef`` (after its first child), separated by ``", "``,
        * parsed_word_information: DataFrame with one row per word row that
          ``parse_word_container`` could parse.

        The string ``"error"`` is returned instead when the page has no
        ``"table table-condensed"`` element.
    """
    meaning = []

    char_container = soup.find(id="charDef")

    pinyin = char_container.find(class_="arch-pinyin-font").text

    radical = char_container.find("span", string='»\xa0Radical:\xa0\xa0').next_sibling.text

    try:
        hsk_level = char_container.find("a", string="HSK").parent.next_sibling.text
    except AttributeError:
        # A missing node makes find()/next_sibling yield None; treat it as "no HSK level".
        hsk_level = np.nan

    usage = len(list(soup.find("span", {"style": "color:#CCCCCC;"}).children))

    # Collect the leading run of links, skipping the ", " separators between them;
    # the first node of any other kind ends the meaning list.
    for child in list(char_container.children)[1:]:
        if child.name == "a":
            meaning.append(child.text.strip())
        elif child == ", ":
            continue
        else:
            break

    word_table = soup.find(class_="table table-condensed")

    # indicates wrong html
    if not word_table:
        return "error"

    # The first row is skipped.
    words = word_table.find_all("tr")[1:]

    word_information = []

    for word in words:
        try:
            word_information.append(parse_word_container(word))
        except Exception:
            # Best effort: rows that do not match the expected layout are skipped.
            # Catching Exception (not a bare except) lets Ctrl-C still stop a long run.
            continue

    parsed_word_information = pd.DataFrame(
        word_information,
        columns=["characters", "meanings", "word_type", "hsk_level", "word_frequency_stars"],
    )

    return pinyin, radical, hsk_level, usage, meaning, parsed_word_information

In [105]:
# Load the character table and keep the rows whose "level" is 1.
words = pd.read_csv("data/hanzi_table.csv")
l1_words = words[words["level"] == 1]
# Positions 50-99 of the level-1 "Hanzi" column; `t` is the batch passed to scrape().
t = list(l1_words["Hanzi"].iloc[50:100])

In [106]:

def _load_character_soup(char, url, backend):
    """Return the parsed page for ``char``, fetched ("web") or read from the cache ("local")."""
    html_path = f"html_data/html_{char}.html"
    if backend == "web":
        soup = get_dynamic_html(url)
        # Explicit UTF-8: the pages contain Chinese text and the platform default
        # encoding is not guaranteed to handle it.
        with open(html_path, "w", encoding="utf-8") as file:
            file.write(str(soup))
        return soup
    with open(html_path, encoding="utf-8") as file:
        return BeautifulSoup(file.read())


def scrape(character_list, backend="web", start=10, max_retries=3):
    """
    Parse the dictionary page of every character and checkpoint the results to CSV.

    After each successfully parsed character the accumulated results are rewritten
    to ``words.csv`` (all word tables concatenated) and ``chars.csv`` (one row per
    character), so an interrupted run keeps what it has collected so far.

    Parameters
    ----------
    character_list : sequence of str
        Characters to look up.
    backend : {"web", "local"}
        ``"web"`` loads each page in a browser and saves it under ``html_data/``;
        ``"local"`` re-reads those saved files.
    start : int, default 10
        Number of leading entries of ``character_list`` to skip. The default keeps
        the offset that used to be hard-coded; pass ``0`` to process the whole list.
    max_retries : int, default 3
        How often a page that ``parse_word`` rejects is fetched again (web only).

    Returns
    -------
    tuple of (list, list)
        ``(parsed_characters, parsed_words)``: the character tuples and the
        per-character word DataFrames that were written to the CSV files.

    Raises
    ------
    ValueError
        If ``backend`` is neither ``"web"`` nor ``"local"``.
    """
    if backend not in ("web", "local"):
        raise ValueError(f"unknown backend {backend!r}; expected 'web' or 'local'")

    parsed_characters = []
    parsed_words = []
    for char in tqdm(character_list[start:]):
        url = f"https://www.archchinese.com/chinese_english_dictionary.html?find={char}"
        print(char, url)

        result = parse_word(_load_character_soup(char, url, backend))

        # Re-parsing the same soup can never give a different answer, so a retry
        # fetches the page again. A cached file will not change, hence web only.
        retries = 0
        while result == "error" and backend == "web" and retries < max_retries:
            print("error with given html, retrying")
            retries += 1
            result = parse_word(_load_character_soup(char, url, backend))

        if result == "error":
            # Skip instead of indexing into the "error" string and breaking the run.
            print(f"could not parse page for {char}, skipping")
            continue

        parsed_words.append(result[-1])
        parsed_characters.append(result[:-1] + (char,))

        # Checkpoint after every character.
        pd.concat(parsed_words).to_csv("words.csv", index=False)
        pd.DataFrame(
            parsed_characters,
            columns=["pinyin", "radical", "hsk_level", "usage", "meaning", "character"],
        ).to_csv("chars.csv", index=False)

    return parsed_characters, parsed_words

In [None]:
# Scrape the batch `t` selected above from the live site (opens a Chrome window per
# character and writes html_data/, words.csv and chars.csv).
scrape(t, backend="web")

In [53]:
# NOTE(review): `parsed_characters` is never assigned at notebook scope in the visible
# cells (it is a local inside scrape()), so this cell fails on a fresh kernel;
# presumably it was left over from an earlier session — confirm.
df = pd.DataFrame(parsed_characters)
# Map column 1 to column 0; with the tuple order built in scrape() that would be
# radical -> pinyin.
dict(zip(df[1], df[0]))

{'丶': 'wán', '又': 'jí', '广': 'guǎng', '亠': 'wáng', '门': 'mén'}

In [38]:
# Same mapping (element 1 -> element 0 of each tuple) without the intermediate DataFrame.
# NOTE(review): relies on a notebook-level `parsed_characters` that no visible cell defines.
{i[1]:i[0] for i in parsed_characters}

{'丶': 'wán', '又': 'jí', '广': 'guǎng', '亠': 'wáng', '门': 'mén'}

In [50]:
# Write all word tables as one CSV, without the index column.
# NOTE(review): relies on a notebook-level `parsed_words` that no visible cell defines.
pd.concat(parsed_words).to_csv("test-parse.csv", index=False)

In [40]:
def find_text(l, text):
    """Return the first element of ``l`` whose ``.text`` equals ``text``, or None if there is none."""
    return next((item for item in l if item.text == text), None)

In [108]:
def generate_edges(word_components: list[str]):
    """
    Build one edge dictionary per pair of neighbouring components.

    Each edge has the shape
    ``{"data": {"source": <component>, "target": <next component>, "class": ...}}``
    where ``class`` is ``"word_length_<n>"`` and ``n`` is the number of components.
    Fewer than two components produce an empty list.
    """
    edge_class = f"word_length_{len(word_components)}"
    return [
        {"data": {"source": source, "target": target, "class": edge_class}}
        for source, target in zip(word_components, word_components[1:])
    ]

In [109]:
# Flatten the edges of every entry in column 0 of `parsed` into a single list.
edges = [edge for word in parsed[0] for edge in generate_edges(word)]

In [115]:
# Unlike the earlier write to the same file, no index=False is passed here.
# NOTE(review): `parsed` is not defined in any visible cell.
parsed.to_csv("test-parse.csv")

In [113]:
# Unique symbols across all entries of column 0, in order of first appearance.
# dict.fromkeys de-duplicates in one pass instead of scanning a growing list for
# every symbol; this assumes the symbols are hashable (strings here — confirm).
# NOTE(review): `parsed` is not defined in any visible cell.
nodes = list(dict.fromkeys(symbol for row in parsed[0] for symbol in row))

# One node dictionary per unique symbol (a list despite the name).
node_set = [{"data": {"id": str(symbol), "label": str(symbol)}} for symbol in nodes]

In [None]:
# Display the node dictionaries built in the previous cell.
node_set

In [2]:
import pandas as pd

In [3]:
# Reload the saved word table.
# NOTE(review): list-valued columns presumably come back as strings after the CSV
# round trip — confirm before iterating over them.
df = pd.read_csv("test-parse.csv")

In [11]:
# NOTE(review): `filter_graph` is not defined in any visible cell; judging by the output
# below it presumably returns the rows whose word contains the given character — confirm.
filter_graph(df, "丸")

Unnamed: 0,0,1,2,3,4
0,"['睾', '丸']","['testicle', '只']",[noun],[M.W.: 只],4
1,"['肉', '丸']","['meatball', '只', '碗', '盘']",[noun],[M.W.: 只碗盘],4
2,"['药', '丸']","['pill', '片', '粒', '颗']",[noun],[M.W.: 片粒颗],4
3,"['丸', '子']","['pills', ' balls', ' meatballs', '只']",[noun],[M.W.: 只],3
4,"['定', '心', '丸']","['tranquilizer', ""something that sets one's mi...",[noun],[M.W.: 粒],2
5,"['弹', '丸']","['pellet', '粒', '块']",[noun],[M.W.: 粒块],2
6,"['樟', '脑', '丸']","['camphor balls', ' moth balls', '粒', '颗']",[noun],[M.W.: 粒颗],2
7,"['鱼', '丸']","['fish ball', '只']",[noun],[M.W.: 只],2
8,"['炸', '丸', '子']","['croquettes', ' deep fried food balls', '只', ...",[noun],[M.W.: 只碗盘],1
9,"['牛', '肉', '丸']","['beef meatballs', '只', '碗', '盘', '碟', '份']",[noun],[M.W.: 只碗盘碟份],1
