In [None]:
import pdfplumber
import pandas as pd
import re


# Extract definitions from pdf

In [None]:
raw_definitions = []

pdf_path = "data/hsk2.pdf"

temp_item = ""

with pdfplumber.open(pdf_path) as doc:
    for i, page in enumerate(doc.pages):      
        tables = page.find_tables(table_settings={
    "horizontal_strategy": "text"
})

        if tables:            
            table_data = tables[0].extract()
            if table_data:
                for row in table_data:
                    clean_row = [str(cell) if cell is not None else "" for cell in row]
                    line = "".join(clean_row).strip()
                    if len(line) < 1: continue

                    if line[0].isdigit():
                        raw_definitions.append(temp_item)
                        temp_item = ""
                    
                    temp_item += (' ' + line)
                         
raw_definitions = [x.strip() for x in raw_definitions if x[1].isdigit()]

In [125]:
definitions = []
for raw_definition in raw_definitions:
    splitted_defintion = raw_definition.strip().replace('\n', ' ').split(" ")
    definition = {
        'index': int(splitted_defintion[0]),
        'characters': splitted_defintion[1],
        'pinyin': splitted_defintion[2],
        'definition': " ".join(splitted_defintion[3:])
    }
    definitions.append(definition)

definitions_df = pd.DataFrame(definitions)

# Fix the entries that with missing characters because of formatting

In [126]:
def has_missing_characters(text: str) -> bool:
    return not(not(re.search(r"\(cid:\d+\)", text)))

is_missing_chars = definitions_df['characters'].apply(has_missing_characters) | definitions_df['definition'].apply(has_missing_characters)
definitions_missing_chars = definitions_df[is_missing_chars]
definitions_df = definitions_df[~is_missing_chars]

In [None]:
from gemini import fix_missing_characters

# send batches to gemini
n = 20  
fixed_definitions_list = []
number_batch = 1

for start in range(0, len(definitions_missing_chars), n):
    batch = definitions_missing_chars.iloc[start : start + n]
    fixed_batched = fix_missing_characters(batch[:20].to_dict('records'))

    fixed_definitions_list.append(pd.DataFrame(fixed_batched))

    print("Finished batch: ", number_batch)
    number_batch+=1

Finished batch:  1
Finished batch:  2
Finished batch:  3
Finished batch:  4
Finished batch:  5


In [133]:
df_english_definitions = pd.concat([definitions_df, *fixed_definitions_list], ignore_index=True)
df_english_definitions.sort_values('index', ascending=True).reset_index(drop=True)

Unnamed: 0,index,characters,pinyin,definition
0,1,爱,ài,to love; affection; to be fond of; to like
1,2,八,bā,eight; 8
2,3,爸爸,bàba,"(informal) father; CL:個|个[gè],位[wèi]"
3,4,吧,ba,(modal particle indicating polite suggestion);...
4,5,白,bái,white; snowy; pure; bright; empty; blank; plai...
...,...,...,...,...
296,295,走,zǒu,to walk; to go; to run; to move (of vehicle); ...
297,296,最,zuì,most; the most; -est
298,297,昨天,zuótiān,yesterday
299,298,左边,zuǒbian,left; the left side; to the left of


# Save data

In [134]:
df_english_definitions.to_parquet('data/english_definitions.parquet')