In [None]:
import pandas as pd
from utils import *

In [None]:
# Load the master word table, print its column index, and show the distinct
# values of the `type` column as the cell output.
words = pd.read_csv("russian3/words.csv")
print(words.columns)
words["type"].unique()

In [None]:
# Shrink `words` to a small working subset: the first 100 rows (by position)
# plus every row whose id is 59737.
# NOTE(review): this overwrites `words`, so re-running the cell subsets the
# already-subset frame again.
words = pd.concat([
    words.iloc[:100],
    words.loc[words["id"] == 59737],
])
print(words["type"].unique())
print(len(words))

In [None]:
# Cross-tabulate "has a type" against the `disabled` flag, and count how many
# rows in the two odd groups (typed-but-disabled, untyped-but-enabled) still
# carry a `usage_en` value. Row-wise boolean masks replace the intermediate
# filtered frames; the printed labels and counts are the same.
typed_mask = words["type"].notna()
disabled_mask = words["disabled"] == 1
enabled_mask = words["disabled"] == 0
usage_mask = words["usage_en"].notna()

print("Total Not NaN:", int(typed_mask.sum()))
print("Total Not NaN (not disabled):", int((typed_mask & enabled_mask).sum()))
print("Total Not NaN (disabled):", int((typed_mask & disabled_mask).sum()))
print("Has Usage (disabled):", int((typed_mask & disabled_mask & usage_mask).sum()))

print()
print("Total NaN:", int((~typed_mask).sum()))
print("Total NaN (not disabled):", int((~typed_mask & enabled_mask).sum()))
print("Total NaN (disabled):", int((~typed_mask & disabled_mask).sum()))
print("Has Usage (not disabled):", int((~typed_mask & enabled_mask & usage_mask).sum()))

# Do not leave the temporary masks behind in the notebook namespace.
del typed_mask, disabled_mask, enabled_mask, usage_mask

In [None]:
# Disabled words and words whose type is NaN are removed.
words = words[words["type"].notna() & (words["disabled"] == 0)]
print(len(words))

In [None]:
# Auxiliary tables used by the lookup helpers defined below.
nouns_csv = pd.read_csv("russian3/nouns.csv")
words_forms_csv = pd.read_csv("russian3/words_forms.csv")
translations_csv = pd.read_csv("russian3/translations.csv")
expressions_words_csv = pd.read_csv("russian3/expressions_words.csv")

# Drop the German translations: only rows with lang == "en" are kept.
# The row count is printed before and after the filter.
print(len(translations_csv))
translations_csv = translations_csv.loc[translations_csv["lang"] == "en"]
print(len(translations_csv))

In [None]:
def get_accented(derived_from_word_id: int):
    """Return the stress-converted ``accented`` spelling of a word in ``words``.

    Args:
        derived_from_word_id: Value to look up in the ``id`` column of the
            module-level ``words`` frame. May be NaN/None when the caller has
            no id to look up.

    Returns:
        ``convertStress`` applied to the ``accented`` value of the first row
        whose ``id`` matches, or ``""`` when the id is NaN/None, when no row
        in ``words`` has that id, or when the stored ``accented`` value is NaN.
    """
    # `words["id"] == NaN` matches nothing, and `.iloc[0]` on the resulting
    # empty selection would raise IndexError, so bail out early.
    if pd.isna(derived_from_word_id):
        return ""

    matches = words.loc[words["id"] == derived_from_word_id, "accented"]
    # The id may simply not be present in `words` (for example because the row
    # was filtered out before this helper is called).
    if matches.empty:
        return ""

    accented = matches.iloc[0]
    # Same NaN -> "" convention the other helpers apply before convertStress.
    return "" if pd.isna(accented) else convertStress(accented)

def get_translations(word_id: int):
    """Collect the translation rows of ``translations_csv`` for one word.

    Args:
        word_id: Value matched against the ``word_id`` column.

    Returns:
        A list with one ``[tl, example_ru, example_tl, info]`` entry per
        matching row, in table order. NaN cells become ``""``; a non-NaN
        ``example_ru`` is passed through ``convertStress``.
    """
    def _blank_if_nan(cell):
        # Missing cells are represented as empty strings in the output.
        return "" if pd.isna(cell) else cell

    matches = translations_csv[translations_csv["word_id"] == word_id]
    return [
        [
            _blank_if_nan(entry["tl"]),
            "" if pd.isna(entry["example_ru"]) else convertStress(entry["example_ru"]),
            _blank_if_nan(entry["example_tl"]),
            _blank_if_nan(entry["info"]),
        ]
        for _, entry in matches.iterrows()
    ]

def get_expressions(word_id: int):
    """List the expressions that reference a word.

    Args:
        word_id: Value matched against ``referenced_word_id`` in
            ``expressions_words_csv``.

    Returns:
        A list of ``[accented, translations]`` pairs, one per linked
        ``expression_id``: ``accented`` comes from ``get_accented`` and
        ``translations`` is the first field of each ``get_translations``
        entry joined with ``"; "``.
    """
    references_word = expressions_words_csv["referenced_word_id"] == word_id
    linked_ids = expressions_words_csv.loc[references_word, "expression_id"].values.tolist()
    return [
        [
            get_accented(linked_id),
            "; ".join(entry[0] for entry in get_translations(linked_id)),
        ]
        for linked_id in linked_ids
    ]

def get_sentences(word_id: int):
    """Placeholder: sentence lookup is not implemented yet.

    Args:
        word_id: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None

def get_relateds(word_id: int):
    """Placeholder: related-word lookup is not implemented yet.

    Args:
        word_id: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None


In [None]:
def noun_get_info(word_id: int):
    """Return the noun attributes stored in ``nouns_csv`` for one word.

    Args:
        word_id: Value matched against the ``word_id`` column of ``nouns_csv``.

    Returns:
        A dict with keys ``gender`` (raw cell value), ``partner``
        (stress-converted, ``""`` when NaN) and the boolean flags
        ``indeclinable``, ``animate``, ``sg_only`` and ``pl_only``. A flag
        whose cell is NaN is reported as ``False``.

    Raises:
        IndexError: if ``nouns_csv`` has no row for ``word_id``.
    """
    row = nouns_csv[nouns_csv["word_id"] == word_id].iloc[0]

    def _flag(column):
        # NaN is truthy in Python (`bool(float("nan"))` is True), so a missing
        # flag must be checked explicitly or it would be reported as set.
        value = row[column]
        return False if pd.isna(value) else bool(value)

    info = {
        "gender": row["gender"],
        "partner": convertStress(row["partner"]) if not pd.isna(row["partner"]) else "",
        "indeclinable": _flag("indeclinable"),
        "animate": _flag("animate"),
        "sg_only": _flag("sg_only"),
        "pl_only": _flag("pl_only"),
    }
    return info

def noun_get_declension(word_id: int):
    """Build the declension table of a noun from ``words_forms_csv``.

    Args:
        word_id: Value matched against the ``word_id`` column.

    Returns:
        A dict keyed by the twelve ``ru_noun_{sg,pl}_{case}`` form types, in
        the order listed below. Each value is ``convertStress`` applied to the
        matching ``form`` cells joined with ``", "`` (an empty string is
        passed when there are none). Rows whose ``form`` is NaN are skipped.
    """
    table = words_forms_csv[words_forms_csv["word_id"] == word_id]
    declension_list = [
        "ru_noun_sg_nom", "ru_noun_sg_gen", "ru_noun_sg_dat",
        "ru_noun_sg_acc", "ru_noun_sg_inst", "ru_noun_sg_prep",
        "ru_noun_pl_nom", "ru_noun_pl_gen", "ru_noun_pl_dat",
        "ru_noun_pl_acc", "ru_noun_pl_inst", "ru_noun_pl_prep",
    ]
    declension_dict = {}
    for dec in declension_list:
        # str.join raises TypeError on a NaN (float) item, so drop missing
        # forms before joining.
        forms = table.loc[table["form_type"] == dec, "form"].dropna()
        declension_dict[dec] = convertStress(", ".join(forms))
    return declension_dict

In [None]:
# Build `word_dict`: bare spelling -> list of entry dicts, one per row of
# `words` sharing that spelling (several rows may have the same bare form).
word_dict={}
for i,row in words.iterrows():
    word_id=row["id"]
    bare=row["bare"]
    accented=row["accented"]
    derived_from_word_id=row["derived_from_word_id"]
    rank=row["rank"]
    usage_en=row["usage_en"]
    Type=row["type"]

    temp_dict={
        "id": word_id,
        "overview":{
            "Type": Type,
            "accented": accented,
            # A NaN id has nothing to look up: `words["id"] == NaN` selects no
            # rows and the lookup's `.iloc[0]` would raise IndexError, so
            # guard here and store "" instead.
            "derived_from_word": get_accented(derived_from_word_id) if not pd.isna(derived_from_word_id) else "",
            "rank": rank
        },
        "translations": get_translations(word_id),
        "usage": usage_en,
        "expressions": get_expressions(word_id),
        "sentences": get_sentences(word_id),
        "relateds": get_relateds(word_id),
    }

    # Nouns additionally carry grammatical attributes and a declension table.
    if Type=="noun":
        temp_dict["overview"]["extra"]=noun_get_info(word_id)
        temp_dict["declension"]=noun_get_declension(word_id)

    # setdefault creates the per-spelling list on first sight of `bare`.
    word_dict.setdefault(bare, []).append(temp_dict)