In [45]:
import json
import pandas as pd
import numpy as np

In [46]:
df = pd.read_excel(
    r"word_list\japanese\NLT1.40_freq_list.xlsx",
    sheet_name="NLT 1.40頻度リスト",
)

# extract and rename columns
df = df[["レマ", "読み", "頻度"]].rename(
    columns={"レマ": "annotation", "読み": "word", "頻度": "count"}
)

# cleanse words
df = df[np.isfinite(df["count"])]
df = df.dropna()

# fix format
df["word"] = df["word"].astype(str)
df["count"] = df["count"].astype(int)
agg_functions = {"word": "first", "annotation": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.82)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,annotation,count,word_len
0,々,々,73050,1
1,っ,っ,133403,1
2,ゝ,ゝ,8109,1
3,ゞ,ゞ,3475,1
4,ア,あ,64415,1


In [47]:
df.dtypes

word          object
annotation    object
count          int32
word_len       int64
dtype: object

In [48]:
df.describe()

Unnamed: 0,count,word_len
count,13380.0,13380.0
mean,69497.8,4.053737
std,932634.5,1.262307
min,2494.0,1.0
25%,4207.0,3.0
50%,8226.0,4.0
75%,23134.25,5.0
max,57264470.0,10.0


In [49]:
df[(df["word_len"] >= 3) & (df["word_len"] <= 5) & (df["count"] > 10)].describe()

Unnamed: 0,count,word_len
count,10503.0,10503.0
mean,29899.12,3.892031
std,99538.2,0.729416
min,2494.0,3.0
25%,4200.0,3.0
50%,8121.0,4.0
75%,21939.5,4.0
max,4111651.0,5.0


In [50]:
df.query("'タビタビ' in word")

Unnamed: 0,word,annotation,count,word_len
7279,タビタビ,たびたび,11536,4


In [53]:
difficulties = {"easy": 0.89, "medium": 0.6, "hard": 0.4, "all": 0}
min_len = 4
max_len = 5

for name, base_percentile in difficulties.items():
    dfs_filtered: list[pd.DataFrame] = []
    lengths: list[str] = []
    for l in range(min_len, max_len + 1):
        df_len = df[df["word_len"] == l]
        # japanese longer words don't need larger vocab pool
        percentile = max(base_percentile - (l - min_len) * 0.15, 0)
        cutoff = df_len["count"].quantile(percentile)
        dfs_filtered.append(
            df_len[df_len["count"] > cutoff]
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        lengths.append(f"len{l}: #{len(dfs_filtered[-1])}")
    df_filtered = pd.concat(dfs_filtered)

    print(", ".join(lengths))

    with open(rf"vocabs\ja\ja-{name}.json", "w") as f:
        data = df_filtered["word"].sort_values().tolist()
        json.dump(data, f)
        print(f"{f.name}: {len(df_filtered)} words")

    if name == "all":
        with open(rf"vocabs\ja\ja-annotations.json", "w") as f:
            data = {}
            for index, row in df_filtered.iterrows():
                data[row["word"]] = row["annotation"]
            json.dump(data, f)
            print(f"{f.name}: {len(df_filtered)} annotations")

len4: #528, len5: #595
vocabs\ja\ja-easy.json: 1123 words
len4: #1917, len5: #1258
vocabs\ja\ja-medium.json: 3175 words
len4: #2876, len5: #1715
vocabs\ja\ja-hard.json: 4591 words
len4: #4791, len5: #2287
vocabs\ja\ja-all.json: 7078 words
vocabs\ja\ja-annotations.json: 7078 annotations
