In [10]:
import json
import pandas as pd
import numpy as np

In [11]:
df = pd.read_excel(
    r"word_list\japanese\NLT1.40_freq_list.xlsx",
    sheet_name="NLT 1.40頻度リスト",
)

# extract and rename columns
df = df[["レマ", "読み", "頻度"]].rename(
    columns={"レマ": "annotation", "読み": "word", "頻度": "count"}
)

# cleanse words
df = df[np.isfinite(df["count"])]
df = df.dropna()

# fix format
df["word"] = df["word"].astype(str)
df["count"] = df["count"].astype(int)
agg_functions = {"word": "first", "annotation": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.5)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,annotation,count,word_len
0,ℓ,ℓ,564,1
1,々,々,73050,1
2,〆,〆,1170,1
3,っ,っ,133403,1
4,ゝ,ゝ,8109,1


In [12]:
df.dtypes

word          object
annotation    object
count          int32
word_len       int64
dtype: object

In [13]:
df.describe()

Unnamed: 0,count,word_len
count,37144.0,37144.0
mean,25563.08,4.421979
std,560708.2,1.347253
min,158.0,1.0
25%,480.0,4.0
50%,1291.0,4.0
75%,4833.25,5.0
max,57264470.0,13.0


In [14]:
df[(df["word_len"] >= 3) & (df["word_len"] <= 5) & (df["count"] > 10)].describe()

Unnamed: 0,count,word_len
count,28088.0,28088.0
mean,11712.86,4.00502
std,62469.41,0.728651
min,158.0,3.0
25%,511.0,3.0
50%,1398.0,4.0
75%,5135.25,5.0
max,4111651.0,5.0


In [15]:
df.query("'タビタビ' in word")

Unnamed: 0,word,annotation,count,word_len
19539,タビタビ,たびたび,11536,4


In [25]:
difficulties = {"easy": 0.96, "medium": 0.9, "hard": 0.7, "all": 0}
min_len = 4
max_len = 5

for name, base_percentile in difficulties.items():
    dfs_filtered: list[pd.DataFrame] = []
    lengths: list[str] = []
    for l in range(min_len, max_len + 1):
        df_len = df[df["word_len"] == l]
        # longer words need larger vocab pool
        percentile = max(base_percentile - (l - min_len) * 0.03, 0)
        cutoff = df_len["count"].quantile(percentile)
        dfs_filtered.append(
            df_len[df_len["count"] > cutoff]
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        lengths.append(f"len{l}: #{len(dfs_filtered[-1])}")
    df_filtered = pd.concat(dfs_filtered)

    print(", ".join(lengths))

    with open(rf"vocabs\ja\ja-{name}.json", "w") as f:
        data = df_filtered["word"].sort_values().tolist()
        json.dump(data, f)
        print(f"{f.name}: {len(df_filtered)} words")

    if name == "all":
        with open(rf"vocabs\ja\ja-annotations.json", "w") as f:
            data = {}
            for index, row in df_filtered.iterrows():
                data[row["word"]] = row["annotation"]
            json.dump(data, f)
            print(f"{f.name}: {len(df_filtered)} annotations")

len4: #527, len5: #527
vocabs\ja\ja-easy.json: 1054 words
len4: #1318, len5: #979
vocabs\ja\ja-medium.json: 2297 words
len4: #3953, len5: #2483
vocabs\ja\ja-hard.json: 6436 words
len4: #13155, len5: #7517
vocabs\ja\ja-all.json: 20672 words
vocabs\ja\ja-annotations.json: 20672 annotations
