In [36]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

In [37]:
# Load the NLT 1.40 frequency list and reduce it to one row per reading.
# The path is built with pathlib so it resolves on any OS, not only Windows.
freq_list_path = Path("word_list") / "japanese" / "NLT1.40_freq_list.xlsx"
freq_raw = pd.read_excel(freq_list_path, sheet_name="NLT 1.40頻度リスト")

# The whole clean-up is a single chain so no column is ever assigned onto a
# filtered slice (the pattern that raises SettingWithCopyWarning).
df = (
    freq_raw[["レマ", "読み", "頻度"]]
    # extract and rename columns
    .rename(columns={"レマ": "annotations", "読み": "word", "頻度": "count"})
    # cleanse words: drop non-finite counts first, then rows with any missing field
    .loc[lambda d: np.isfinite(d["count"])]
    .dropna()
    # fix format
    .astype({"word": str, "count": int})
    # merge rows sharing the same reading: keep the first annotation, sum the counts
    .groupby("word", as_index=False)
    .agg(annotations=("annotations", "first"), count=("count", "sum"))
    # allowed alphabets: keep readings made only of alphabetic characters
    .loc[lambda d: d["word"].str.isalpha()]
    # frequent words: keep readings strictly above the median count
    .loc[lambda d: d["count"] > d["count"].quantile(0.5)]
    .assign(word_len=lambda d: d["word"].str.len())
    .sort_values(by=["word"])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,word,annotations,count,word_len
0,ℓ,ℓ,564,1
1,々,々,73050,1
2,〆,〆,1170,1
3,っ,っ,133403,1
4,ゝ,ゝ,8109,1


In [38]:
# Inspect the column dtypes of the cleaned frequency table.
df.dtypes

word           object
annotations    object
count           int32
word_len        int64
dtype: object

In [39]:
# Summary statistics for the numeric columns (count, word_len) of the cleaned table.
df.describe()

Unnamed: 0,count,word_len
count,37144.0,37144.0
mean,25563.08,4.421979
std,560708.2,1.347253
min,158.0,1.0
25%,480.0,4.0
50%,1291.0,4.0
75%,4833.25,5.0
max,57264470.0,13.0


In [40]:
# Summary statistics for three-character readings whose count exceeds 10.
df.loc[(df["count"] > 10) & (df["word_len"] == 3)].describe()

Unnamed: 0,count,word_len
count,7386.0,7386.0
mean,20095.74,3.0
std,102387.8,0.0
min,158.0,3.0
25%,679.0,3.0
50%,2097.0,3.0
75%,8597.5,3.0
max,4111651.0,3.0


In [41]:
# Spot check: show the row whose reading is exactly タビタビ.
df.loc[df["word"].isin(["タビタビ"])]

Unnamed: 0,word,annotations,count,word_len
19539,タビタビ,たびたび,11536,4


In [44]:
# Export one JSON word list per (length, difficulty) pair, plus a
# reading -> annotation map built from the "all" selection.
# Each difficulty maps to the quantile of the overall `count` distribution
# that a word's count must strictly exceed to be included.
difficulties = {"easy": 0.96, "medium": 0.9, "hard": 0.7, "all": 0}
lengths = [4]

# pathlib keeps the output location portable; the directory is created up
# front so the writes below cannot fail on a fresh checkout.
vocab_dir = Path("vocabs") / "ja"
vocab_dir.mkdir(parents=True, exist_ok=True)

for length in lengths:
    for name, percentile in difficulties.items():
        # The cutoff is taken over all words, not only those of this length.
        cutoff = df["count"].quantile(percentile)
        # NOTE(review): with the strict ">" the "all" level (quantile 0, i.e.
        # the minimum count) omits words tied at the minimum — confirm intended.
        df_filtered = (
            df.query("count > @cutoff and word_len == @length")
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )

        # The word list is written in sorted order.
        words_path = vocab_dir / f"ja-len{length}-{name}.json"
        with words_path.open("w", encoding="utf-8") as f:
            json.dump(df_filtered["word"].sort_values().tolist(), f)
        print(f"{words_path}: {len(df_filtered)} words")

        if name == "all":
            # Reading -> annotation map, in descending-count order.
            annotations = dict(zip(df_filtered["word"], df_filtered["annotations"]))
            annotations_path = vocab_dir / f"ja-len{length}-annotations.json"
            with annotations_path.open("w", encoding="utf-8") as f:
                json.dump(annotations, f)
            print(f"{annotations_path}: {len(df_filtered)} annotations")

vocabs\ja\ja-len4-easy.json: 475 words
vocabs\ja\ja-len4-medium.json: 1242 words
vocabs\ja\ja-len4-hard.json: 3979 words
vocabs\ja\ja-len4-all.json: 13155 words
vocabs\ja\ja-len4-annotations.json: 13155 annotations
