In [1]:
import json
import pandas as pd

In [2]:
df = pd.read_csv(r"word_list\english\unigram_freq.csv")

# only lowercase word
df["word"] = df["word"].apply(lambda x: str(x).lower())
agg_functions = {"word": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.75)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,count,word_len
0,a,9081174698,1
1,aa,30523331,2
2,aaa,10243983,3
3,aaaa,1595769,4
4,aaacn,161080,5


In [3]:
df.dtypes

word        object
count        int64
word_len     int64
dtype: object

In [4]:
df.describe()

Unnamed: 0,count,word_len
count,83333.0,83333.0
mean,6937624.0,7.07338
std,132465900.0,2.685928
min,136576.0,1.0
25%,226275.0,5.0
50%,455274.0,7.0
75%,1492255.0,9.0
max,23135850000.0,32.0


In [5]:
df.query("count > 10000 and word_len >= 3 and word_len <= 5").describe()

Unnamed: 0,count,word_len
count,24372.0,24372.0
mean,10705480.0,4.11981
std,188396700.0,0.814303
min,136577.0,3.0
25%,230404.8,3.0
50%,470107.0,4.0
75%,1595851.0,5.0
max,23135850000.0,5.0


In [6]:
df[(df["word_len"] >= 3) & (df["word_len"] <= 5) & (df["count"] > 10000)].describe()

Unnamed: 0,count,word_len
count,24372.0,24372.0
mean,10705480.0,4.11981
std,188396700.0,0.814303
min,136577.0,3.0
25%,230404.8,3.0
50%,470107.0,4.0
75%,1595851.0,5.0
max,23135850000.0,5.0


In [7]:
df.query("'running' in word")

Unnamed: 0,word,count,word_len
64050,running,72879426,7


In [13]:
difficulties = {"easy": 0.95, "medium": 0.9, "hard": 0.7, "all": 0}
min_len = 4
max_len = 5

for name, base_percentile in difficulties.items():
    dfs_filtered: list[pd.DataFrame] = []
    lengths: list[str] = []
    for l in range(min_len, max_len + 1):
        df_len = df[df["word_len"] == l]
        # longer words need larger vocab pool
        percentile = max(base_percentile - (l - min_len) * 0.07, 0)
        cutoff = df_len["count"].quantile(percentile)
        dfs_filtered.append(
            df_len[df_len["count"] > cutoff]
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        lengths.append(f"len{l}: #{len(dfs_filtered[-1])}")
    df_filtered = pd.concat(dfs_filtered)

    print(", ".join(lengths))

    with open(rf"vocabs\en\en-{name}.json", "w") as f:
        data = df_filtered["word"].sort_values().tolist()
        json.dump(data, f)
        print(f"{f.name}: {len(df_filtered)} words")

len4: #394, len5: #1166
vocabs\en\en-easy.json: 1560 words
len4: #787, len5: #1652
vocabs\en\en-medium.json: 2439 words
len4: #2359, len5: #3595
vocabs\en\en-hard.json: 5954 words
len4: #7861, len5: #9714
vocabs\en\en-all.json: 17575 words
