In [1]:
import json
import pandas as pd

In [2]:
df = pd.read_csv(r"word_list\english\unigram_freq.csv")

# only lowercase word
df["word"] = df["word"].apply(lambda x: str(x).lower())
agg_functions = {"word": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.75)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,count,word_len
0,a,9081174698,1
1,aa,30523331,2
2,aaa,10243983,3
3,aaaa,1595769,4
4,aaacn,161080,5


In [3]:
df.dtypes

word        object
count        int64
word_len     int64
dtype: object

In [4]:
df.describe()

Unnamed: 0,count,word_len
count,83333.0,83333.0
mean,6937624.0,7.07338
std,132465900.0,2.685928
min,136576.0,1.0
25%,226275.0,5.0
50%,455274.0,7.0
75%,1492255.0,9.0
max,23135850000.0,32.0


In [10]:
df.query("count > 10000 and word_len == 5").describe()

Unnamed: 0,count,word_len
count,9715.0,9715.0
mean,7001385.0,5.0
std,36013870.0,0.0
min,136577.0,5.0
25%,225764.5,5.0
50%,463108.0,5.0
75%,1793414.0,5.0
max,1226734000.0,5.0


In [6]:
df.query("'running' in word")

Unnamed: 0,word,count,word_len
64050,running,72879426,7


In [22]:
difficulties = {"easy": 0.95, "medium": 0.9, "hard": 0.7, "all": 0}
lengths = [4]

for l in lengths:
    for name, percentile in difficulties.items():
        cutoff = df["count"].quantile(percentile)
        df_filtered = (
            df.query(f"count > {cutoff} and word_len == {l}")
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        with open(rf"vocabs\en\en-len{l}-{name}.json", "w") as f:
            data = df_filtered["word"].sort_values().tolist()
            json.dump(data, f)
            print(f"{f.name}: {len(df_filtered)} words")

vocabs\en\en-len4-easy.json: 576 words
vocabs\en\en-len4-medium.json: 960 words
vocabs\en\en-len4-hard.json: 2416 words
vocabs\en\en-len4-all.json: 7862 words
