In [1]:
import json
import pandas as pd

word_list = []
with open(r"word_list\spanish\es_wordlist.json") as file:
    json_data = json.load(file)
    for d in json_data.items():
        word_list.append([d[0], d[1]])

In [23]:
df = pd.DataFrame(word_list)
df.columns = ["word", "count"]

# only lowercase word
df["word"] = df["word"].astype(str).apply(lambda x: x.lower())
agg_functions = {"word": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[~df["word"].str.contains("[ãçëîôõöü]")]
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.90)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,count,word_len
0,a,22256821,1
1,aa,2017,2
2,aaa,1331,3
3,aaaron,31,6
4,aaarp,29,5


In [24]:
df.dtypes

word        object
count        int64
word_len     int64
dtype: object

In [25]:
df.describe()

Unnamed: 0,count,word_len
count,211918.0,211918.0
mean,4689.581,8.198138
std,248254.1,3.056647
min,27.0,1.0
25%,47.0,6.0
50%,106.0,8.0
75%,422.0,10.0
max,75342590.0,62.0


In [31]:
df.query("count > 10 and word_len == 4").describe()

Unnamed: 0,count,word_len
count,11159.0,11159.0
mean,6332.42,4.0
std,101555.3,0.0
min,27.0,4.0
25%,48.0,4.0
50%,109.0,4.0
75%,410.5,4.0
max,7993370.0,4.0


In [32]:
df.query("'libros' in word")

Unnamed: 0,word,count,word_len
120229,libros,27544,6


In [35]:
difficulties = {"easy": 0.965, "medium": 0.9, "hard": 0.7, "all": 0}
lengths = [4]

for l in lengths:
    for name, percentile in difficulties.items():
        cutoff = df["count"].quantile(percentile)
        df_filtered = (
            df.query(f"count > {cutoff} and word_len == {l}")
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        with open(rf"vocabs\es\es-len{l}-{name}.json", "w") as f:
            data = df_filtered["word"].sort_values().tolist()
            json.dump(data, f)
            print(f"{f.name}: {len(df_filtered)} words")

vocabs\es\es-len4-easy.json: 438 words
vocabs\es\es-len4-medium.json: 1148 words
vocabs\es\es-len4-hard.json: 3312 words
vocabs\es\es-len4-all.json: 10978 words
