In [9]:
import json
import pandas as pd

In [10]:
word_list = []
with open(r"word_list\spanish\es_50k.txt", encoding="utf-8") as file:
    for line in file.readlines():
        items = line.split()
        *words, count = items
        word = " ".join(words)
        word_list.append([word, int(count)])
        if "à" in word:
            print(items)

['està', '2664']
['màs', '1597']
['voilà', '919']
['estàs', '587']
['papà', '530']
['déjà', '411']
['mamà', '370']
['estàn', '348']
['serà', '307']
['quizà', '208']


In [14]:
df = pd.DataFrame(word_list)
df.columns = ["word", "count"]

# only lowercase word
df["word"] = df["word"].astype(str).apply(lambda x: x.lower())
agg_functions = {"word": "first", "count": "sum"}
df = df.groupby(df["word"], as_index=False).aggregate(agg_functions)

# allowed alphabets
df = df[~df["word"].str.contains("[ãçëîôõöü]")]
df = df[df["word"].str.isalpha()]

# frequent words
df = df[df["count"] > df["count"].quantile(0.1)]

df["word_len"] = df["word"].apply(len)
df = df.sort_values(by=["word"]).reset_index(drop=True)
df.head()

Unnamed: 0,word,count,word_len
0,a,9549646,1
1,aa,865,2
2,aaa,281,3
3,aaaah,348,5
4,aaah,848,4


In [15]:
df.dtypes

word        object
count        int64
word_len     int64
dtype: object

In [16]:
df.describe()

Unnamed: 0,count,word_len
count,44521.0,44521.0
mean,9236.393,7.536174
std,167846.4,2.38124
min,224.0,1.0
25%,367.0,6.0
50%,706.0,7.0
75%,1965.0,9.0
max,14459520.0,19.0


In [19]:
df[(df["word_len"] >= 3) & (df["word_len"] <= 5) & (df["count"] > 1)].describe()

Unnamed: 0,count,word_len
count,8771.0,8771.0
mean,19515.7,4.43165
std,203050.1,0.704585
min,224.0,3.0
25%,405.0,4.0
50%,929.0,5.0
75%,3483.0,5.0
max,14421000.0,5.0


In [21]:
df.query("'libro' in word or 'libros' in word")

Unnamed: 0,word,count,word_len
25082,libro,64445,5
25083,libros,28603,6


In [23]:
difficulties = {"easy": 0.85, "medium": 0.6, "hard": 0.4, "all": 0}
min_len = 4
max_len = 5

for name, base_percentile in difficulties.items():
    dfs_filtered: list[pd.DataFrame] = []
    lengths: list[str] = []
    for l in range(min_len, max_len + 1):
        df_len = df[df["word_len"] == l]
        # longer words need larger vocab pool
        percentile = max(base_percentile - (l - min_len) * 0.05, 0)
        cutoff = df_len["count"].quantile(percentile)
        dfs_filtered.append(
            df_len[df_len["count"] > cutoff]
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        lengths.append(f"len{l}: #{len(dfs_filtered[-1])}")
    df_filtered = pd.concat(dfs_filtered)

    print(", ".join(lengths))

    with open(rf"vocabs\es\es-{name}.json", "w") as f:
        data = df_filtered["word"].sort_values().tolist()
        json.dump(data, f)
        print(f"{f.name}: {len(df_filtered)} words")

len4: #418, len5: #978
vocabs\es\es-easy.json: 1396 words
len4: #1113, len5: #2199
vocabs\es\es-medium.json: 3312 words
len4: #1670, len5: #3176
vocabs\es\es-hard.json: 4846 words
len4: #2775, len5: #4878
vocabs\es\es-all.json: 7653 words
