In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw frequency list into `word_list` as [word, count] pairs.
# Each line is whitespace-separated: a leading id field (ignored), one or
# more word tokens, and a trailing integer count.
word_list = []
# Build the path from components so it is not tied to Windows separators.
source_path = Path("word_list") / "spanish" / "spa_news_2023_100K-words.txt"
with source_path.open(encoding="utf-8") as file:
    # Iterate the handle directly; there is no need to hold every line in memory.
    for line in file:
        items = line.split()
        if len(items) < 3:
            # Blank or truncated line: there is no word between id and count,
            # and unpacking it would either fail or yield an empty word.
            continue
        _, *words, count = items
        # Multi-token entries are kept as a single space-joined string.
        word = " ".join(words)
        word_list.append([word, int(count)])
        # Diagnostic: echo the raw fields of every entry containing "à".
        if "à" in word:
            print(items)

['16586', 'Borràs', '9']
['24633', 'Botànic', '5']
['28119', 'Adrià', '4']
['33751', 'Benicàssim', '3']
['36200', 'Sant', 'Adrià', '3']
['43739', 'Cerdà', '2']
['44105', 'Cornellà', '2']
['44396', 'Democràtic', '2']
['46295', 'Jordà', '2']
['46696', 'Laura', 'Borràs', '2']
['49738', 'Tsunami', 'Democràtic', '2']
['49766', 'Tàpies', '2']
['49937', 'Valencià', '2']
['50224', 'Xàbia', '2']
['50225', 'Xàtiva', '2']
['65460', 'Almardà', '1']
['65482', 'Almàssera', '1']
['66553', 'Artà', '1']
['66918', 'Ausiàs', '1']
['66919', 'Ausiàs', 'March', '1']
['67353', 'Bagà', '1']
['67428', 'Ballescà', '1']
['67556', 'Barcelona-Gavà', '1']
['67944', 'Benigànim', '1']
['68005', 'Berguedà', '1']
['70315', 'Castillo', 'de', 'Xàtiva', '1']
['70346', 'Catalunya', 'Ràdio', '1']
['70684', 'Ceràmica', '1']
['72447', 'Cornellà', 'del', 'Terri', '1']
['75859', 'Empordà', '1']
['75931', 'Encina,-Xàtiva-València', '1']
['75932', 'Encina-Xàtiva', '1']
['76364', 'Espinàs', '1']
['76618', 'Eulàlia', '1']
['77496',

In [3]:
# Build the vocabulary table from the [word, count] pairs in `word_list`.
df = pd.DataFrame(word_list, columns=["word", "count"])

# only lowercase word: case-fold, then sum the counts of entries that
# differed only by case so each word appears exactly once.
df = (
    df.assign(word=df["word"].astype(str).str.lower())
    .groupby("word", as_index=False)["count"]
    .sum()
)

# allowed alphabets: drop words containing any of the listed characters,
# and anything that is not purely alphabetic (spaces, hyphens, digits, ...).
has_excluded_char = df["word"].str.contains("[ãçëîôõöü]", regex=True)
is_alphabetic = df["word"].str.isalpha()

# `.assign` on the filtered frame returns a new frame, so the new column is
# never written onto a slice of the original (no SettingWithCopyWarning).
df = (
    df[~has_excluded_char & is_alphabetic]
    .assign(word_len=lambda frame: frame["word"].str.len())
    .sort_values(by="word")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,word,count,word_len
0,a,46697,1
1,aa,11,2
2,aaa,1,3
3,aalen,1,5
4,aaliya,1,6


In [4]:
# Inspect the column dtypes of the cleaned vocabulary table.
df.dtypes

word        object
count        int64
word_len     int64
dtype: object

In [5]:
# Summary statistics for the numeric columns (`count` and `word_len`).
df.describe()

Unnamed: 0,count,word_len
count,87530.0,87530.0
mean,24.887136,8.140009
std,819.766022,2.723632
min,1.0,1.0
25%,1.0,6.0
50%,2.0,8.0
75%,5.0,10.0
max,149973.0,28.0


In [6]:
# Summary statistics restricted to four-letter words seen more than once.
seen_more_than_once = df["count"] > 1
has_four_letters = df["word_len"] == 4
df[seen_more_than_once & has_four_letters].describe()

Unnamed: 0,count,word_len
count,2213.0,2213.0
mean,74.127881,4.0
std,580.950371,0.0
min,2.0,4.0
25%,2.0,4.0
50%,5.0,4.0
75%,16.0,4.0
max,20988.0,4.0


In [7]:
# Spot-check: look up the entry for "libros" (the saved output shows one matching row).
df.query("'libros' in word")

Unnamed: 0,word,count,word_len
49738,libros,130,6


In [8]:
# Export one JSON word list per (word length, difficulty) combination.
# Each difficulty maps to a quantile of the overall `count` distribution;
# a word is included when its count is strictly above that quantile.
difficulties = {"easy": 0.93, "medium": 0.75, "hard": 0.5, "all": 0}
lengths = [4]

# Build the output location from components so it is not tied to Windows
# separators, and create it so the cell works on a fresh checkout.
output_dir = Path("vocabs") / "es"
output_dir.mkdir(parents=True, exist_ok=True)

# The cutoffs depend only on the difficulty, not on the word length, so
# compute them once instead of once per (length, difficulty) pair.
cutoffs = {name: df["count"].quantile(percentile) for name, percentile in difficulties.items()}

for length in lengths:
    for name, cutoff in cutoffs.items():
        # `df_filtered` is ordered by descending count; the exported list
        # below is re-sorted alphabetically.
        df_filtered = (
            df[(df["count"] > cutoff) & (df["word_len"] == length)]
            .sort_values(by=["count"], ascending=False)
            .reset_index(drop=True)
        )
        output_path = output_dir / f"es-len{length}-{name}.json"
        with output_path.open("w", encoding="utf-8") as f:
            data = df_filtered["word"].sort_values().tolist()
            json.dump(data, f)
        print(f"{output_path}: {len(df_filtered)} words")

vocabs\es\es-len4-easy.json: 386 words
vocabs\es\es-len4-medium.json: 1050 words
vocabs\es\es-len4-hard.json: 1622 words
vocabs\es\es-len4-all.json: 2213 words
