In [48]:
from word2word import Word2word
import json
import re
import unicodedata

def clean_string(s: str) -> str:
    """Return ``s`` unchanged if it holds only ASCII letters, otherwise ``""``.

    Args:
        s: Candidate string to validate.

    Returns:
        ``s`` itself when no character outside ``a-z`` / ``A-Z`` occurs in it
        (the empty string trivially qualifies), or ``""`` as soon as a digit
        or any other non-letter character is present.
    """
    # A single hit of a digit or non-letter anywhere disqualifies the string.
    return "" if re.search(r"[0-9]|[^a-zA-Z]", s) else s

def clean_word(word: str) -> str:
    """
    Normalize a single word and keep it only if it is purely ASCII letters.

    Steps:
        - Lowercase the word and strip surrounding whitespace.
        - Apply Unicode NFKD normalization.
        - Pass the result to ``clean_string``, which returns it unchanged
          when every character is in ``a-z`` / ``A-Z`` and returns ``""``
          otherwise.

    Note:
        Nothing is repaired or removed character-by-character: a word that
        still contains a digit, apostrophe, hyphen, space or any other
        non-letter after normalization is rejected as a whole. NFKD splits
        an accented letter into a base letter plus a combining mark, so
        accented words are rejected as well.

    Returns:
        str: the normalized word, or "" if the word was rejected.
    """
    normalized = unicodedata.normalize("NFKD", word.lower().strip())
    return clean_string(normalized)



# Word2word instance for the "en" -> "vi" language pair.
en2vi = Word2word("en", "vi")

# The keys of the instance's word2x mapping serve as the raw English vocabulary.
english_words = list(en2vi.word2x.keys())

# Clean every word exactly once (the previous version called clean_word twice
# per word: once in the filter and once for the value). clean_word returns ""
# for rejected words, so falsy results are dropped; the set comprehension
# removes duplicates that appear after normalization.
list_english_cleaned = list({cleaned for cleaned in map(clean_word, english_words) if cleaned})

# Deterministically ordered copy; this is what the later cells iterate over.
list_english_sorted = sorted(list_english_cleaned)

print(len(list_english_cleaned))      # total number of unique cleaned English words
print(list_english_cleaned[:20])      # preview 20 words (set-derived order, i.e. arbitrary)


61430
['nativity', 'appliances', 'avenue', 'warlord', 'cicatrisation', 'handmade', 'affectionately', 'sankara', 'literahhy', 'muskrats', 'lihua', 'habilis', 'reexamine', 'sawyer', 'ogether', 'near', 'lonelier', 'popeyes', 'suns', 'apron']


In [49]:
from nltk.corpus import wordnet as wn

# One output list per part of speech, plus a catch-all for everything else.
list_verbs, list_adverbs, list_adjectives, list_nouns, list_rests = [], [], [], [], []

# (WordNet POS tags, destination list) pairs. The order mirrors the order in
# which a word is tested: verb, noun, adjective ('a' or 's'), adverb.
pos_buckets = (
    ({'v'}, list_verbs),
    ({'n'}, list_nouns),
    ({'a', 's'}, list_adjectives),
    ({'r'}, list_adverbs),
)

for word in list_english_sorted:
    # Distinct POS tags over all synsets of the word; empty when WordNet
    # returns no synsets for it.
    tags = {syn.pos() for syn in wn.synsets(word)}

    placed = False
    for bucket_tags, bucket in pos_buckets:
        # A word lands in every bucket whose tags it carries, at most once each.
        if tags & bucket_tags:
            bucket.append(word)
            placed = True

    # No synsets at all, or none with a recognised POS tag.
    if not placed:
        list_rests.append(word)


In [50]:
# Print the size of each bucket, one count per line, in this order:
# verbs, adverbs, adjectives, nouns, rest.
for bucket in (list_verbs, list_adverbs, list_adjectives, list_nouns, list_rests):
    print(len(bucket))




14850
1391
7947
27004
21721


In [51]:
# Collect the five word lists under their own names so the JSON file can be
# read back key by key.
pos_payload = {
    'list_verbs': list_verbs,
    'list_adverbs': list_adverbs,
    'list_adjectives': list_adjectives,
    'list_nouns': list_nouns,
    'list_rests': list_rests,
}

# Overwrites temporarily.json in the current working directory.
with open('temporarily.json', 'w', encoding='utf-8') as out_file:
    json.dump(pos_payload, out_file, indent=4)