# Look at the words

We load all the words from the dictionary and count them

In [47]:
import json
import os
from glob import glob

import numpy as np
import pandas as pd
from plotly import express as px
from plotly import graph_objects as go

## Load data

In [48]:
filenames = glob(os.path.join("data", "*.json"))

words_all = {}
for name in filenames:
    with open(name, "r") as f:
        words_all.update(json.load(f))

words_df = pd.DataFrame.from_dict(words_all, orient="index")
print("# entries:", len(words_df))

# entries: 108133


In [49]:
display(words_df.head(10))

Unnamed: 0,word,wordset_id,meanings,editors,contributors,labels
a cappella,a cappella,5feb6f679a,"[{'id': '492099d426', 'def': 'without musical ...",,,
AD,AD,76c6ebfae9,"[{'id': '4c21c72afa', 'def': 'in the Christian...",,,
A.D.,A.D.,b7e9d406a0,"[{'id': 'a7482f3e30', 'def': 'in the Christian...",,,
anno Domini,anno Domini,69164a0c91,"[{'id': 'b7bebf59dd', 'def': 'in the Christian...",,,
ahorse,ahorse,125f8f20e1,"[{'id': 'b6428adf79', 'def': 'on the back of a...",,,
ahorseback,ahorseback,795c5656e2,"[{'id': '164d08dc8d', 'def': 'on the back of a...",,,
anisotropically,anisotropically,9e676e43af,"[{'id': 'da4d5a2b10', 'def': 'in an anisotropi...",,,
annoyingly,annoyingly,5088c7d529,"[{'id': '4f73c0859f', 'def': 'in an annoying m...",,,
about,about,0b26c47336,"[{'id': 'c8c0ac13b0', 'def': 'imprecise quanti...",[zellerpress],"[sabreuse, lauradhahn, lefurjah]",
all right,all right,46249e29ca,"[{'id': 'b5e43f1411', 'def': 'in a satisfactor...",,,


In [50]:
meanings_all = []
words_per_meaning = []
for mean_list, word in zip(words_df.meanings.items(), words_df.index):
    # if there is a list of "meaning"-dicts
    if isinstance(mean_list[1], list):
        for meaning in mean_list[1]:
            meanings_all.append(meaning)
            words_per_meaning.append(word)

meanings_df = pd.DataFrame.from_records(meanings_all)
meanings_df = meanings_df.assign(word=words_per_meaning)

display(meanings_df.head())


Unnamed: 0,id,def,example,speech_part,synonyms,labels,-example,word
0,492099d426,without musical accompaniment,they performed a cappella,adverb,,,,a cappella
1,0bf8d49e2e,sung without instrumental accompaniment,they sang an a cappella Mass,adjective,,,,a cappella
2,4c21c72afa,in the Christian era,,adverb,"[A.D., anno Domini]",,,AD
3,a7482f3e30,in the Christian era,,adverb,[AD],,,A.D.
4,b7bebf59dd,in the Christian era,,adverb,[AD],,,anno Domini


In [55]:
## dict of possible POS for each word

speech_parts = meanings_df.groupby("word")["speech_part"].apply(set).apply(tuple)
speech_parts = pd.DataFrame(speech_parts)

# count possible PoS per word:
speech_parts = speech_parts.assign(
    pos_counts=speech_parts.groupby("word")["speech_part"].nunique()
)
display(speech_parts.head())


Unnamed: 0_level_0,speech_part,pos_counts
word,Unnamed: 1_level_1,Unnamed: 2_level_1
#,"(noun,)",1
&,"(noun,)",1
A,"(noun,)",1
A-line,"(noun,)",1
A-list,"(noun,)",1


In [65]:
speech_parts_singles = speech_parts.filter(speech_parts.pos_counts == 1)
print("# entries with :", len(speech_parts))
print("# entries with unique PoS:", len(speech_parts_singles))  # problem!


# entries with : 108128
# entries with unique PoS: 108128


In [61]:
unique_pos_combis = speech_parts.speech_part.unique()
for x in unique_pos_combis:
    print(x)


('noun',)
('adjective',)
('adverb',)
('adjective', 'noun')
('verb',)
('phrase',)
('verb', 'adjective', 'noun')
('verb', 'adjective')
('verb', 'noun')
('verb', 'adverb', 'noun')
('adverb', 'noun')
('noun', 'article')
('adjective', 'adverb')
('adjective', 'adverb', 'noun')
('interjection',)
('preposition',)
('verb', 'adjective', 'adverb', 'noun')
('verb', 'adjective', 'adverb')
('verb', 'adverb')
('noun', 'pronoun')
('pronoun',)
('verb', 'interjection', 'noun')
('phrase', 'adverb')
('verb', 'pronoun')
('preposition', 'adverb')
