In [1]:
from nltk.corpus import wordnet as wn
import re
from collections import Counter
import pandas as pd
from plotly import express as px

In [2]:
# Sanity check of the WordNet lookup: fetch the definition of the synset
# named 'man.n.10' (the cell output shows this is the board-game-piece sense).
wn.synset('man.n.10').definition()

'game equipment consisting of an object used in playing certain board games'

In [5]:
def parseResults(fn):
    """Extract annotated-word records from a results file.

    The file is read as text and scanned for fragments shaped like
    ``(<int>, <int>, <word>, Synset('<name>')``.

    Args:
        fn: path of the results file to read.

    Returns:
        A list of 4-tuples ``(int, int, word, synset)`` in file order. The two
        integers are presumably a paragraph index and a word index (TODO
        confirm against whatever writes the file); ``word`` is the raw text
        captured between the second integer and ``Synset(``; ``synset`` is the
        object returned by ``wn.synset`` for the captured synset name.
    """
    with open(fn) as handle:
        text = handle.read()
    recordPattern = re.compile(r"\(([0-9]+), ([0-9]+), (.*?), Synset\('(.*?)'\)")
    return [
        (int(firstIdx), int(secondIdx), token, wn.synset(synName))
        for firstIdx, secondIdx, token, synName in recordPattern.findall(text)
    ]

In [10]:
def categorizeWords(data, minDepth=5, maxDepth=0):
    """Attach a chain of hypernym category names to every counted synset.

    Args:
        data: mapping of synset -> count (``main`` passes a ``Counter``).
            Despite the function name, the keys must be synset objects,
            because they are handed to ``getHypernymLevelN``.
        minDepth: first (deepest) hypernym level to look up.
        maxDepth: exclusive lower bound; levels ``minDepth`` down to
            ``maxDepth + 1`` are looked up, in that order.

    Returns:
        A list with one row per key of ``data``:
        ``[count, synset, name@minDepth, ..., name@(maxDepth + 1)]``.
    """
    rows = []
    for synset, count in data.items():
        ancestorNames = [
            getHypernymLevelN(synset, level).name()
            for level in range(minDepth, maxDepth, -1)
        ]
        rows.append([count, synset] + ancestorNames)
    return rows

def getHypernymLevelN(synset, n):
    """Climb the hypernym chain until the synset is at most ``n`` levels deep.

    At each step the first entry of ``hypernyms()`` is followed. The climb
    stops as soon as ``min_depth()`` is no greater than ``n``, or when the
    current synset reports no hypernyms.

    Args:
        synset: object exposing ``min_depth()`` and ``hypernyms()``.
        n: target depth.

    Returns:
        The synset where the climb stopped. Its depth can be smaller than
        ``n`` (a step may skip levels) or larger than ``n`` (the chain ran
        out of hypernyms first).
    """
    current = synset
    while True:
        if current.min_depth() <= n:
            return current
        parents = current.hypernyms()
        if len(parents) == 0:
            return current
        current = parents[0]

In [39]:

def makeChart(colorWithCats, name):
    """Build a treemap from categorized rows and save it as ``<name>.html``.

    Args:
        colorWithCats: row-oriented data accepted by ``pd.DataFrame``.
            Column 0 drives both tile size and tile colour; the remaining
            columns form the treemap hierarchy. Presumably this is the output
            of ``categorizeWords`` -- confirm against the caller.
        name: used in the chart title and as the stem of the output file
            (``name + '.html'``).

    Returns:
        The ``DataFrame`` built from ``colorWithCats``.
    """
    df = pd.DataFrame(colorWithCats)
    # The hierarchy path walks the columns from the last one down to column 1;
    # column 0 is left out because it holds the numeric values.
    fig = px.treemap(df, path=df.columns[-1:0:-1],
                     values=0,
                     color=0,
                     #color_continuous_scale=getColorScale(name),
                     color_continuous_midpoint=df[0].mean(),
                     title='Breakdown of objects in ' + name
                     )
    # Write as UTF-8 explicitly: with the platform default encoding, non-ASCII
    # characters in the generated HTML can fail to encode or be garbled.
    with open(name + '.html', 'w', encoding='utf-8') as f:
        f.write(fig.to_html())
    return df

In [26]:
def percentInCat(categorized, query):
    """Return the share of rows that fall under a given category.

    For instance: "the objects in this text are X% artifacts".

    Args:
        categorized: sequence of rows shaped like
            ``[count, word, category, category, ...]``. Only the entries from
            index 2 onward are searched, so a count or word that happens to
            equal ``query`` is never matched.
        query: category name to look for.

    Returns:
        A fraction between 0 and 1 (not multiplied by 100). Every row counts
        once; the count stored in ``row[0]`` is not used as a weight.
        Returns 0.0 when ``categorized`` is empty instead of dividing by zero.
    """
    total = len(categorized)
    if total == 0:
        return 0.0
    hits = sum(1 for row in categorized if query in row[2:])
    return hits / total

In [34]:
def objectPercentages(categorized, cats=['artifact.n.01', 'living_thing.n.01', 'natural_object.n.01']):
    """Compute the share of rows belonging to each requested category.

    Args:
        categorized: rows passed on to ``percentInCat`` unchanged.
        cats: category names to evaluate. The default list is only iterated,
            never modified.

    Returns:
        A dict mapping each name in ``cats`` to the value ``percentInCat``
        returns for it, in the order the names were given.
    """
    return {cat: percentInCat(categorized, cat) for cat in cats}

In [37]:
def main(fn):
    """Run the whole pipeline for one results file.

    The file is parsed, the synset of each record (fourth tuple field) is
    tallied, the tallied synsets are categorized, and the per-category shares
    are computed with the default categories of ``objectPercentages``.

    Args:
        fn: path of the results file.

    Returns:
        A single-entry dict ``{fn: <result of objectPercentages>}``.
    """
    synsetCounts = Counter(record[3] for record in parseResults(fn))
    return {fn: objectPercentages(categorizeWords(synsetCounts))}

In [None]:
if __name__ == "__main__":
    # The guard previously had no body, which is a syntax error. No
    # script-mode behaviour is defined yet, so keep it as an explicit no-op.
    pass

In [38]:
# Run the pipeline on a single results file. The displayed dict maps the file
# name to, for each default category, the fraction of distinct synsets in the
# file that fall under that category.
main('results/1880-BrameCharlotteM-DoraThorne-2374.0.json')

{'results/1880-BrameCharlotteM-DoraThorne-2374.0.json': {'artifact.n.01': 0.3851508120649652,
  'living_thing.n.01': 0.08004640371229699,
  'natural_object.n.01': 0.042923433874709975}}