# Content to Form

In [2]:
import nltk
import pandas
from nltk.corpus import wordnet as wn
from heapq import heappush, heappop

## Loading

In [3]:
# Load the spreadsheet into a DataFrame. The preprocessing cell below skips
# its first column and treats the non-empty cells of every other column as
# text to be tokenized.
file = pandas.read_excel('res/content-to-form.xlsx')

## Preprocessing

In [4]:
columns = file.columns

# Stop words live in a set so every membership test is O(1). 'just' and
# 'will' are taken out of the stop list so they survive the filtering below.
stopwords = set(nltk.corpus.stopwords.words('english')) - {'just', 'will'}
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stemmer = nltk.stem.PorterStemmer()

# Build one bag of words per column (the first column is skipped): every
# non-empty cell is tokenized, lowercased, stripped of stop words and stemmed,
# and all cells of a column are merged into a single set.
defs = []
for name in columns[1:]:
    definition = set()
    # dropna() discards the empty cells that read_excel leaves as NaN.
    for text in file[name].dropna():
        definition.update(
            stemmer.stem(w.lower())
            for w in tokenizer.tokenize(text)
            if w.lower() not in stopwords
        )
    defs.append(definition)

## Similarity

In [5]:
def sim(a, b):
    """Return the overlap coefficient of two sets.

    The score is ``|a & b| / min(|a|, |b|)``, so it is 1.0 when the smaller
    set is fully contained in the larger one and 0.0 when the sets are
    disjoint.

    Args:
        a: First set of items.
        b: Second set of items.

    Returns:
        A float in ``[0.0, 1.0]``. If either set is empty there is nothing
        to overlap with, so 0.0 is returned instead of dividing by zero.
    """
    if not a or not b:
        return 0.0
    return len(a & b) / min(len(a), len(b))

In [6]:
def syn_set(synset):
    """Return the bag of stemmed words that describes `synset`.

    The bag is drawn from three text sources, in this order: the synset's
    definition, its usage examples, and its lemma names (with underscores
    turned into spaces). Each text is tokenized with the module-level
    `tokenizer`; tokens whose lowercase form is in `stopwords` are dropped and
    the rest are lowercased and stemmed with `stemmer`.
    """
    texts = [synset.definition()]
    texts.extend(synset.examples())
    texts.extend(lemma.name().replace('_', ' ') for lemma in synset.lemmas())

    return {
        stemmer.stem(token.lower())
        for text in texts
        for token in tokenizer.tokenize(text)
        if token.lower() not in stopwords
    }

## Algorithm

In [7]:
def content_to_form(definition, max_steps=10000):
    """Find the WordNet synset whose bag of words best matches `definition`.

    Performs a best-first search over hyponyms starting from
    ``entity.n.01``: the highest-scoring synset seen so far is expanded
    first, every hyponym is scored with ``sim(definition, syn_set(hypo))``
    and pushed onto the frontier.

    Args:
        definition: Set of stemmed words to match against.
        max_steps: Number of consecutive expansions that may fail to improve
            the best score before the search gives up.

    Returns:
        A ``(synset, score)`` tuple with the best synset found and its
        similarity to `definition`.
    """
    best_syn = wn.synset('entity.n.01')
    max_score = sim(definition, syn_set(best_syn))

    # heapq is a min-heap, so scores are stored negated to pop the best one
    # first. When two scores tie, tuple comparison falls through to comparing
    # the Synset objects themselves.
    frontier = []
    heappush(frontier, (-max_score, best_syn))
    stale_steps = 0  # consecutive expansions without a new best score

    while stale_steps < max_steps and frontier:
        _, syn = heappop(frontier)
        improved = False
        for hypo in syn.hyponyms():
            score = sim(definition, syn_set(hypo))
            heappush(frontier, (-score, hypo))
            if score > max_score:
                best_syn = hypo
                max_score = score
                improved = True
        # Any improvement during this expansion resets the patience counter.
        stale_steps = 0 if improved else stale_steps + 1

    return (best_syn, max_score)

In [8]:
# Run the search on every column's bag of words and print the
# (synset, score) pair it returns.
for definition in defs:
    match = content_to_form(definition)
    print(match)

(Synset('justice.n.01'), 0.75)
(Synset('hard_times.n.01'), 1.0)
(Synset('desire.n.03'), 1.0)
(Synset('regulation.n.03'), 0.75)
(Synset('animal_order.n.01'), 1.0)
(Synset('heater.n.01'), 0.75)
(Synset('landing.n.02'), 0.8571428571428571)
(Synset('slice.n.05'), 0.8333333333333334)
