# Esercitazione 2

Sempre partendo dai dati sulle definizioni, si richiede di provare a costruire un sistema che utilizzi la molteplicità delle definizioni per risalire al termine "target" in maniera automatica. Non si richiede di "indovinare" ogni termine, ma di avvicinarsi (almeno semanticamente) alla risposta. Provare più soluzioni, includendo meccanismi di filtro delle definizioni (ad es. escludendo quelle meno informative o con caratteristiche particolari), di ricerca nell'albero tassonomico di WordNet (provando a partire da candidati "genus", secondo il principio Genus-Differentia), ecc.

In [13]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

corpus = pd.read_csv('definizioni.tsv', sep='\t', engine='python')
corpus.head(5)

Unnamed: 0,door,ladybug,pain,blurriness
0,"A construction used to divide two rooms, tempo...","small flying insect, typically red with black ...",A feeling of physical or mental distress,sight out of focus
1,"It's an opening, it can be opened or closed.","It is an insect, it has wings, red with black ...","It is a feeling, physical or emotional. It is ...","It is the absence of definite borders, shapele..."
2,"An object that divide two room, closing an hol...",An insect that can fly. It has red or orange c...,A felling that couscious beings can experince ...,A sensation felt when you can't see clearly th...
3,Usable for access from one area to another,Small insect with a red back,Concept that describes a suffering living being,Lack of sharpness
4,Structure that delimits an area and allows acc...,Small round flying insect,Feeling of physical discomfort,Characteristic of lack of clarity or precision


In [14]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clear_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)
    tokens = [token for token in tokens if token not in string.punctuation] #tolgo la punteggiatura
    tokens = [token.lower() for token in tokens] # sostituisco le maiuscole con le minuscole
    tokens = [token for token in tokens if token not in stop_words] # rimuovo le stop words
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatizzo
    
    return tokens
    
def get_hypoyms(token):
    """
    Dato un token ritorna una lista di iponimi
    """
    synonyms = wordnet.synsets(token)

    hyponyms = []
    for syn in synonyms:
        hyponyms += syn.hyponyms()

    hyponyms = [syn.lemmas()[0].name() for syn in hyponyms]

    return hyponyms

def get_hypernyms(token):
    """
    Dato un token ritorna una lista di iperonimi
    """
    synonyms = wordnet.synsets(token)

    hypernyms = []
    for syn in synonyms:
        hypernyms += syn.hypernyms()

    hypernyms = [syn.lemmas()[0].name() for syn in hypernyms]

    return hypernyms


In [15]:
corpus['door'] = corpus['door'].apply(clear_sentence)
corpus['ladybug'] = corpus['ladybug'].apply(clear_sentence)
corpus['pain'] = corpus['pain'].apply(clear_sentence)
corpus['blurriness'] = corpus['blurriness'].apply(clear_sentence)

In [78]:
def get_geneus(definitions):
    words = []
    for definition in definitions:
        words += definition
    return nltk.FreqDist(words).most_common(1)[0][0]

def get_context(definitions):
    context =  definitions.apply(lambda x: ' '.join(x))
    context = ' '.join(context)
    context = set(nltk.word_tokenize(context))
    return context

def search_synset(level, best_sense, current_syn, max_overlap, context):
    if level == 0:
        return [best_sense, max_overlap]
    else:
        for syn in current_syn.hyponyms():
            print(syn)
            signature = set(clear_sentence(syn.definition())).union(clear_sentence(' ' .join(syn.examples())))
            overlap = len(context.intersection(signature))
            if overlap >= max_overlap:
                max_overlap = overlap
                best_sense = syn
                #print(max_overlap, best_sense)
        return search_synset(level - 1, best_sense, current_syn, max_overlap, context)

def get_token(definitions, geneus):
    context = get_context(definitions)

    best_sense = None
    max_overlap = 0
    for syn_geneus in wordnet.synsets(geneus):    
        best_sense, max_overlap = search_synset(3, best_sense, syn_geneus, max_overlap, context)

    return best_sense


geneus = get_geneus(corpus['ladybug'])
print(geneus)
print(get_token(corpus['ladybug'], geneus))
        

insect
Synset('beetle.n.01')
Synset('bird_louse.n.01')
Synset('bug.n.01')
Synset('collembolan.n.01')
Synset('defoliator.n.01')
Synset('dictyopterous_insect.n.01')
Synset('dipterous_insect.n.01')
Synset('earwig.n.01')
Synset('ephemerid.n.01')
Synset('ephemeron.n.01')
Synset('flea.n.01')
Synset('gallfly.n.03')
Synset('hemipterous_insect.n.01')
Synset('heteropterous_insect.n.01')
Synset('holometabola.n.01')
Synset('homopterous_insect.n.01')
Synset('hymenopterous_insect.n.01')
Synset('imago.n.02')
Synset('leaf_miner.n.01')
Synset('lepidopterous_insect.n.01')
Synset('louse.n.01')
Synset('mecopteran.n.01')
Synset('neuropteron.n.01')
Synset('odonate.n.01')
Synset('orthopterous_insect.n.01')
Synset('phasmid.n.01')
Synset('pollinator.n.01')
Synset('proturan.n.01')
Synset('psocopterous_insect.n.01')
Synset('pupa.n.01')
Synset('queen.n.01')
Synset('social_insect.n.01')
Synset('stonefly.n.01')
Synset('termite.n.01')
Synset('thysanopter.n.01')
Synset('thysanuran_insect.n.01')
Synset('trichopterous_