In [122]:
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def clear_sentence(sentence, lemmatizer = None, stemmer = None):
    """
    Tokenize a sentence and normalize its tokens.

    The sentence is split with ``nltk.word_tokenize``; then, for each token:

    1. tokens that contain no letter or digit are dropped (punctuation,
       including multi-character tokens such as ``...`` and typographic
       marks such as ``’``);
    2. the token is lowercased;
    3. English stop words (articles, conjunctions, ...) are dropped;
    4. the token is stemmed or lemmatized.

    Parameters
    ----------
    sentence : str
        Text to process.
    lemmatizer : optional
        Object exposing ``lemmatize(token)``. Used only when ``stemmer`` is
        None. When both arguments are None a ``WordNetLemmatizer`` is created.
    stemmer : optional
        Object exposing ``stem(token)``. Takes precedence over ``lemmatizer``
        when both are given.

    Returns
    -------
    list of str
        The normalized tokens, in sentence order.
    """
    # Imported here, as in the original code, so the stop-word corpus is only
    # touched when the function is actually called.
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

    if lemmatizer is None and stemmer is None:
        lemmatizer = WordNetLemmatizer()

    # The stemmer wins when both are supplied.
    normalize = stemmer.stem if stemmer is not None else lemmatizer.lemmatize

    tokens = []
    for token in nltk.word_tokenize(sentence):
        # `token in string.punctuation` would be a substring test against a
        # string of single ASCII characters, so it lets '...', "''" or '’'
        # through. Requiring at least one alphanumeric character removes
        # every punctuation-only token while keeping 'e.g' or 'well-made'.
        if not any(char.isalnum() for char in token):
            continue

        token = token.lower()
        if token in stop_words:
            continue

        tokens.append(normalize(token))

    return tokens
    
def get_tokens_frequency(sentences, skip_first = True):
    """
    Count how often each normalized token occurs in a collection of sentences.

    Every string entry is passed through ``clear_sentence`` (lemmatization)
    and the occurrences of each resulting token are summed; a token that
    appears twice in one sentence counts twice.

    Parameters
    ----------
    sentences : iterable
        Sentences to analyse, e.g. a list or a pandas Series. Entries that
        are not strings (such as NaN for missing values) are ignored.
    skip_first : bool, default True
        When True the first entry is not analysed, which is what this
        function has always done. Pass False to analyse every entry.
        NOTE(review): presumably the first entry is a row label rather than
        a definition -- confirm against the CSV layout.

    Returns
    -------
    dict
        Maps token -> occurrences / total number of entries, ordered from
        the most to the least frequent token (ties keep first-seen order).
        NOTE(review): the denominator is the total number of entries,
        including the skipped first one and any non-string entries; it is
        left unchanged so existing results stay the same.
    """
    lemmatizer = WordNetLemmatizer()

    # Materialize the values once and work by position. `sentences[i]` is
    # label-based on a pandas Series, so it breaks when the index is not
    # 0..n-1 (string labels, or integer labels with gaps after dropna()).
    entries = list(sentences)
    sentences_number = len(entries)

    tokens_occurrences = Counter()
    for sentence in (entries[1:] if skip_first else entries):
        if not isinstance(sentence, str):
            continue
        tokens_occurrences.update(clear_sentence(sentence, lemmatizer, None))

    # Sort by descending count; sorted() is stable, so ties stay in
    # first-seen order.
    ranked = sorted(tokens_occurrences.items(), key=lambda item: item[1], reverse=True)

    return {token: count / sentences_number for token, count in ranked}

def get_hypoyms(token):
    """
    Return the hyponyms of ``token`` as a list of lemma names.

    Each synset WordNet returns for ``token`` is visited in order and its
    hyponym synsets are collected; every hyponym is represented by the name
    of its first lemma. Duplicate names are kept. The result is an empty
    list when no synset, or no hyponym, is found.
    """
    return [
        hyponym.lemmas()[0].name()
        for synset in wordnet.synsets(token)
        for hyponym in synset.hyponyms()
    ]

def get_hypernyms(token):
    """
    Return the hypernyms of ``token`` as a list of lemma names.

    Each synset WordNet returns for ``token`` is visited in order and its
    hypernym synsets are collected; every hypernym is represented by the
    name of its first lemma. Duplicate names are kept. The result is an
    empty list when no synset, or no hypernym, is found.
    """
    names = []
    for synset in wordnet.synsets(token):
        names.extend(parent.lemmas()[0].name() for parent in synset.hypernyms())
    return names


In [123]:
corpus = pd.read_csv('definizioni.csv', sep=',', engine='python')

# emotion
# NOTE(review): the frequencies printed by this cell are about bricks
# ('brick', 'clay', 'wall', ...), so row 3 may not be the "emotion" row --
# confirm the row index or rename the variables.
emotion_definitions = corpus.iloc[3].dropna()

emotion_tokens_frequency = get_tokens_frequency(emotion_definitions)
print(emotion_tokens_frequency)

# From here on only the tokens are needed; the dictionary order returned by
# get_tokens_frequency is preserved.
emotion_tokens_frequency = list(emotion_tokens_frequency)


{'used': 0.75, 'object': 0.5, 'material': 0.5, 'construction': 0.5, 'build': 0.40625, 'building': 0.3125, 'made': 0.1875, 'clay': 0.1875, 'block': 0.1875, 'something': 0.15625, 'usually': 0.125, 'brick': 0.09375, 'house': 0.09375, 'constructing': 0.0625, 'element': 0.0625, 'like': 0.0625, 'piece': 0.0625, 'shape': 0.0625, 'wall': 0.0625, 'rectangular-shaped': 0.0625, 'e.g': 0.03125, 'aim': 0.03125, '’': 0.03125, 'basic': 0.03125, 'parallelepiped': 0.03125, 'tool': 0.03125, 'resistnat': 0.03125, 'polygonal': 0.03125, 'different': 0.03125, 'size': 0.03125, 'red': 0.03125, 'e.i': 0.03125, 'generally': 0.03125, 'cunstruction': 0.03125, 'e': 0.03125, 'phyisical': 0.03125, 'physical': 0.03125, 'well': 0.03125, 'eg': 0.03125, 'costruction': 0.03125, 'part': 0.03125, 'thing': 0.03125, 'structure': 0.03125, 'rectangular': 0.03125, 'ceramic': 0.03125}


In [129]:
# Look up the hyponyms of the single token 'build' and print them for inspection.
hyponyms1 = get_hypoyms('build')
print(hyponyms1)



['body_type', 'dumpiness', 'lankiness', 'adult_body', 'female_body', 'juvenile_body', 'male_body', 'person', 'cantilever', 'channelize', 'corduroy', 'customize', 'dry-wall', 'frame', 'groin', 'lock', 'raise', 'rebuild', 'revet', 'wattle']


In [125]:
# Number of leading (most frequent) tokens whose hyponyms are collected.
TOP_TOKENS = 4

hyponyms = []
# Slicing instead of indexing 0..3 avoids an IndexError when fewer than
# TOP_TOKENS tokens are available.
for token in emotion_tokens_frequency[:TOP_TOKENS]:
    # TODO: keep only hyponyms whose part of speech matches the token's
    # (e.g. via nltk.pos_tag); currently every sense of the token is used.
    hyponyms += get_hypoyms(token)

hyponyms



['address',
 'avail',
 'cannibalize',
 'enjoy',
 'exert',
 'exploit',
 'exploit',
 'fall_back',
 'give',
 'implement',
 'misapply',
 'overuse',
 'play',
 'play',
 'ply',
 'pull_out_all_the_stops',
 'put',
 'recycle',
 'share',
 'strain',
 'take',
 'waste',
 'work',
 'board',
 'drink',
 'abuse',
 'pervert',
 'spare',
 'take',
 'waste',
 'trespass',
 'follow',
 'catch',
 'charm',
 'commemorative',
 'curio',
 'discard',
 'draw',
 'film',
 'finding',
 'floater',
 'fomite',
 'geological_formation',
 'growth',
 'hail',
 'head',
 'hoodoo',
 'ice',
 'je_ne_sais_quoi',
 'keepsake',
 'land',
 'land',
 'location',
 'makeweight',
 'moon',
 'neighbor',
 'paring',
 'part',
 'property',
 'remains',
 'ribbon',
 'shiner',
 'snake',
 'stuff',
 'triviality',
 'vagabond',
 'wall',
 'web',
 'whole',
 'business',
 'grail',
 'point',
 'thing',
 'direct_object',
 'indirect_object',
 'prepositional_object',
 'retained_object',
 'antipathy',
 'bugbear',
 'center',
 'execration',
 'hallucination',
 'infatuation'