In [17]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

corpus = pd.read_csv('definizioni.csv', sep=',', engine='python')

Per ogni concetto, prendere tutte le definizioni, e calcolare la sovrapposizione lessicale tra tutte le definizioni (cioè le parole in comune). E' possibile che per un dato concetto l'intersezione delle parole sia vuota. Ad esempio, se una definizione contiene una parola che non viene usata dalle altre non verrà conteggiata. Un modo è contare le parole più ricorrenti (contando la loro frequenza) e poi vedere quanto quelle parole ci sono nelle definizioni (es. 12/30 usano una certa parola) e da qui calcoliamo uno score delle parole

In [18]:
def clear_sentence(sentence, lemmatizer = None, stemmer = None):
    """
    Data una frase la pulisce dai caratteri speciali e lettere maiscole
    e ed esegue infine la lemmatizzazione o stemmatizzazione, in base ai
    parametri specificati. Oltre a ciò viene fatto lo stopword removal, cioè
    vengono tolte parole congiunzioni, articoli, ...
    Ritorna un array di parole.
    """
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

    if lemmatizer is None and stemmer is None:
        lemmatizer = WordNetLemmatizer()

    tokens = nltk.word_tokenize(sentence)
    tokens = [token for token in tokens if token not in string.punctuation] #tolgo la punteggiatura
    tokens = [token.lower() for token in tokens] # sostituisco le maiuscole con le minuscole
    tokens = [token for token in tokens if token not in stop_words] # rimuovo le stop words
    if stemmer is not None:
        tokens = [stemmer.stem(token) for token in tokens] # stemmizzo
    else:
        tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatizzo
    
    return tokens
    

In [19]:
def get_tokens_frequency(sentences):
    """
    Data una lista di frasi conta le occorrenze di ogni parola e ritorna un dizionario
    con la loro frequenza ordinato dalla parola più frequente a quella meno.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer() # non usato

    sentences_number = len(sentences)
    tokens_occurrences = {}
    for i in range (1, len(sentences)):
        sentence = sentences[i]

        if not isinstance(sentence, float):
            #tokens = clear_sentence(sentence, lemmatizer, None)
            tokens = clear_sentence(sentence, None, stemmer)

            for token in tokens:
                if token in tokens_occurrences:
                    tokens_occurrences[token] += 1
                else:
                    tokens_occurrences[token] = 1
    
    tokens_frequency = {}

    for token in tokens_occurrences:
        tokens_frequency[token] = tokens_occurrences[token] / sentences_number

    # riordino le frequenze in ordine decrescente
    tokens_frequency = {k: v for k, v in sorted(tokens_frequency.items(), key=lambda item: item[1], reverse=True)}

    return tokens_frequency
    

In [20]:
#emotion definition
emotion_definitions = corpus.iloc[0]
emotion_definitions = emotion_definitions[1:] #remove the first column because it is the name of the emotion
tokens_frequency_for_emotion_definition = get_tokens_frequency(emotion_definitions)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_emotion_definition:
    #per ogni emtion defeinition
    for sentence in emotion_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(emotion_definitions)) * tokens_frequency_for_emotion_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_emotion_definition)
print(average_frequency)

#person definition
person_definitions = corpus.iloc[1]
person_definitions = person_definitions[1:] #remove the first column because it is the name of the person
tokens_frequency_for_person_definition = get_tokens_frequency(person_definitions)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_person_definition:
    #per ogni emtion defeinition
    for sentence in person_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(person_definitions)) * tokens_frequency_for_person_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_person_definition)
print(average_frequency)

#revenge definition
revenge_definitions = corpus.iloc[2]
revenge_definitions = revenge_definitions[1:] #remove the first column because it is the name of the revenge
tokens_frequency_for_revenge_definition = get_tokens_frequency(revenge_definitions)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_revenge_definition:
    #per ogni emtion defeinition
    for sentence in revenge_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(revenge_definitions)) * tokens_frequency_for_revenge_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_revenge_definition)
print(average_frequency)

#brick definition
brick_definitions = corpus.iloc[3]
brick_definitions = brick_definitions[1:] #remove the first column because it is the name of the brick
tokens_frequency_for_brick_definition = get_tokens_frequency(brick_definitions)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_brick_definition:
    #per ogni emtion defeinition
    for sentence in brick_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(brick_definitions)) * tokens_frequency_for_brick_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_brick_definition)
print(average_frequency)



0.09762794384057971
0.09722793311403509
0.179736328125
0.3218845274390244


## DA FARLO PER LE ALTRE PAROLE E POI AGGREGARE

Una volta definita la metrica per la sovrapposizione lessicale, aggreghiamo le dimensioni proposte. Cioè, prendiamo i due concetti concreti e vediamo com'è l'andamento delle parole, e idem per quelli astratti. Faremo la stessa cosa per quelli generici e quelli specifici.

Ciò viene fatto per vedere se ci sono differenze tra le dimensioni. Tipicamente, c'è molta più sovrapposizione su concetti concreti e su concetti specifici

In [21]:
#person and brick definition (aggregazione di concetti concreti)
person_definitions = corpus.iloc[1]
brick_definitions = corpus.iloc[3]
person_definitions = person_definitions[1:] #remove the first column because it is the name of the person
brick_definitions = brick_definitions[1:] #remove the first column because it is the name of the brick
person_and_brick_definitions = pd.concat([person_definitions, brick_definitions], axis=0)
tokens_frequency_for_person_and_brick_definition = get_tokens_frequency(person_and_brick_definitions)
#show in a table the 10 most frequent words
table = pd.DataFrame.from_dict(tokens_frequency_for_person_and_brick_definition, orient='index').head(10)
table.columns = ['Frequency']
print("Person and brick definition")
print(table)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_person_and_brick_definition:
    #per ogni emtion defeinition
    for sentence in person_and_brick_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(person_and_brick_definitions)) * tokens_frequency_for_person_and_brick_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_person_and_brick_definition)
print(average_frequency)

#emotion and revenge definition (aggregazioni di concetti astraggi)
emotion_definitions = corpus.iloc[0]
revenge_definitions = corpus.iloc[2]
emotion_definitions = emotion_definitions[1:] #remove the first column because it is the name of the emotion
revenge_definitions = revenge_definitions[1:] #remove the first column because it is the name of the revenge
emotion_and_revenge_definitions = pd.concat([emotion_definitions, revenge_definitions], axis=0)
tokens_frequency_for_emotion_and_revenge_definition = get_tokens_frequency(emotion_and_revenge_definitions)
#show in a table the 10 most frequent words
table = pd.DataFrame.from_dict(tokens_frequency_for_emotion_and_revenge_definition, orient='index').head(10)
table.columns = ['Frequency']
print("\nEmotion and revenge definition")
print(table)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_emotion_and_revenge_definition:
    #per ogni emtion defeinition
    for sentence in emotion_and_revenge_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(emotion_and_revenge_definitions)) * tokens_frequency_for_emotion_and_revenge_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_emotion_and_revenge_definition)
print(average_frequency)

#person and emotion definition (aggragazione di concetti generici)
person_definitions = corpus.iloc[1]
emotion_definitions = corpus.iloc[0]
person_definitions = person_definitions[1:] #remove the first column because it is the name of the person
emotion_definitions = emotion_definitions[1:] #remove the first column because it is the name of the emotion
person_and_emotion_definitions = pd.concat([person_definitions, emotion_definitions], axis=0)
tokens_frequency_for_person_and_emotion_definition = get_tokens_frequency(person_and_emotion_definitions)
#show in a table the 10 most frequent words
table = pd.DataFrame.from_dict(tokens_frequency_for_person_and_emotion_definition, orient='index').head(10)
table.columns = ['Frequency']
print("\nPerson and emotion definition")
print(table)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_person_and_emotion_definition:
    #per ogni emtion defeinition
    for sentence in person_and_emotion_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(person_and_emotion_definitions)) * tokens_frequency_for_person_and_emotion_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_person_and_emotion_definition)
print(average_frequency)

#brick and revenge definition (aggragazione di concetti specifici)
brick_definitions = corpus.iloc[3]
revenge_definitions = corpus.iloc[2]
brick_definitions = brick_definitions[1:] #remove the first column because it is the name of the brick
revenge_definitions = revenge_definitions[1:] #remove the first column because it is the name of the revenge
brick_and_revenge_definitions = pd.concat([brick_definitions, revenge_definitions], axis=0)
tokens_frequency_for_brick_and_revenge_definition = get_tokens_frequency(brick_and_revenge_definitions)
#show in a table the 10 most frequent words
table = pd.DataFrame.from_dict(tokens_frequency_for_brick_and_revenge_definition, orient='index').head(10)
table.columns = ['Frequency']
print("\nBrick and revenge definition")
print(table)
#calculate an average of the frequency of the words
average_frequency = 0
occurrences = 0
for token in tokens_frequency_for_brick_and_revenge_definition:
    #per ogni emtion defeinition
    for sentence in brick_and_revenge_definitions:
        if not isinstance(sentence, float):
            if token in sentence:
                occurrences += 1
    average_frequency += (occurrences / len(brick_and_revenge_definitions)) * tokens_frequency_for_brick_and_revenge_definition[token]
average_frequency = average_frequency / len(tokens_frequency_for_brick_and_revenge_definition)
print(average_frequency)


Person and brick definition
           Frequency
human       0.437500
use         0.375000
build       0.359375
construct   0.281250
object      0.250000
materi      0.250000
person      0.093750
made        0.093750
clay        0.093750
block       0.093750
0.09606684470663265

Emotion and revenge definition
          Frequency
feel       0.406250
someon     0.234375
someth     0.203125
emot       0.140625
anger      0.125000
human      0.109375
action     0.109375
reaction   0.109375
act        0.093750
hurt       0.093750
0.08717805195630081

Person and emotion definition
         Frequency
human     0.562500
feel      0.328125
live      0.125000
person    0.109375
someth    0.109375
certain   0.078125
anim      0.062500
state     0.062500
mental    0.062500
entiti    0.046875
0.06491142406798246

Brick and revenge definition
           Frequency
use         0.375000
build       0.343750
construct   0.265625
materi      0.234375
object      0.234375
someon      0.218750
someth      