In [19]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.stem import WordNetLemmatizer

# Load the definitions corpus. In the frame displayed below each row is one
# concept and the P*/GPT3 columns hold the definitions written for it.
corpus = pd.read_csv(
    'definizioni.csv',
    sep=',',
    engine='python',
)

# Show the loaded frame as the cell output.
corpus

Unnamed: 0.1,Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,...,P23,P24,P25,P27,P29,P30,P31,P34,P35,GPT3
0,Emotion,Range of concepts human beings feel in certain...,Something you can feel,Something that an animal can feel,something you think that makes you feel good o...,Human sensation arising from the form of feelings,State of mind that a living being can percieve,Feeling that a human or an animal can express ...,A sentiment that a living entity can feel and ...,what you feel in a certain moment,...,feeling of a living being,feeling,A particular state of mind of a person caused ...,A mental reaction,a strong feeling,Feeling/State felt by living beings,The mental state of an agent,Mental mood,a feeling dued to something,Emotion is a feeling that is created by a phys...
1,Person,Human being,Human being,"The generic for person is human, it describe a...",human,Living human being belonging to a group or soc...,Mammal descending from apes,"Living entity, human being, sentient.",A human being.,human being u can see or touch,...,human,human being,An individual of the homo sapiens species,A human being,a human being,"Human being, man or woman",A specific human being.,Human being,a living entity,There is no single answer to this question as ...
2,Revenge,"Feeling, usually negative. Consequence of a wr...",An emotion that describes anger,"Is classified as an emotion, generally not goo...",negative emotion,Reaction of anger or violence on oneself or on...,Act performed by a living being triggered by e...,Act or feeling of anger resulting from another...,Act of doing something to someone because of a...,"action against something or someone, generally...",...,emotion caused by and towards someone,angry reaction towards someone,A person's will to get back at someone for som...,The mental reaction of getting something to ma...,the act of damaging someone in return of an in...,The wish to damage someone who hurt you.,The act of damaging someone as a reaction,Mood,desire to do justice for themselves,Revenge is a type of justice that is often see...
3,Brick,"object made of a material (e.g. clay), usually...","block of some material, used in construction",It’s an object and is the basic element of a c...,piece of a material used to build something,Parallelepiped object used for the constructio...,Construction tool of resistnat material and wi...,"Material used for construction of buildings, c...",An object used to build walls.,Red object used to build constructions,...,material made of clay used to build a structure,piece of material used to build something,An object usually used for constructing buildi...,A rectangular-shaped object made of clay used ...,material used for construction,,Material used for construction.,An object made to build something,object used to build something,A brick is a rectangular block of ceramic mate...


Per ogni concetto, prendere tutte le definizioni e calcolare la sovrapposizione lessicale tra di esse (cioè le parole in comune). È possibile che per un dato concetto l'intersezione delle parole sia vuota: ad esempio, una parola usata da una sola definizione non contribuisce alla sovrapposizione. Un modo per procedere è contare le parole più ricorrenti (tramite la loro frequenza), verificare in quante definizioni compare ciascuna di esse (es. 12 definizioni su 30 usano una certa parola) e da qui calcolare uno score per le parole.

In [20]:
def clear_sentence(sentence, lemmatizer=None):
    """
    Tokenize a sentence and normalise its tokens.

    The sentence is split with ``nltk.word_tokenize``; tokens made up only of
    ASCII punctuation characters are discarded, and every remaining token is
    lowercased and then lemmatized.

    Parameters
    ----------
    sentence : str
        The text to process.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token)``. When omitted a new
        ``WordNetLemmatizer`` is created for this call.

    Returns
    -------
    list of str
        The normalised tokens, in their original order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    punctuation_chars = set(string.punctuation)

    tokens = []
    for token in nltk.word_tokenize(sentence):
        # Test every character rather than `token in string.punctuation`:
        # the latter is a substring check, so multi-character punctuation
        # tokens such as "``", "''", "..." or "--" would be kept as words.
        if all(char in punctuation_chars for char in token):
            continue
        # Lowercase first so the lemmatizer sees the normalised form.
        tokens.append(lemmatizer.lemmatize(token.lower()))

    return tokens
    

In [21]:
def get_tokens_frequency(sentences):
    """
    Compute a frequency score for every token found in a group of sentences.

    Each sentence is tokenized with ``clear_sentence`` and the occurrences of
    every token are summed over all processed sentences. The total for a
    token is then divided by ``len(sentences)``.

    Behaviour worth knowing:

    * The element at position 0 is never processed. The cells in this
      notebook pass a corpus row whose leading element does not appear to be
      a definition (NOTE(review): the displayed frame suggests it is the
      concept label - confirm against the CSV).
    * Entries that are not strings (e.g. the float NaN pandas uses for a
      missing definition) are ignored.
    * The divisor is the full length of ``sentences``, including the skipped
      first element and any ignored entries.
    * A token repeated inside one sentence is counted every time, so a score
      is "occurrences per sentence" and may exceed 1.

    Parameters
    ----------
    sentences : sequence or pandas.Series
        The sentences to analyse; consumed positionally.

    Returns
    -------
    dict
        Token -> score, ordered from the highest score to the lowest.
    """
    lemmatizer = WordNetLemmatizer()

    # Materialise the values positionally. Indexing a Series with `series[i]`
    # is label-based on an integer index and, on other indexes, relies on a
    # positional fallback that recent pandas versions deprecate.
    all_sentences = list(sentences)
    sentences_number = len(all_sentences)

    tokens_occurrences = {}
    for sentence in all_sentences[1:]:
        if not isinstance(sentence, str):
            continue
        for token in clear_sentence(sentence, lemmatizer):
            tokens_occurrences[token] = tokens_occurrences.get(token, 0) + 1

    tokens_frequency = {
        token: occurrences / sentences_number
        for token, occurrences in tokens_occurrences.items()
    }

    # sorted() is stable, so tokens with equal scores keep first-seen order.
    return dict(sorted(tokens_frequency.items(), key=lambda item: item[1], reverse=True))
    

In [22]:
# Take the four concept rows by position (0 = emotion, 1 = person,
# 2 = revenge, 3 = brick) and drop the first column of each.
# NOTE(review): the original comments called the dropped column the concept
# name, but the displayed frame suggests column 0 is a leftover numeric
# index ('Unnamed: 0.1') and the name sits in the next column, which
# get_tokens_frequency then skips - confirm against the CSV.
(
    emotion_definitions,
    person_definitions,
    revenge_definitions,
    brick_definitions,
) = (corpus.iloc[row_position][1:] for row_position in range(4))

# Token frequencies of each concept taken on its own.
tokens_frequency_for_emotion_definition = get_tokens_frequency(emotion_definitions)
tokens_frequency_for_person_definition = get_tokens_frequency(person_definitions)
tokens_frequency_for_revenge_definition = get_tokens_frequency(revenge_definitions)
tokens_frequency_for_brick_definition = get_tokens_frequency(brick_definitions)

# TODO: do the same for the other words and then aggregate.

Una volta definita la metrica per la sovrapposizione lessicale, aggreghiamo le dimensioni proposte. Cioè, prendiamo i due concetti concreti e vediamo com'è l'andamento delle parole, e idem per quelli astratti. Faremo la stessa cosa per quelli generici e quelli specifici.

Lo scopo è verificare se ci sono differenze tra le dimensioni. Tipicamente, c'è molta più sovrapposizione sui concetti concreti e sui concetti specifici.

In [34]:
def _aggregate_token_table(first_definitions, second_definitions, top_n=10):
    """
    Build the token-frequency table for two concept rows taken together.

    The two rows are concatenated in the given order, passed to
    ``get_tokens_frequency`` and the ``top_n`` highest-scoring tokens are
    placed in a one-column ('Frequency') DataFrame indexed by token.

    Returns the tuple ``(combined_rows, frequency_dict, table)``.
    """
    combined = pd.concat([first_definitions, second_definitions], axis=0)
    frequency = get_tokens_frequency(combined)
    # Passing `columns` here (instead of assigning `table.columns` afterwards)
    # also works when no token was found and the dictionary is empty.
    table = pd.DataFrame.from_dict(
        frequency, orient='index', columns=['Frequency']
    ).head(top_n)
    return combined, frequency, table


# Concept rows by position, first column dropped (same slicing as above).
# NOTE(review): get_tokens_frequency skips only position 0 of what it
# receives, so after concatenation the first element of the *second* row is
# processed like a definition and both leading elements count in the
# divisor. If that element is the concept label, it should be removed before
# concatenating - confirm against the CSV layout.
emotion_definitions = corpus.iloc[0][1:]
person_definitions = corpus.iloc[1][1:]
revenge_definitions = corpus.iloc[2][1:]
brick_definitions = corpus.iloc[3][1:]

# person + brick: aggregation of the concrete concepts
(
    person_and_brick_definitions,
    tokens_frequency_for_person_and_brick_definition,
    table,
) = _aggregate_token_table(person_definitions, brick_definitions)
print("Person and brick definition")
print(table)

# emotion + revenge: aggregation of the abstract concepts
(
    emotion_and_revenge_definitions,
    tokens_frequency_for_emotion_and_revenge_definition,
    table,
) = _aggregate_token_table(emotion_definitions, revenge_definitions)
print("\nEmotion and revenge definition")
print(table)

# person + emotion: aggregation of the generic concepts
(
    person_and_emotion_definitions,
    tokens_frequency_for_person_and_emotion_definition,
    table,
) = _aggregate_token_table(person_definitions, emotion_definitions)
print("\nPerson and emotion definition")
print(table)

# brick + revenge: aggregation of the specific concepts
(
    brick_and_revenge_definitions,
    tokens_frequency_for_brick_and_revenge_definition,
    table,
) = _aggregate_token_table(brick_definitions, revenge_definitions)
print("\nBrick and revenge definition")
print(table)

Person and brick definition
              Frequency
a              0.468750
human          0.437500
of             0.375000
used           0.375000
being          0.343750
to             0.296875
object         0.250000
material       0.250000
construction   0.250000
build          0.203125

Emotion and revenge definition
           Frequency
a           0.640625
of          0.421875
feeling     0.296875
someone     0.234375
something   0.203125
that        0.203125
an          0.203125
or          0.203125
to          0.187500
the         0.156250

Person and emotion definition
         Frequency
a         0.578125
human     0.562500
being     0.468750
feeling   0.203125
is        0.187500
that      0.171875
can       0.156250
of        0.156250
the       0.140625
to        0.140625

Brick and revenge definition
              Frequency
of             0.625000
a              0.515625
used           0.375000
to             0.343750
construction   0.250000
material       0.234375
object 