In [1]:
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import csv

In [2]:
def read_rows_csv(csv_file):
    for rows in csv_file:
        return rows

In [3]:
def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]', '', sentence)

def remove_stopwords(words_list):
    stopwords = open("stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords:
        stopwords_list.append(word.replace('\n', ''))
    stopwords.close()
    return [value.lower() for value in words_list if value.lower() not in stopwords_list]
    
#uso il tokenize_sentence perche poi mi servira' in futuro per la gestione delle frasi, con la relativa ricerca della parola piu frequente in una frase
def tokenize_sentences(sentence):
    word_list= []
    lemma = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        if(tag[1][:2] == 'NN'):
            word_list.append(lemma.lemmatize(tag[0], pos = wn.NOUN))
        elif(tag[1][:2] == 'VB'):
            word_list.append(lemma.lemmatize(tag[0], pos = wn.VERB))
        elif(tag[1][:2] == 'RB'):
            word_list.append(lemma.lemmatize(tag[0], pos = wn.ADV))
        elif(tag[1][:2] == 'JJ'):
            word_list.append(lemma.lemmatize(tag[0], pos = wn.ADJ))
    
    return word_list

def pre_processing (sentence):
    return remove_stopwords(tokenize_sentences(remove_punctuation(sentence)))

In [4]:
def definitions(file):

    readCSV = csv.reader(file, delimiter=',')
    
    # get list of words to analize
    words = read_rows_csv(readCSV)[1:]

    definitions_words = dict()

    for row in readCSV:
        for index, definition in enumerate(row):
            # controllo se e' presente o no una definizione
            if definition:  
                if index > 0:
                    word = words[index - 1]
                    if word not in definitions_words.keys():
                        definitions_words[word] = [pre_processing(definition)]
                    else:
                        definitions_words[word].append(pre_processing(definition))
    return definitions_words

In [5]:
#la formula per calcolare la distanza del coseno e': Cos(x, y) = x . y / ||x|| * ||y||
def cosine_similarity(definition1, defintion2):
    vector1=[]
    vector2=[]
    
    total_def = (set(definition1) | set(defintion2)) #creo un insieme con sia definition 1 che definition 2

#creo il vettore
    for w in total_def:
        if w in definition1:
            vector1.append(1)
        else:
            vector1.append(0)
        
        if w in defintion2:
            vector2.append(1)
        else:
            vector2.append(0)
    
    c = 0

    #cosine formula 
    for i in range(len(total_def)):
        c += vector1[i] * vector2[i]
    
    cosine = c / float((sum(vector1) * sum(vector2)) ** 0.5)

    #print("similarity :", cosine)

    return cosine    

In [6]:
def compute_results(total_def):
    results = dict() #dizionario con tutte le liste di definizioni processate
    for word in total_def.keys(): 
        definitions = total_def[word] 
        
        avg = 0
        count = 0
        for def1 in definitions:
            for def2 in definitions:
                if not def1 == def2:
                    avg += cosine_similarity(def1, def2)
                    count += 1
        
        results[word] = avg / count
        
    return results

In [7]:
def most_frequent_words(definitions):
    words = set([word for definition in definitions for word in definition])

    freq_words = dict()

    for w in words:

        for d in definitions:

            if w in d:

                if w not in freq_words.keys():
                    freq_words[w]=1
                else:
                    count = freq_words[w] + 1
                    freq_words[w]= count

        if w in freq_words:
            freq_words[w] += 1

        else:
            freq_words[w] = 1

    
    most_frequent_words = []
    
    for word in freq_words.keys():
        if freq_words[word] >= (0.5 * len(definitions)):
            most_frequent_words.append(word)

    return most_frequent_words

In [8]:
with open('defs.csv') as csvfile:
    total_def = definitions(csvfile)
    results = compute_results(total_def)
    print (' Similiarita del coseno : ' , results)
    print('\n Most Frequent words : ')
    print ([(key, most_frequent_words(total_def[key])) for key in ['Courage', 'Paper', 'Apprehension', 'Sharpener']])

 Similiarita del coseno :  {'Courage': 0.21054727554969985, 'Paper': 0.29258850377799267, 'Apprehension': 0.0830330313557733, 'Sharpener': 0.3863878711824424}

 Most Frequent words : 
[('Courage', ['ability', 'fear']), ('Paper', ['write', 'material']), ('Apprehension', []), ('Sharpener', ['tool', 'sharpen', 'pencil'])]
