# Automatic summarization

Risorse usate:
- Nasari

In [195]:
import requests
import os
import numpy as np
import pandas as pd
import re
import math
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer #tiene conto delle multiword expressions
from nltk.corpus import wordnet as wn
import json

# BabelNet API key used by every request in this notebook.
# The BABELNET_TOKEN environment variable, when set, takes precedence so a
# personal key does not have to be committed with the notebook; the literal
# below is kept only as a backward-compatible fallback.
BABELNET_TOKEN = os.environ.get('BABELNET_TOKEN', '1e258739-f5e4-4961-8267-a2da4fe94572') #MO
#BABELNET_TOKEN = '01a5d861-2f36-45cb-8974-a2a6526530d2' #LT

## Pre-processing

Metodo utilizzato per eseguire il preprocessing delle frasi, in cui vengono effettuate le seguenti operazioni:
- Rimozione della punteggiatura
- Trasformazione delle lettere in lowercase
- Tokenizzazione della frase tenendo conto delle multiword expression
- Lemmatizzazione di tutte le parole
- Rimozione delle stop words

In [167]:
# English stop words, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Every WordNet lemma name containing an underscore is treated as a multiword
# expression and split into the tuple of its component words.
mwes = [
    tuple(lemma_name.split('_'))
    for lemma_name in wn.all_lemma_names()
    if '_' in lemma_name
]
# Recognised multiword expressions are merged into one space-separated token.
tokenizer = MWETokenizer(mwes, separator=' ')
lemmatizer = WordNetLemmatizer()

def pre_processing(document):
    """Normalise a raw sentence into a list of content-word lemmas.

    Punctuation is replaced by spaces, the text is lower-cased, split on
    whitespace and re-tokenised so that known multiword expressions become a
    single token; every token is then lemmatised and stop words are dropped.

    Args:
        document: The raw text to normalise.

    Returns:
        The surviving lemmas, in their original order.
    """
    cleaned = re.sub(r'[^\w\s]',' ',document).lower()
    tokens = tokenizer.tokenize(cleaned.split())
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return [lemma for lemma in lemmas if lemma not in stop_words]

## Babelnet Id di una frase

Viene utilizzato principalmente per ottenere i Babelnet Id delle parole del titolo, che una volta sottoposti a WSD (in quanto, molto probabilmente, per ogni parola avremo più synset) ci serviranno per ottenere i vettori Nasari

In [188]:
def get_sentence_babelnet_ids(file_name, sentence):
    """Return the BabelNet synset ids of every word of a sentence.

    Results are cached on disk per document. The cache used to be looked up
    as ``data/ids-<file_name>.json`` but written as ``data/ids_<file_name>.json``,
    so it was never hit; both names are now accepted when reading and the
    name that is looked up first is the one that gets written.

    Args:
        file_name: Base name of the document, used to build the cache path.
        sentence: Iterable of (preprocessed) words to look up.

    Returns:
        A dict mapping each word to the decoded JSON response of the
        BabelNet ``getSynsetIds`` endpoint for that word.
    """
    cache_path = 'data/ids-' + file_name + '.json'
    legacy_path = 'data/ids_' + file_name + '.json'

    # Reuse a previous download if there is one (current or legacy name).
    for path in (cache_path, legacy_path):
        if os.path.exists(path):
            with open(path) as json_file:
                return json.load(json_file)

    # Fetch the BabelNet ids of every word of the sentence. Passing the
    # query through ``params`` lets requests URL-encode the lemma, which
    # may contain spaces for multiword expressions.
    ids = {}
    for word in sentence:
        ids[word] = requests.get(
            'https://babelnet.io/v8/getSynsetIds',
            params={'lemma': word, 'searchLang': 'EN', 'key': BABELNET_TOKEN},
        ).json()

    # Persist the answers so later runs do not spend API quota again.
    os.makedirs('data', exist_ok=True)
    with open(cache_path, 'w') as fp:
        json.dump(ids, fp)

    return ids

def get_babelnet_synset_by_id(syn_id):
    """Return name, glosses and examples of a BabelNet synset.

    The local CSV ``data/local_babelnet_syns.csv`` is consulted first; when
    the id is not there the synset is requested from BabelNet and appended
    to the CSV. Glosses and examples are stored in the CSV as ';'-terminated
    strings and returned as lists obtained by splitting on ';'.

    Args:
        syn_id: BabelNet synset id.

    Returns:
        A tuple ``(syn_id, name, glosses, examples)`` where ``glosses`` and
        ``examples`` are lists of strings (``['']`` when nothing is stored).

    Raises:
        KeyError, IndexError: If the BabelNet response lacks the expected
            ``senses`` / ``glosses`` / ``examples`` fields.
    """
    cache_path = 'data/local_babelnet_syns.csv'
    columns = ['id', 'name', 'glosses', 'examples']

    # A missing cache file is treated as an empty cache instead of an error.
    if os.path.exists(cache_path):
        df = pd.read_csv(cache_path)
    else:
        df = pd.DataFrame(columns=columns)

    cached = df[df['id'] == syn_id]
    if not cached.empty:
        row = cached.iloc[0]
        name = row['name']
        # Empty CSV cells are read back as NaN. Test for that explicitly: a
        # substring test for 'nan' would also discard any real gloss that
        # merely contains those letters (e.g. "financial").
        glosses = [''] if pd.isna(row['glosses']) else str(row['glosses']).split(';')
        examples = [''] if pd.isna(row['examples']) else str(row['examples']).split(';')
        return syn_id, name, glosses, examples

    response = requests.get(
        'https://babelnet.io/v8/getSynset',
        params={'id': syn_id, 'key': BABELNET_TOKEN},
    ).json()

    name = response['senses'][0]['properties']['fullLemma']
    glosses = ''.join(str(gloss['gloss']) + ';' for gloss in response['glosses'])
    examples = ''.join(str(example['example']) + ';' for example in response['examples'])

    # Add the row to the local cache and save it. DataFrame.append no longer
    # exists in pandas 2.x, so the new row is concatenated instead.
    new_row = pd.DataFrame(
        [{'id': syn_id, 'name': name, 'glosses': glosses, 'examples': examples}],
        columns=columns,
    )
    df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)
    os.makedirs('data', exist_ok=True)
    df.to_csv(cache_path, index=False)

    return syn_id, name, glosses.split(';'), examples.split(';')

## Nasari

Estrazione dei vettori Nasari dal file locale

In [169]:
def get_nasari_vectors():
    """Load the local NASARI file as a DataFrame indexed by its first column.

    The file ``data/dd-nasari.txt`` is parsed as ';'-separated text without a
    header row; lines that cannot be parsed are skipped.

    Returns:
        A DataFrame whose index is the first field of each line and whose
        columns are the remaining fields.
    """
    return pd.read_csv(
        'data/dd-nasari.txt',
        sep=';',
        header=None,
        on_bad_lines='skip',
    ).set_index(0)

## Algoritmo Simplified Lesk

Mi serve per fare il WSD dei synset ottenuti dai token del titolo

In [198]:
# Simplified Lesk: given all the synsets of a word and its context, the Lesk
# procedure below returns the synset whose signature is most similar to the
# context.
def get_signature(bn_syn):
    """Build the bag-of-words signature of a BabelNet synset.

    All glosses and examples of the synset are concatenated, separated by
    spaces, and run through ``pre_processing``.

    Args:
        bn_syn: BabelNet synset id.

    Returns:
        The set of preprocessed words found in the glosses and examples.
    """
    _, _, glosses, examples = get_babelnet_synset_by_id(bn_syn)
    signature = ''.join(fragment + ' ' for fragment in [*glosses, *examples])
    return set(pre_processing(signature))

def simplified_lesk(bn_syns, context):
    """Pick the synset whose signature overlaps most with the context.

    The context is expected to be the word set of the whole document, not
    only of the title.

    Args:
        bn_syns: Sequence of dicts, each with an ``'id'`` key holding a
            BabelNet synset id (the candidate senses of one word).
        context: Set of preprocessed context words.

    Returns:
        The id of the candidate with the largest overlap; the first
        candidate when no signature shares a word with the context; ``None``
        when there are no candidates at all.
    """
    # A word unknown to BabelNet has no candidates: report "no sense"
    # instead of failing on bn_syns[0].
    if not bn_syns:
        return None

    best_sense = bn_syns[0]['id']
    max_overlap = 0

    for bn_syn in bn_syns:
        signature = get_signature(bn_syn['id'])
        overlap = len(context.intersection(signature))
        # Strict comparison: on ties the earliest candidate wins.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = bn_syn['id']

    return best_sense

def get_best_senses(ids, context):
    """Disambiguate every word with simplified Lesk.

    Args:
        ids: Mapping from word to its candidate BabelNet synsets.
        context: Set of preprocessed context words.

    Returns:
        A dict mapping each word to the synset id chosen by
        ``simplified_lesk``.
    """
    return {word: simplified_lesk(ids[word], context) for word in ids}

## Estrazione del documento da riassumere

In [243]:
def open_text(file_name):
    """Read a document and prepare it for summarisation.

    The file ``data/docs/<file_name>.txt`` is read as UTF-8, split into lines
    and stripped of empty lines; the first remaining line is treated as the
    title.

    Args:
        file_name: Base name (without extension) of the document.

    Returns:
        A tuple ``(title_preprocessed, text_preprocessed, text,
        context_for_wsd)``: the preprocessed title, the preprocessed lines
        (title included), the raw non-empty lines, and the set of all
        preprocessed words, used as context for word sense disambiguation.

    Raises:
        IndexError: If the document has no non-empty line.
    """
    # The context manager closes the file handle even if reading fails.
    with open('data/docs/' + file_name + '.txt', 'r', encoding='utf-8') as doc_file:
        text = doc_file.read().split('\n')

    # Pre-processing of every non-empty line.
    text = [line for line in text if line != '']
    text_preprocessed = [pre_processing(line) for line in text]

    # Context used for the WSD of the BabelNet ids: every word of the text.
    context_for_wsd = set()
    for sentence in text_preprocessed:
        context_for_wsd.update(sentence)

    # The title is the first non-empty line.
    title_preprocessed = text_preprocessed[0]

    return title_preprocessed, text_preprocessed, text, context_for_wsd

# Associazione Vectors - Synsets

Associazione dei vettori Nasari ai synset disambiguati delle parole del titolo

In [210]:
def get_nasari_vectors_by_senses(senses):
    """Look up the NASARI vector of every disambiguated word.

    Args:
        senses: Mapping from word to its chosen BabelNet synset id.

    Returns:
        A dict mapping each word whose synset id appears in the NASARI index
        to the corresponding row; words without a vector are left out.
    """
    nasari_vectors = get_nasari_vectors()
    # Build the id lookup once instead of converting the whole index to a
    # list again for every word.
    known_ids = set(nasari_vectors.index.values)

    return {
        word: nasari_vectors.loc[sense]
        for word, sense in senses.items()
        if sense in known_ids
    }

## Creazione del contesto per fare text summarization

In [207]:
def get_weighted_context(vectors):
    """Flatten NASARI vectors into a list of (word, normalised weight) pairs.

    Each cell of a vector is expected to look like ``word_weight``. Cells
    without an underscore are ignored. The weights of one vector are divided
    by their sum, and each word is stripped of punctuation, lower-cased and
    lemmatised.

    Args:
        vectors: Mapping from word to its NASARI vector (an iterable of
            cells).

    Returns:
        A list of ``(word, weight)`` tuples, vector after vector, in cell
        order.

    Raises:
        ValueError: If the text after the first underscore of a cell is not
            a number.
    """
    weighted_context = []

    for key in vectors:
        # Parse every cell once, keeping only "word_weight" entries.
        entries = []
        for cell in vectors[key]:
            # Non-string cells (e.g. NaN padding of short rows) carry no
            # word and cannot be split.
            if not isinstance(cell, str):
                continue
            parts = cell.split('_')
            if len(parts) > 1:
                entries.append((parts[0], float(parts[1])))

        sum_weights = sum(weight for _, weight in entries)
        # Nothing to normalise by: skip the vector rather than divide by 0.
        if sum_weights == 0:
            continue

        for raw_word, weight in entries:
            # Same normalisation as the text: punctuation, case, lemma.
            word_to_added = re.sub(r'[^\w\s]',' ',raw_word)
            word_to_added = word_to_added.lower()
            word_to_added = lemmatizer.lemmatize(word_to_added)

            weighted_context.append((word_to_added, weight / sum_weights))

    return weighted_context

## Pesatura dei paragrafi

Eseguita usando la metrica Weighted Overlap

In [217]:
def weighted_overlap(sentence, weighted_context):
    """Score a sentence against the weighted context.

    For every word of the sentence that occurs in the context, the
    reciprocal of the weight of its first occurrence is added to the
    numerator. The denominator is the sum of ``1 / (2 * i)`` for ``i`` from 1
    to the number of context entries.

    Args:
        sentence: Iterable of preprocessed words.
        weighted_context: Sequence of ``(word, weight)`` entries.

    Returns:
        The score as a float; ``0.0`` when the context is empty.
    """
    # With no context there is nothing to overlap with (and the
    # denominator would be zero).
    if not weighted_context:
        return 0.0

    # Index the context once, keeping the weight of the first occurrence of
    # each word, instead of rescanning the list for every sentence word.
    first_weight = {}
    for entry in weighted_context:
        first_weight.setdefault(entry[0], entry[1])

    # NOTE(review): the reciprocal is taken of the word's weight, not of its
    # rank as in the usual Weighted Overlap definition - confirm intended.
    numeratore = 0
    for word in sentence:
        weight = first_weight.get(word)
        # Words absent from the context, or with a zero weight (whose
        # reciprocal is undefined), contribute nothing.
        if weight:
            numeratore += 1 / weight

    denominatore = sum(1 / (2 * i) for i in range(1, len(weighted_context) + 1))

    return numeratore / denominatore

## Estrazione automatica del riassunto

In [264]:
def make_summarization(text, text_preprocessed, weighted_context, perc=0.8):
    """Build and print an extractive summary of a document.

    The first entry of ``text`` is the title and is always kept. Every other
    line is scored with ``weighted_overlap``; the best-scoring fraction
    ``perc`` of them is kept and emitted in document order.

    Args:
        text: Raw lines of the document, title first.
        text_preprocessed: Preprocessed lines, aligned with ``text``.
        weighted_context: ``(word, weight)`` entries used for scoring.
        perc: Fraction of the body lines to keep.

    Returns:
        The summary: title and kept lines joined by blank lines.
    """
    body_tokens = text_preprocessed[1:]
    title, body = text[0], text[1:]

    # One (position, raw line, score) record per body line.
    scored = [
        (position, body[position], weighted_overlap(tokens, weighted_context))
        for position, tokens in enumerate(body_tokens)
    ]

    # Keep the top `perc` share by score, then restore document order.
    by_score = sorted(scored, key=lambda record: record[2], reverse=True)
    kept = by_score[:round(len(by_score) * perc)]
    in_document_order = sorted(kept, key=lambda record: record[0])

    chosen_lines = [record[1] for record in in_document_order]
    summary = title + '\n\n' + '\n\n'.join(chosen_lines)

    print(summary)
    return summary

## Main

In [245]:
# Base name of the document to summarise; open_text reads
# data/docs/<file_name>.txt.
file_name = 'Andy-Warhol'

In [254]:
# Load the document: preprocessed title, preprocessed lines, raw lines and
# the set of all words used as context for word sense disambiguation.
title_preprocessed, text_preprocessed, text, context_for_wsd = open_text(file_name)
# BabelNet synset ids of every title word (from the local cache when present).
bn_ids = get_sentence_babelnet_ids(file_name, title_preprocessed)
# Choose one synset per title word with simplified Lesk over that context.
title_sense = get_best_senses(bn_ids, context_for_wsd)

In [255]:
# NASARI vectors of the disambiguated title senses; words whose sense is not
# in the NASARI file are left out.
title_nasari_vectors = get_nasari_vectors_by_senses(title_sense)

In [256]:
# Flatten the title vectors into (word, normalised weight) pairs.
weighted_context = get_weighted_context(title_nasari_vectors)

In [265]:
# Score the body lines against the context, keep the default 80% and print
# the resulting summary.
summary = make_summarization(text, text_preprocessed, weighted_context)

Andy Warhol: Why the great Pop artist thought ‘Trump is sort of cheap’

He anticipated celebrity culture and social media, thought artists should do more than just hold a paintbrush, and wound up John Lennon. As a new Tate exhibition opens, Alastair Smart shows how far the most important artist of the modern age was ahead of his time.

uring last year’s Super Bowl, 100 million US viewers were treated to a most unexpected sight in one of the commercial breaks. It was Andy Warhol doing nothing more than taking bites out of a Burger King Whopper – and adding the occasional bit of ketchup – for 45 seconds.

There was no music, no punchline, just a little, light rustling of the burger’s wrapper – in a slowly unfolding scene that culminated with the hashtag #EatLikeAndy. It was about as far removed as one could imagine from the big-budget ads traditionally shown during the Super Bowl.

Reaction on social media was swift, widespread and mostly containing the word “bizarre”. Why show a low-act