# Automatic summarization

Risorse usate:
- Nasari

In [129]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer #tiene conto delle multiword expressions
from nltk.corpus import wordnet as wn
import json

In [130]:
stop_words = set(stopwords.words('english')) #remove stop words
mwes = [x for x in wn.all_lemma_names() if '_' in x]
mwes = [tuple(x.split('_')) for x in mwes]
tokenizer = MWETokenizer(mwes, separator=' ')
lemmatizer = WordNetLemmatizer()

def pre_processing(document):
    document = re.sub(r'[^\w\s]',' ',document) #remove punctuation
    document = document.lower()
    document = tokenizer.tokenize(document.split())
    document = [lemmatizer.lemmatize(token) for token in document]  
    document = [w for w in document if not w in stop_words]
    return document

## Individuazione dell'arogmento del testo

### Nasari vectors

In [142]:
nasari_vectors = pd.read_csv('data/dd-nasari.txt', on_bad_lines='skip', header=None, sep=';')
nasari_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,bn:00000002n,The Hague,hague_1372.4,rotterdam_427.35,amsterdam_415.35,city_218.8,netherlands_198.2,dutch_197.61,utrecht_186.22,the hague_160.32,page_137.18
1,bn:00000003n,.22 Long Rifle,rifle_2305.63,cartridge_2279.59,bullet_1365.01,barrel_957.51,firearm_910.38,shotgun_702.1,rimfire_563.73,caliber_535.97,ammunition_474.7
2,bn:00000005n,Tuple,number_754.0,integer_528.26,set_449.2,tuple_373.74,element_323.63,tuples_316.55,define_272.01,function_271.05,permutation_269.38
3,bn:00000006n,Dodecanol,alcohol_310.46,ethanol_74.72,dodecanol_45.46,ch_44.51,fatty_35.93,oh_33.61,carbon_33.53,aldehyde_32.11,methanol_31.53
4,bn:00000013n,Million,million_209.35,number_146.31,mathematics_61.3,long scale_53.31,real number_50.43,numeral_50.35,short scale_50.12,digit_42.17,bally_41.77


In [147]:
nasari_vectors = nasari_vectors.set_index(0)

In [287]:
text = open('data/docs/Andy-Warhol.txt', 'r', encoding='utf-8').read().split('\n')
text = [line for line in text if line != '']

text_preprocessed = [pre_processing(line) for line in text[0:]]
print(text_preprocessed)

# contesto 
contesto_testo = []
for frase in text_preprocessed:
    contesto_testo = contesto_testo + frase
contesto_testo = set(contesto_testo)

# titolo
title = text_preprocessed[0]
title

[['andy warhol', 'great', 'pop', 'artist', 'thought', 'trump', 'sort of', 'cheap'], ['anticipated', 'celebrity', 'culture', 'social', 'medium', 'thought', 'artist', 'more than', 'hold', 'paintbrush', 'wound up', 'john lennon', 'new', 'tate', 'exhibition', 'open', 'alastair', 'smart', 'show', 'far', 'important', 'artist', 'modern', 'age', 'wa', 'ahead', 'time'], ['uring', 'last', 'year', 'super', 'bowl', '100', 'million', 'u', 'viewer', 'treated', 'unexpected', 'sight', 'one', 'commercial', 'break', 'wa', 'andy warhol', 'nothing', 'more than', 'taking', 'bite', 'out of', 'burger', 'king', 'whopper', 'adding', 'occasional', 'bit', 'ketchup', '45', 'second'], ['wa', 'music', 'punchline', 'a little', 'light', 'rustling', 'burger', 'wrapper', 'slowly', 'unfolding', 'scene', 'culminated', 'hashtag', 'eatlikeandy', 'wa', 'far', 'removed', 'one', 'could', 'imagine', 'big', 'budget', 'ad', 'traditionally', 'shown', 'super', 'bowl'], ['reaction', 'social', 'medium', 'wa', 'swift', 'widespread', 

['andy warhol',
 'great',
 'pop',
 'artist',
 'thought',
 'trump',
 'sort of',
 'cheap']

### Babelnet id delle parole del titolo

In [133]:
import requests

babelnet_token = '1e258739-f5e4-4961-8267-a2da4fe94572'

ids = {}
for word in title:
    #response = requests.get(f'https://babelnet.io/v8/getSynsetIds?lemma={word}&searchLang=EN&key={babelnet_token}')
    #ids[word] = response.json()

In [134]:
#salva ids in un file
with open('data/ids.json', 'w') as fp:
    json.dump(ids, fp)

In [135]:
#carica ids da file
with open('data/ids.json', 'r') as fp:
    ids = json.load(fp)

### WSD dei babelnet id

In [137]:
def getSignature(id):
    response = requests.get(f'https://babelnet.io/v8/getSynset?id={id}&key={babelnet_token}')
    glosses = response.json()['glosses']
    examples = response.json()['examples']
    signature = ""
    for gloss in glosses:
        signature += gloss['gloss'] + ' '
    for example in examples:
        signature += example['example'] + ' '
    return set(pre_processing(signature))

# Usa come contesto l'intero testo del file, non solo il titolo
def SimplifiedLesk(bn_ids, context):
    best_sense = bn_ids[0]['id']
    max_overlap = 0
    
    for sense in bn_ids:
        signature = getSignature(sense['id'])
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense['id']
    
    return best_sense

In [138]:
sensi = {}
for word in ids:
    sensi[word] = SimplifiedLesk(ids[word], contesto_testo)

### Creazione del contesto

In [220]:
title_vectors = {}

#fai un ciclo su ids
for word in sensi:
    # check if ids[word] is in nasari_vectors
    if sensi[word] in list(nasari_vectors.index.values):
        print(f'{sensi[word]} in nasari_vectors') 
        title_vectors[word] = nasari_vectors.loc[sensi[word]]
    else:
        print(f'{sensi[word]} not in nasari_vectors')

bn:00004020n in nasari_vectors
bn:00103780a not in nasari_vectors
bn:00063586n in nasari_vectors
bn:00060201n in nasari_vectors
bn:00017339n in nasari_vectors
bn:03266978n in nasari_vectors
bn:00116064r not in nasari_vectors
bn:02213915n in nasari_vectors


In [258]:
key_words = []
for vector in title_vectors:
    i = 1
    for word in title_vectors[vector]:
        word_to_added = word.split('_')[0]
        word_to_added = re.sub(r'[^\w\s]',' ',word_to_added) #remove punctuation
        word_to_added = word_to_added.lower()
        word_to_added = lemmatizer.lemmatize(word_to_added)

        key_words.append((word_to_added, i))
        i += 1

key_words = set(key_words)

## Pesatura dei paragrafi

Eseguita usanto la metrica Weighted Oveerlap

In [272]:
def weighted_overlap(sentence):
    numeratore = 0
    for word in sentence:
        #check if word is in first column of key_words
        if word in [x[0] for x in key_words]:
            #print(f'{word} in key_words')
            #get index of word in key_words
            index = [x[1] for x in key_words if x[0] == word][0]
            #print(f'index: {index}')

            numeratore += 1/(index)
            #print('\n')
       # else:
            #print(f'{word} not in key_words')
            #print('\n')

    i = 1
    denominatore = 0
    for word in key_words:
        denominatore += 1/(2*i)
        i += 1

    return numeratore/denominatore

print(weighted_overlap(['Cheap Records', 'vienna', 'Matri']))

0.22226147170498


In [300]:
testo = text_preprocessed[1:]

weights = {}
i = 0
for line in testo:
    # add weighted_overlap(line) to weights
    weights[i] = weighted_overlap(line)
    i += 1

#order weights by value
weights = {k: v for k, v in sorted(weights.items(), key=lambda item: item[1], reverse=True)}

#prendi l'80% del testo
numero_righe = round(len(testo)*0.8)

testo_finale = text.copy()
title_finale = testo_finale[0]
#remove first element in testo_finale
testo_finale.pop(0)

#get only first 10 element of weights
weights = {k: weights[k] for k in list(weights)[:numero_righe]}
#order weights by key
weights = {k: v for k, v in sorted(weights.items(), key=lambda item: item[0])}

#remove from testo_finale all element that are not in weights
summary = [testo_finale[i] for i in weights.keys()]

#join with \n
summary = '\n\n'.join(summary)
summary = title_finale + '\n\n' + summary

print(summary)

Andy Warhol: Why the great Pop artist thought ‘Trump is sort of cheap’

He anticipated celebrity culture and social media, thought artists should do more than just hold a paintbrush, and wound up John Lennon. As a new Tate exhibition opens, Alastair Smart shows how far the most important artist of the modern age was ahead of his time.

uring last year’s Super Bowl, 100 million US viewers were treated to a most unexpected sight in one of the commercial breaks. It was Andy Warhol doing nothing more than taking bites out of a Burger King Whopper – and adding the occasional bit of ketchup – for 45 seconds.

There was no music, no punchline, just a little, light rustling of the burger’s wrapper – in a slowly unfolding scene that culminated with the hashtag #EatLikeAndy. It was about as far removed as one could imagine from the big-budget ads traditionally shown during the Super Bowl.

Reaction on social media was swift, widespread and mostly containing the word “bizarre”. Why show a low-act