In [1]:
import json
from yaml import safe_load
import pandas as pd
import re
# Extractive
from nltk.tokenize import WordPunctTokenizer, sent_tokenize, PunktSentenceTokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.random import RandomSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.utils import get_stop_words

from sumy.evaluation.rouge import (rouge_1, rouge_2, 
                                   rouge_l_sentence_level,
                                   rouge_l_summary_level, rouge_n)
from sumy.models.dom._sentence import Sentence

import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

import progressbar

In [155]:
print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
print('[INFO] Loading configuration')
with open("./config.yml", 'r') as file:
    config_var = safe_load(file)["main"]

print("[INFO] Loading json data from")
with open(
        str(config_var['dataset_folder'])+"/"+str(config_var['data_to_use']),
        'r'
        ) as file:
    data = pd.DataFrame(json.load(file))
    
print("[INFO] Removing articles without summary or paragraphs")
print("[INFO] Size before cleaning:", len(data))
data = data[(data["Summary"].map(len) >= 1) &
            (data["Paragraphs"].map(len) >= 1)]
print("[INFO] Size after cleaning:", len(data))    
data["Paragraphs_as_string"] = data["Paragraphs"].apply(
    lambda x: "\n\n".join(x))


		 SUMMARIZATION REVIEW		

[INFO] Loading configuration
[INFO] Loading json data from
[INFO] Removing articles without summary or paragraphs
[INFO] Size before cleaning: 10335
[INFO] Size after cleaning: 9480


In [104]:
data = data.reset_index()

In [119]:
punkt_tokenizer = PunktSentenceTokenizer(
    train_text="\n".join([sent for sent in data["Paragraphs_as_string"]]))

In [122]:
for i, el in enumerate(data["Summary"]):
    print(punkt_tokenizer.tokenize(text=el))
    if i == 50:
        break

['Raking in cash for Biden, the former president eases off the gloves: This is your morning tip sheet.']
['In the early fall, Ms. Warren seemed to be the Democratic candidate to beat in Iowa.', 'But many voters have had second thoughts over how her sweeping agenda would sell against President Trump.']
['Senator David Perdue, a Republican, drew a quick rebuke from his Democratic opponent, Jon Ossoff, who said the Facebook ad employed the “least original anti-Semitic trope in history.”']
['But will Republicans’ celebration of their president deliver the agenda-setting boost he needs?']
['“Democrats were organizing with every tool that didn’t require crossing the six-foot social distancing barrier,” the chair of the state Democratic Party said.']
['She isn’t optimistic that Congress will help schools reopen safely in the fall: “There’s going to be a lot of parents in tears.”']
['Bobby White was celebrated as the “Basketball Cop” after millions saw a video of him shooting hoops with local 

Vado a calcolare il numero di frasi che vengono tokenizzate.

# LSA

## Test su 1

In [34]:
parser = PlaintextParser.from_string(data["Paragraphs_as_string"][1], 
                                     tokenizer = Tokenizer('english'))

stemmer = Stemmer('english')
summarizer = LsaSummarizer(stemmer)

summarizer.stop_words = get_stop_words('english')
LSA_summaries = [sentence for sentence in summarizer(parser.document, 2)]

reference_sentences = [
    Sentence(el, tokenizer=Tokenizer("english"))
    for el in sent_tokenize(data["Summary"][1])
]

print("Rouge_1", rouge_1(evaluated_sentences=LSA_summaries,
        reference_sentences=reference_sentences))
print("Rouge_2", rouge_2(evaluated_sentences=LSA_summaries,
        reference_sentences=reference_sentences))

rouge_l_sentence_level(evaluated_sentences=LSA_summaries,
        reference_sentences=reference_sentences)

rouge_l_summary_level(evaluated_sentences=LSA_summaries,
        reference_sentences=reference_sentences)

## Script su tutti

In [3]:
data = data.reset_index()

In [4]:
stemmer = Stemmer('english')
summarizer_LSA = LsaSummarizer(stemmer)
summarizer_Luhn = LuhnSummarizer(stemmer)
summarizer_Sum = SumBasicSummarizer(stemmer)
summarizer_Lex = LexRankSummarizer(stemmer)
# summarizer_KLS = KLSummarizer(stemmer)
summarizer_Random = RandomSummarizer(stemmer)
summarizer_Red = ReductionSummarizer(stemmer)
for summarizer in [
        summarizer_LSA, summarizer_Luhn, summarizer_Sum, 
    summarizer_Lex, summarizer_Red
]:
    summarizer.stop_words = get_stop_words('english')

dict_res = {}
for name, summarizer in zip(
    ["LSA", "Lunh", "SumBasic", "LexRank", "Reduction"], [
        summarizer_LSA, summarizer_Luhn, summarizer_Sum, summarizer_Lex,
        summarizer_Red
    ]):
    print("\n", name, "\n")
    results_rouge_1 = []
    results_rouge_2 = []
    results_rouge_l_1 = []
    results_rouge_l_2 = []
    for i in progressbar.progressbar(range(len(data))):
        (article, summary) = (data["Paragraphs_as_string"][i],
                              data["Summary"][i])
        try:
            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))

            summaries = [
                sentence for sentence in summarizer(parser.document, 2)
            ]

            #     To use sumy's evaluation functions, I need to have the text in
            #     Sentence objects
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in sent_tokenize(summary)
            ]
            results_rouge_1.append(
                rouge_1(evaluated_sentences=summaries,
                        reference_sentences=reference_sentences))

            results_rouge_2.append(
                rouge_2(evaluated_sentences=summaries,
                        reference_sentences=reference_sentences))

            results_rouge_l_1.append(
                rouge_l_sentence_level(
                    evaluated_sentences=summaries,
                    reference_sentences=reference_sentences))

            results_rouge_l_2.append(
                rouge_l_summary_level(evaluated_sentences=summaries,
                                      reference_sentences=reference_sentences))

        except:
            pass
#         Save results and progress to next summarizer
    dict_res[name] = {
        "Rouge_1": results_rouge_1,
        "Rouge_2": results_rouge_2,
        "Rouge_L_sentence_level": results_rouge_l_1,
        "Rouge_L_summary_level": results_rouge_l_2
    }

N/A% (0 of 9480) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--


 LSA 



100% (9480 of 9480) |####################| Elapsed Time: 0:07:50 Time:  0:07:50
  0% (3 of 9480) |                       | Elapsed Time: 0:00:00 ETA:   0:05:28


 Lunh 



100% (9480 of 9480) |####################| Elapsed Time: 0:05:12 Time:  0:05:12
  0% (7 of 9480) |                       | Elapsed Time: 0:00:00 ETA:   0:03:51


 SumBasic 



100% (9480 of 9480) |####################| Elapsed Time: 0:02:20 Time:  0:02:20
  0% (3 of 9480) |                       | Elapsed Time: 0:00:00 ETA:   0:06:03


 LexRank 



100% (9480 of 9480) |####################| Elapsed Time: 0:07:12 Time:  0:07:12
  0% (3 of 9480) |                       | Elapsed Time: 0:00:00 ETA:   0:06:02


 KLS 



100% (9480 of 9480) |####################| Elapsed Time: 0:06:37 Time:  0:06:37


In [51]:
# Create pandas dataframe for mean of results
res_mean = pd.DataFrame(columns = dict_res.keys())
# Dataframe for std of results
res_se = pd.DataFrame(columns = dict_res.keys())
for col in res_mean:
    res_mean[col] = pd.Series(
        {key: np.mean(value)
         for key, value in dict_res[col].items()})
    res_se[col] = pd.Series(
        {key: np.std(value)/np.sqrt(len(value))
         for key, value in dict_res[col].items()})
    

In [52]:
res_mean

Unnamed: 0,LSA,Lunh,SumBasic,LexRank,Reduction,Random
Rouge_1,0.393832,0.418053,0.230668,0.374518,0.448022,0.245043
Rouge_2,0.167964,0.219488,0.076637,0.191279,0.242454,0.070406
Rouge_L_sentence_level,0.138476,0.180001,0.147127,0.185174,0.179566,0.124861
Rouge_L_summary_level,0.014459,0.018343,0.037052,0.024488,0.015887,0.027457


In [54]:
res_se

Unnamed: 0,LSA,Lunh,SumBasic,LexRank,Reduction,Random
Rouge_1,0.002632,0.00301,0.002125,0.003,0.003045,0.002042
Rouge_2,0.003131,0.003601,0.002137,0.003447,0.003677,0.002053
Rouge_L_sentence_level,0.001659,0.002034,0.001488,0.002064,0.001988,0.001387
Rouge_L_summary_level,7.3e-05,0.000104,0.000211,0.000159,9.5e-05,0.000169


In [57]:
print("[INFO] Saving evaluation averages")
with open(config_var['output_folder']+"/avgs.csv", 'w') as file:
    res_mean.to_csv(file)
with open(config_var['output_folder']+"/ses.csv", 'w') as file:
    res_se.to_csv(file)

[INFO] Saving evaluation averages


# Visualizations

In [141]:
print('[INFO] Loading configuration')
with open("./config.yml", 'r') as file:
    config_var = safe_load(file)["main"]

print("[INFO] Loading Rouge results")
with open(config_var["output_folder"]+"/avgs.csv", 'r') as file:
    avgs = pd.read_csv(file, index_col = 0)
with open(config_var["output_folder"]+"/ses.csv", 'r') as file:
    ses = pd.read_csv(file, index_col = 0)    

[INFO] Loading configuration
[INFO] Loading Rouge results
[INFO] Loading json data from scraping_data_NYTimes.json
[INFO] Removing articles without summary or paragraphs
[INFO] Size before cleaning: 10335
[INFO] Size after cleaning: 9480


In [31]:
for el in avgs.index:

Rouge_1
Rouge_2
Rouge_L_sentence_level
Rouge_L_summary_level


# Manual Evaluation

In [14]:
def load_clean_data(path_to_file=None):
    print("[INFO] Loading json data from", path_to_file)
    with open(path_to_file, 'r') as file:
        data = pd.DataFrame(json.load(file))

    print("[INFO] Removing articles without summary or paragraphs")
    print("[INFO] Size before cleaning:", len(data))
    data = data[(data["Summary"].map(len) >= 1)
                & (data["Paragraphs"].map(len) >= 1)]
    print("[INFO] Size after cleaning:", len(data))

    print("[INFO] Removing summaries where the first word is 'By'")
    print("[INFO] Size before cleaning:", len(data))
    data = (data[data["Summary"].apply(lambda x: (x[0:2] != "By"))])
    print("[INFO] Size after cleaning:", len(data))
    print("[INFO] Remove 'daily briefing' articles.")
    week_day = [
        "Monday:", "Tuesday:", "Wednesday:", 'Thursday:', "Friday:",
        "Saturday:", "Sunday:"
    ]
    print("[INFO] Size before cleaning:", len(data))
    data = (data[data["Summary"].apply(lambda x:
                                       (x.split(" ")[0] not in week_day))])
    print("[INFO] Size after cleaning:", len(data))
    return data.reset_index(drop=True)

In [15]:
print("\n\t\t SUMMARIZATION MANUAL EVALUATION\t\t\n")
print('[INFO] Loading configuration.')
with open("./config.yml", 'r') as file:
    config_var = safe_load(file)["main"]
print("[INFO] Loading generated summaries.")
with open(config_var["output_folder"] + "/summaries.json", 'r') as file:
    summaries = pd.DataFrame(json.load(file))

data = load_clean_data(path_to_file=str(config_var['dataset_folder']) + "/" +
                       str(config_var['data_to_use']))


		 SUMMARIZATION MANUAL EVALUATION		

[INFO] Loading configuration.
[INFO] Loading generated summaries.
[INFO] Loading json data from ./datasets/scraping_data_NYTimes.json
[INFO] Removing articles without summary or paragraphs
[INFO] Size before cleaning: 10335
[INFO] Size after cleaning: 9480
[INFO] Removing summaries where the first word is 'By'
[INFO] Size before cleaning: 9480
[INFO] Size after cleaning: 9065
[INFO] Remove 'daily briefing' articles.
[INFO] Size before cleaning: 9065
[INFO] Size after cleaning: 8972


In [1]:
##### Initiate empty dictionary
score_dict = {}
for col in summaries:
    score_dict[col] = {}
print("[INFO] Evaluation Time")
for n, idx in enumerate(
        random.sample(range(0, len(summaries)), config_var["num_eval"])):
    print("[INFO] Test number", n)
    print('[INFO] Real summary:\n\n«', data["Summary"][idx], "»\n")
    for col in summaries:
        print("[INFO] Generate summary by " + str(col) + ":\n\n«",
              summaries[col][idx], "»\n")
        while True:
            print("[INPUT] Give score [1-5]:")
            score = int(input())
            if score > 5 or score < 1:
                print("[ERROR] Please give number between 1 and 5 (included).")
                pass
            else:
                break
        score_dict[col].update({idx: score})
print("[INFO] Saving manual evaluations.")

with open(config_var['output_folder']+"/manual_evaluation.json", 'w') as file:
    json.dump(score_dict, file)

SyntaxError: invalid syntax (<ipython-input-1-2f4ba223d1d4>, line 22)