In [1]:
import pandas as pd
import numpy as np
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from gensim import parsing
import re

In [2]:
!pip install rouge
from rouge import Rouge
rouge = Rouge()

from collections import Counter

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0m

In [3]:
df_train = pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")

df_train

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
287108,fffdfb56fdf1a12d364562cc2b9b1d4de7481dee,By . James Rush . Former first daughter Chelse...,Chelsea Clinton said question of running for o...
287109,fffeecb8690b85de8c3faed80adbc7a978f9ae2a,An apologetic Vanilla Ice has given his first ...,"Vanilla Ice, 47 - real name Robert Van Winkle ..."
287110,ffff5231e4c71544bc6c97015cdb16c60e42b3f4,America's most lethal sniper claimed he wished...,America's most lethal sniper made comment in i...
287111,ffff924b14a8d82058b6c1c5368ff1113c1632af,"By . Sara Malm . PUBLISHED: . 12:19 EST, 8 Mar...",A swarm of more than one million has crossed b...


In [4]:
number_doc = 30000
abstract = []
introduction = []
cnt = 0

for index, row in df_train.iterrows():
    cnt += 1
    abstract.append(row['highlights'])
    introduction.append(row['article'])
    if cnt == number_doc:
        break

abstract = np.array(abstract)
introduction = np.array(introduction)

In [5]:
abstract[43]

'Heidi and Maryellen Olson conceived first daughter using donor sperm .\nHeidi gave birth to Sequoia on Nov 7, Maryellen wanted to breastfeed too .\nMaryellen followed a program of birth control pills and a milk-enhancing drug for three months before the birth at home in Santa Cruz, California .\nNow, both mothers nurse their daughter, claim it enhances their closeness .'

In [6]:
introduction[43]

'It is one of the key bonding experiences for a mother and child: breastfeeding. So lesbian partners Heidi and Maryellen Olson wanted to share that with their first-born Sequoia. They both nurse their one-month-old daughter. Phenomenon: Maryellen Olson, 25, nurses her daughter Sequoia despite not giving birth . Both breastfeeding: Her wife Heidi, 26, naturally lactates meaning they can both nurse their first-born . Dubbed \'co-nursing\', the little-known practice requires Maryellen, who didn\'t give birth, to induce lactation and take a natural drug to boost breastmilk. According to the new parents, it has changed everything. \'It is so worth it for the closeness I feel with Sequoia, and also for the sanity-saving it provides both of us,\' graduate student Maryellen, 25, told MailOnline from the family\'s home in Santa Cruz, California. \'We both feel pretty amazed. \'It\'s amazing to see what our bodies can do, and we felt lucky to have this additional bonding experience available for

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import string

# tokens = word_tokenize(text.lower())

# stop_words = set(stopwords.words('english'))
# filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def get_Sentences(introduction):
    sentences = sent_tokenize(introduction.lower())
    sentences = [s for s in sentences if len(s.strip()) > 1]
    return sentences

In [9]:
def preprocessing(sentences):

    # remove_emails = lambda s: re.sub(r'^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$', '', s)
    # remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', '', s)

    CLEAN_FILTERS = [
                    #remove_emails,
                    strip_tags,
                    strip_numeric, 
                    strip_multiple_whitespaces]

    processed_words = preprocess_string(sentences, CLEAN_FILTERS)
    text = ' '.join(processed_words)
    tokens = word_tokenize(text.lower())
    
    stop_words = set(stopwords.words('english'))
    stop_words.add('\x1a')
    punctuations = set(string.punctuation)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    return filtered_tokens

In [10]:
def get_word_freq(tokens):
    word_freq = Counter(tokens)
    #print(word_freq)
    total_words = sum(word_freq.values())
    #print(total_words)
    return word_freq, total_words

In [11]:
def get_word_pd(word_freq,total_words):
    word_pd = {word: count / total_words for word, count in word_freq.items()}
    #print(word_pd)
    return word_pd

In [12]:
# Calculate the sentence scores based on KL divergence
# doc * log (doc/sentences)
def KL(sentences,word_pd):
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        sentence_words = preprocessing(sentence.lower())
        sentence_word_freq = Counter(sentence_words)
        sent_ps = {word: count/len(sentence_words) for word, count in sentence_word_freq.items()}

        #check kl divergence value for each sentence and store it in an array
        score = 0
        for word in sentence_words:
            if word in word_pd and word in sent_ps:
                score += word_pd[word] * np.log(word_pd[word]/sent_ps[word])

        sentence_scores[i] = score
    return sentence_scores

In [13]:
def get_Summary(sentence_scores,no_sentences):
  # all_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
  # print(all_sentences)
  # Sort the sentences in descending order of scores and select the top summary_size sentences
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:no_sentences]
    summary_sentences = sorted(summary_sentences)
    # print(summary_sentences)
    summary = ' '.join([sentences[i] for i in summary_sentences])
    return summary

In [14]:
def get_rouge_score(summary, abstract):
    scores = rouge.get_scores(summary, abstract)
    return scores

In [15]:
def preprocessing_summaries(sentences):

    # remove_emails = lambda s: re.sub(r'^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$', '', s)
    # remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', '', s)

    CLEAN_FILTERS = [
                    #remove_emails,
                    strip_tags,
                    strip_numeric, 
                    strip_multiple_whitespaces]

    processed_words = preprocess_string(sentences, CLEAN_FILTERS)
    text = ' '.join(processed_words)
    tokens = word_tokenize(text.lower())
    
    stop_words = set(stopwords.words('english'))
    stop_words.add('\x1a')
    punctuations = set(string.punctuation)
    filtered_tokens = ' '.join(token for token in tokens if token.lower() not in stop_words)
    
    return filtered_tokens

In [16]:
import warnings
warnings.filterwarnings('ignore')

columns = ['summary','gold_summary','ROUGE-1 Precision','ROUGE-1 Recall','ROUGE-1 F1','ROUGE-L Precision','ROUGE-L Recall','ROUGE-L F1']
df = pd.DataFrame(columns=columns)

for i in range(len(abstract)):

    sentences = get_Sentences(introduction[i])
    tokens = preprocessing(introduction[i])
    word_freq, total_words = get_word_freq(tokens)
    word_pd = get_word_pd(word_freq,total_words)
    sentence_scores = KL(sentences,word_pd)
    summary = get_Summary(sentence_scores,6)
    # print(summary)
    gold_summary = abstract[i].lower()
    #print(i)
    scores = get_rouge_score(preprocessing_summaries(summary), preprocessing_summaries(gold_summary))
    all_scores = {
    'summary': summary,
    'gold_summary':gold_summary,
    'ROUGE-1 Precision': scores[0]['rouge-1']['p'],
    'ROUGE-1 Recall': scores[0]['rouge-1']['r'],
    'ROUGE-1 F1': scores[0]['rouge-1']['f'],
    'ROUGE-L Precision': scores[0]['rouge-l']['p'],
    'ROUGE-L Recall': scores[0]['rouge-l']['r'],
    'ROUGE-L F1': scores[0]['rouge-l']['f']}
    new_row = pd.DataFrame(all_scores, index=[0])
    df = df.append(new_row, ignore_index=True)


In [17]:
df

Unnamed: 0,summary,gold_summary,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 F1,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L F1
0,associated press . published: . | . updated: ....,"bishop john folda, of north dakota, is taking ...",0.029412,0.045455,0.035714,0.029412,0.045455,0.035714
1,(cnn) -- ralph mata was an internal affairs li...,criminal complaint: cop used his role to help ...,0.147727,0.464286,0.224138,0.147727,0.464286,0.224138
2,she died later from her injuries . miss titley...,"craig eccleston-todd, 27, had drunk at least t...",0.108108,0.097561,0.102564,0.108108,0.097561,0.102564
3,"sixty years prior, ukraine's breakaway peninsu...",nina dos santos says europe must be ready to a...,0.089286,0.156250,0.113636,0.071429,0.125000,0.090909
4,the cod army are playing in the third tier for...,fleetwood top of league one after 2-0 win at s...,0.102941,0.162791,0.126126,0.102941,0.162791,0.126126
...,...,...,...,...,...,...,...,...
29995,mahatma ghandi is spliced with wise jedi teach...,sculptor mike leavitt says he was inspired by ...,0.020408,0.045455,0.028169,0.020408,0.045455,0.028169
29996,sara smyth . staff have not been paid since au...,defunct airline grounded in october 2012 over ...,0.076923,0.235294,0.115942,0.076923,0.235294,0.115942
29997,join the club. they offered up the spoiler foi...,"valentine day's weekend was binge time for ""ho...",0.078947,0.100000,0.088235,0.052632,0.066667,0.058824
29998,(cnn) -- a plane crashed early sunday in the s...,new: official says the pilot saved lives by gu...,0.243902,0.588235,0.344828,0.207317,0.500000,0.293103


In [18]:
sorted_df = df.sort_values(by=['ROUGE-1 F1'], ascending=False)
sorted_df.reset_index(drop=True)

top_5_rows = sorted_df.head(5)

top_5_rows

Unnamed: 0,summary,gold_summary,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 F1,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L F1
17285,police say a maryland teenager eating a pot br...,student gave teacher a piece when asked for a ...,0.632653,0.911765,0.746988,0.612245,0.882353,0.722892
4361,scroll down for video . the baby died at its h...,the baby died at its home in fort plains on tu...,0.557377,0.971429,0.708333,0.557377,0.971429,0.708333
19765,it shows a ‘worryingly vast variation’ in acce...,reports shows a ‘worryingly vast variation’ in...,0.561404,0.941176,0.703297,0.561404,0.941176,0.703297
7028,a french physician who treated michael schumac...,"michael schumacher is no longer in a coma, say...",0.592593,0.864865,0.703297,0.537037,0.783784,0.637363
13039,ex-nba star michael jordan advised tom watson...,the chicago bulls star has been supporting tea...,0.568182,0.806452,0.666667,0.568182,0.806452,0.666667


In [19]:
avg_precision = df['ROUGE-1 Precision'].mean()
avg_recall = df['ROUGE-1 Recall'].mean()
avg_f1 = df['ROUGE-1 F1'].mean()
avg_l_precision = df['ROUGE-L Precision'].mean()
avg_l_recall = df['ROUGE-L Recall'].mean()
avg_l_f1 = df['ROUGE-L F1'].mean()

# Print the results
print("Average ROUGE-1 Precision: ", avg_precision)
print("Average ROUGE-1 Recall: ", avg_recall)
print("Average ROUGE-1 F1: ", avg_f1)
print("Average ROUGE-L Precision: ", avg_l_precision)
print("Average ROUGE-L Recall: ", avg_l_recall)
print("Average ROUGE-L F1: ", avg_l_f1)

Average ROUGE-1 Precision:  0.12596687501775447
Average ROUGE-1 Recall:  0.28115178094734683
Average ROUGE-1 F1:  0.16798822862813623
Average ROUGE-L Precision:  0.11831457232961373
Average ROUGE-L Recall:  0.2638127356956711
Average ROUGE-L F1:  0.15770930599183394


In [20]:
df['summary'][17285]

"police say a maryland teenager eating a pot brownie in class panicked when his teacher asked him for a piece of the treat, and that he is now charged for obliging. anne arundel county police said tuesday that the 17-year-old didn't tell the teacher that the brownie contained marijuana. she began feeling ill and acting disoriented, and was brought to the nurse's office at broadneck high school in annapolis on monday. police say a maryland teenager eating a pot brownie in class at broadneck high school panicked when his teacher asked him for a piece of the treat, and that he is now charged for obliging . police did not release his name, and they say he was released to a guardian. the school is taking action through its own code of conduct separately."

In [21]:
df['gold_summary'][17285]

"student gave teacher a piece when asked for a bite of the treat .\nanne arundel county police said tuesday that the 17-year-old didn't tell the teacher that the brownie contained marijuana .\nteacher began feeling ill and acting disoriented, and was brought to the nurse's office at broadneck high school in annapolis on monday ."

In [22]:
introduction[17285]

"Police say a Maryland teenager eating a pot brownie in class panicked when his teacher asked him for a piece of the treat, and that he is now charged for obliging. Anne Arundel County police said Tuesday that the 17-year-old didn't tell the teacher that the brownie contained marijuana. She began feeling ill and acting disoriented, and was brought to the nurse's office at Broadneck High School in Annapolis on Monday. Police say a Maryland teenager eating a pot brownie in class at Broadneck High School panicked when his teacher asked him for a piece of the treat, and that he is now charged for obliging . The teacher told police that the student had given her the brownie during third period and that she suspected it contained marijuana. The student, from nearby Arnold, was charged with juvenile counts of administering a dangerous substance, assault and reckless endangerment. Police did not release his name, and they say he was released to a guardian. The school is taking action through i