## Loading libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
#nltk.download('punkt')
import re
import warnings
warnings.filterwarnings('ignore')

## Loading data

In [2]:
data = pd.read_json("data/dev.jsonl/dev-stats.jsonl", lines=True)

In [3]:
data.head()

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
0,http://www.foxsports.com/baseball/xchange/team...,http://web.archive.org/web/19980117162148id_/h...,Pro Sports Xchange notes,1970-08-20 06:01:57.162148,So sayeth Padre general manager Kevin Towers.\...,SAN DIEGO PADRES team notebook,209.0,0.8,1.2,high,medium,abstractive
1,http://www.nytimes.com/2006/06/05/technology/0...,http://web.archive.org/web/20060620021852id_/h...,India Becoming a Crucial Cog in the Machine at...,1970-08-21 04:23:40.021852,"BANGALORE, India, June 4  The world's biggest...",India provides I.B.M. with its fastest-growing...,56.045455,0.954545,16.5,high,high,extractive
2,http://www.nydailynews.com/archives/news/1995/...,http://web.archive.org/web/20110210093603id_/h...,NEW YORKERS' ONLY REGRET WAS STAYING HOME,1970-08-21 18:10:10.093603,"This story was reported by: NICK CHARLES, AUST...",As many black men marched on Washington yester...,6.152941,0.976471,24.6,low,high,extractive
3,http://mashable.com/2010/10/16/twitter-top-top...,http://web.archive.org/web/20120123100903id_/h...,Top 10 Twitter Trends This Week [CHART],1970-08-21 20:55:23.100903,Remember when everyone on Earth was glued to T...,Check out the chart to find out what the world...,14.631579,0.842105,1.368421,low,medium,abstractive
4,http://www.reuters.com/article/2011/01/31/us-i...,http://web.archive.org/web/20120321005702id_/h...,Freed American hiker summoned back by Iran court,1970-08-21 20:58:41.005702,"TEHRAN | Mon Jan 31, 2011 9:17am EST\n\nTEHRAN...",TEHRAN (Reuters) - An American woman who was f...,7.902439,1.0,39.04878,low,high,extractive


In [4]:
data.shape

(108837, 12)

In [5]:
data['text'][2]

'This story was reported by: NICK CHARLES, AUSTIN EVANS FENNER AND SAMSON MULUGETA It was written by: KAREN HUNTER\n\nTuesday, October 17th 1995, 4:20AM\n\nAs many black men marched on Washington yesterday, some New Yorkers spoke of their pride in the event and their disappointment in not being there, too.\n\n"I felt like the only black person working," said Roderick Vinson, 38, of Harlem. "That feeling made me sick to my stomach. I couldn\'t believe I missed one of the important events of my life."\n\nWinston Ford, 50, had to work, too. He makes his living selling incense and body oils in Brooklyn.\n\n"I didn\'t have the finances to make the trip," he said. "But my heart and soul is with them in Washington."\n\nFor HIV-positive Sheldon Julius of Harlem, the Million Man March was a wakeup call. Long an absentee father, he called his 15-year-old son Sunday night and for the first time ever told him that he loved him. "The calling of the march made me realize my responsibility," he said.

In [6]:
data['summary'][2]

'As many black men marched on Washington yesterday, some New Yorkers spoke of their pride in the event and their disappointment in not being there, too. "I felt like the only black person working,"said Roderick Vinson, 38, of Harlem. "That feeling made me sick to my stomach. I couldn\'t believe I missed one of the important events of my life."Winston Ford, 50, had to work, too. He makes his living selling'

In [7]:
data = data.loc[0:999, :]

In [8]:
data.shape

(1000, 12)

## Tokenizing the text

In [9]:
from nltk.tokenize import sent_tokenize
def split_sentences(text):
    sentences = []
    sentences.append(sent_tokenize(text))
    sentences = [y for x in sentences for y in x]
    return sentences

In [10]:
data = data.dropna()

## Cleaning the text

In [11]:
data['text'] = data['text'].str.lower()

In [12]:
data['sentences'] = data['text'].map(lambda x: split_sentences(x))

In [13]:
data['sentences'][2]

['this story was reported by: nick charles, austin evans fenner and samson mulugeta it was written by: karen hunter\n\ntuesday, october 17th 1995, 4:20am\n\nas many black men marched on washington yesterday, some new yorkers spoke of their pride in the event and their disappointment in not being there, too.',
 '"i felt like the only black person working," said roderick vinson, 38, of harlem.',
 '"that feeling made me sick to my stomach.',
 'i couldn\'t believe i missed one of the important events of my life."',
 'winston ford, 50, had to work, too.',
 'he makes his living selling incense and body oils in brooklyn.',
 '"i didn\'t have the finances to make the trip," he said.',
 '"but my heart and soul is with them in washington."',
 'for hiv-positive sheldon julius of harlem, the million man march was a wakeup call.',
 'long an absentee father, he called his 15-year-old son sunday night and for the first time ever told him that he loved him.',
 '"the calling of the march made me realize

In [14]:
data['clean_sentences'] = data['sentences'].replace("[^a-zA-Z]", " ")

In [15]:
data['clean_sentences'][2]

['this story was reported by: nick charles, austin evans fenner and samson mulugeta it was written by: karen hunter\n\ntuesday, october 17th 1995, 4:20am\n\nas many black men marched on washington yesterday, some new yorkers spoke of their pride in the event and their disappointment in not being there, too.',
 '"i felt like the only black person working," said roderick vinson, 38, of harlem.',
 '"that feeling made me sick to my stomach.',
 'i couldn\'t believe i missed one of the important events of my life."',
 'winston ford, 50, had to work, too.',
 'he makes his living selling incense and body oils in brooklyn.',
 '"i didn\'t have the finances to make the trip," he said.',
 '"but my heart and soul is with them in washington."',
 'for hiv-positive sheldon julius of harlem, the million man march was a wakeup call.',
 'long an absentee father, he called his 15-year-old son sunday night and for the first time ever told him that he loved him.',
 '"the calling of the march made me realize

In [16]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [17]:
#!unzip glove*.zip

## Vectorizing the text

In [18]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [19]:
len(word_embeddings)

400000

In [20]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [22]:
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [23]:
for i in range(1000):
    data['clean_sentences'][i] = [remove_stopwords(r.split()) for r in data['clean_sentences'][i]]

In [24]:
data['clean_sentences'][2]

['story reported by: nick charles, austin evans fenner samson mulugeta written by: karen hunter tuesday, october 17th 1995, 4:20am many black men marched washington yesterday, new yorkers spoke pride event disappointment there, too.',
 '"i felt like black person working," said roderick vinson, 38, harlem.',
 '"that feeling made sick stomach.',
 'believe missed one important events life."',
 'winston ford, 50, work, too.',
 'makes living selling incense body oils brooklyn.',
 '"i finances make trip," said.',
 '"but heart soul washington."',
 'hiv-positive sheldon julius harlem, million man march wakeup call.',
 'long absentee father, called 15-year-old son sunday night first time ever told loved him.',
 '"the calling march made realize responsibility," said.',
 'black new yorkers said use march organizer louis farrakhan made apologies missing rally.',
 '"farrakhan\'s wrong," said allen washington, 61, retired triborough bridge tunnel authority worker.',
 '"whites blacks need other.',
 '

In [25]:
def sentence_vectors(text):
    sentence_vectors = []
    for i in text:
        if(len(i) != 0):
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    sentence_vectors = pd.Series(sentence_vectors)
    return sentence_vectors

In [26]:
data['sentence_vectors'] = ""
for i in range(1000):
    data['sentence_vectors'][i] = sentence_vectors(data['clean_sentences'][i])

In [27]:
data.head(2)

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,sentences,clean_sentences,sentence_vectors
0,http://www.foxsports.com/baseball/xchange/team...,http://web.archive.org/web/19980117162148id_/h...,Pro Sports Xchange notes,1970-08-20 06:01:57.162148,so sayeth padre general manager kevin towers.\...,SAN DIEGO PADRES team notebook,209.0,0.8,1.2,high,medium,abstractive,[so sayeth padre general manager kevin towers....,"[sayeth padre general manager kevin towers., l...","0 [-0.16768333058102172, 0.017415263457728..."
1,http://www.nytimes.com/2006/06/05/technology/0...,http://web.archive.org/web/20060620021852id_/h...,India Becoming a Crucial Cog in the Machine at...,1970-08-21 04:23:40.021852,"bangalore, india, june 4  the world's biggest...",India provides I.B.M. with its fastest-growing...,56.045455,0.954545,16.5,high,high,extractive,"[bangalore, india, june 4  the world's bigges...","[bangalore, india, june 4  world's biggest co...","0 [0.0015724692273276374, 0.04247997191411..."


## Generating similarity matrix using cosine similarity

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
def cosine_similarity(sentences, sentence_vectors):
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    return sim_mat

In [30]:
data['sim_mat'] = ""
for i in range(1000):
    data['sim_mat'][i] = cosine_similarity(data['sentences'][i], data['sentence_vectors'][i])

## Converting similarity matrix to network graphs

In [31]:
import networkx as nx
data['nx_graph'] = data['sim_mat'].map(lambda x: nx.from_numpy_array(x))
data['scores'] = data['nx_graph'].map(lambda x: nx.pagerank(x))

In [32]:
data.head()

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,sentences,clean_sentences,sentence_vectors,sim_mat,nx_graph,scores
0,http://www.foxsports.com/baseball/xchange/team...,http://web.archive.org/web/19980117162148id_/h...,Pro Sports Xchange notes,1970-08-20 06:01:57.162148,so sayeth padre general manager kevin towers.\...,SAN DIEGO PADRES team notebook,209.0,0.8,1.2,high,medium,abstractive,[so sayeth padre general manager kevin towers....,"[sayeth padre general manager kevin towers., l...","0 [-0.16768333058102172, 0.017415263457728...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.0185185185185185, 1: 0.0185185185185185,..."
1,http://www.nytimes.com/2006/06/05/technology/0...,http://web.archive.org/web/20060620021852id_/h...,India Becoming a Crucial Cog in the Machine at...,1970-08-21 04:23:40.021852,"bangalore, india, june 4  the world's biggest...",India provides I.B.M. with its fastest-growing...,56.045455,0.954545,16.5,high,high,extractive,"[bangalore, india, june 4  the world's bigges...","[bangalore, india, june 4  world's biggest co...","0 [0.0015724692273276374, 0.04247997191411...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.01639344262295083, 1: 0.0163934426229508..."
2,http://www.nydailynews.com/archives/news/1995/...,http://web.archive.org/web/20110210093603id_/h...,NEW YORKERS' ONLY REGRET WAS STAYING HOME,1970-08-21 18:10:10.093603,"this story was reported by: nick charles, aust...",As many black men marched on Washington yester...,6.152941,0.976471,24.6,low,high,extractive,"[this story was reported by: nick charles, aus...","[story reported by: nick charles, austin evans...","0 [0.028961850342933044, 0.069482825481536...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.03846153846153845, 1: 0.0384615384615384..."
3,http://mashable.com/2010/10/16/twitter-top-top...,http://web.archive.org/web/20120123100903id_/h...,Top 10 Twitter Trends This Week [CHART],1970-08-21 20:55:23.100903,remember when everyone on earth was glued to t...,Check out the chart to find out what the world...,14.631579,0.842105,1.368421,low,medium,abstractive,[remember when everyone on earth was glued to ...,[remember everyone earth glued twitter (and tv...,"0 [-0.04966387454308343, 0.228652271354401...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)","{0: 0.08333333333333333, 1: 0.0833333333333333..."
4,http://www.reuters.com/article/2011/01/31/us-i...,http://web.archive.org/web/20120321005702id_/h...,Freed American hiker summoned back by Iran court,1970-08-21 20:58:41.005702,"tehran | mon jan 31, 2011 9:17am est\n\ntehran...",TEHRAN (Reuters) - An American woman who was f...,7.902439,1.0,39.04878,low,high,extractive,"[tehran | mon jan 31, 2011 9:17am est\n\ntehra...","[tehran | mon jan 31, 2011 9:17am est tehran (...","0 [0.07381196580201665, -0.045377295994149...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)","{0: 0.08333333333333333, 1: 0.0833333333333333..."


## Ranking the sentences

In [33]:
data['ranked_sentences'] = ""
for j in range(1000):
    data['ranked_sentences'][j] = sorted(((data['scores'][j][i],s) for i,s in enumerate(data['sentences'][j])), reverse=True)

In [34]:
data.head(2)

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,sentences,clean_sentences,sentence_vectors,sim_mat,nx_graph,scores,ranked_sentences
0,http://www.foxsports.com/baseball/xchange/team...,http://web.archive.org/web/19980117162148id_/h...,Pro Sports Xchange notes,1970-08-20 06:01:57.162148,so sayeth padre general manager kevin towers.\...,SAN DIEGO PADRES team notebook,209.0,0.8,1.2,high,medium,abstractive,[so sayeth padre general manager kevin towers....,"[sayeth padre general manager kevin towers., l...","0 [-0.16768333058102172, 0.017415263457728...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.0185185185185185, 1: 0.0185185185185185,...","[(0.0185185185185185, williams, who signed a m..."
1,http://www.nytimes.com/2006/06/05/technology/0...,http://web.archive.org/web/20060620021852id_/h...,India Becoming a Crucial Cog in the Machine at...,1970-08-21 04:23:40.021852,"bangalore, india, june 4  the world's biggest...",India provides I.B.M. with its fastest-growing...,56.045455,0.954545,16.5,high,high,extractive,"[bangalore, india, june 4  the world's bigges...","[bangalore, india, june 4  world's biggest co...","0 [0.0015724692273276374, 0.04247997191411...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.01639344262295083, 1: 0.0163934426229508...","[(0.01639344262295083, they are an acknowledgm..."


In [35]:
data.shape

(1000, 19)

## Generating the summary

In [36]:
data['extracted_summary'] = ""
for i in range(1000):
    data['extracted_summary'][i] = data['ranked_sentences'][i][0:3]
    data['extracted_summary'][i] = [x[1] for x in data['extracted_summary'][i]]
    data['extracted_summary'][i] = "".join(data['extracted_summary'][i])

## Calculating the Rouge Score

In [37]:
!pip install rouge



In [38]:
from rouge import Rouge

In [39]:
rouge = Rouge()

In [47]:
data_extractive = data.loc[data['density_bin'] == 'extractive']

In [48]:
data_extractive.shape

(355, 21)

In [49]:
data_extractive['rouge_scores'] = rouge.get_scores(data_extractive['extracted_summary'], data_extractive['summary'])

In [50]:
data_extractive.head()

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,...,density_bin,sentences,clean_sentences,sentence_vectors,sim_mat,nx_graph,scores,ranked_sentences,extracted_summary,rouge_scores
1,http://www.nytimes.com/2006/06/05/technology/0...,http://web.archive.org/web/20060620021852id_/h...,India Becoming a Crucial Cog in the Machine at...,1970-08-21 04:23:40.021852,"bangalore, india, june 4  the world's biggest...",India provides I.B.M. with its fastest-growing...,56.045455,0.954545,16.5,high,...,extractive,"[bangalore, india, june 4  the world's bigges...","[bangalore, india, june 4  world's biggest co...","0 [0.0015724692273276374, 0.04247997191411...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.01639344262295083, 1: 0.0163934426229508...","[(0.01639344262295083, they are an acknowledgm...",they are an acknowledgment of india's critical...,"{'rouge-1': {'r': 0.19047619047619047, 'p': 0...."
2,http://www.nydailynews.com/archives/news/1995/...,http://web.archive.org/web/20110210093603id_/h...,NEW YORKERS' ONLY REGRET WAS STAYING HOME,1970-08-21 18:10:10.093603,"this story was reported by: nick charles, aust...",As many black men marched on Washington yester...,6.152941,0.976471,24.6,low,...,extractive,"[this story was reported by: nick charles, aus...","[story reported by: nick charles, austin evans...","0 [0.028961850342933044, 0.069482825481536...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.03846153846153845, 1: 0.0384615384615384...","[(0.03846153846153845, winston ford, 50, had t...","winston ford, 50, had to work, too.this story ...","{'rouge-1': {'r': 0.4, 'p': 0.3333333333333333..."
4,http://www.reuters.com/article/2011/01/31/us-i...,http://web.archive.org/web/20120321005702id_/h...,Freed American hiker summoned back by Iran court,1970-08-21 20:58:41.005702,"tehran | mon jan 31, 2011 9:17am est\n\ntehran...",TEHRAN (Reuters) - An American woman who was f...,7.902439,1.0,39.04878,low,...,extractive,"[tehran | mon jan 31, 2011 9:17am est\n\ntehra...","[tehran | mon jan 31, 2011 9:17am est tehran (...","0 [0.07381196580201665, -0.045377295994149...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)","{0: 0.08333333333333333, 1: 0.0833333333333333...","[(0.08333333333333333, washington has headed a...",washington has headed a global campaign to tig...,"{'rouge-1': {'r': 0.21621621621621623, 'p': 0...."
5,http://www.reuters.com/article/2007/08/17/us-c...,http://web.archive.org/web/20120606165550id_/h...,"Breast cancer vaccine looks safe, study shows",1970-08-21 21:03:26.165550,"by maggie fox, health and science editor\n\nwa...",WASHINGTON (Reuters) - A vaccine designed to t...,17.162162,1.0,35.054054,medium,...,extractive,"[by maggie fox, health and science editor\n\nw...","[maggie fox, health science editor washington ...","0 [-0.055913176166272525, 0.22485943199862...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{0: 0.04166666666666665, 1: 0.0416666666666666...","[(0.04166666666666665, writing in the journal ...","writing in the journal of clinical oncology, t...","{'rouge-1': {'r': 0.24242424242424243, 'p': 0...."
10,http://www.bostonglobe.com/arts/music/2014/01/...,http://web.archive.org/web/20140131020936id_/h...,Music review: Jake Bugg at the House of Blues,1970-08-22 02:28:51.020936,as the lights went down at the nearly sold-out...,As the lights went down at the nearly sold-out...,2.153061,0.994898,55.658163,low,...,extractive,[as the lights went down at the nearly sold-ou...,[lights went nearly sold-out house blues satur...,"0 [0.03559721074286607, 0.03871942445684222...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)","{0: 0.09999999999999999, 1: 0.0999999999999999...","[(0.09999999999999999, “definitely not the rad...","“definitely not the radio,” said 17-year-old s...","{'rouge-1': {'r': 0.33070866141732286, 'p': 0...."


In [51]:
data_extractive['rouge_scores'][2]

{'rouge-1': {'r': 0.4, 'p': 0.3333333333333333, 'f': 0.36363635867768596},
 'rouge-2': {'r': 0.323943661971831,
  'p': 0.26744186046511625,
  'f': 0.29299362561888925},
 'rouge-l': {'r': 0.4, 'p': 0.3333333333333333, 'f': 0.36363635867768596}}

In [52]:
rouge.get_scores(data_extractive['extracted_summary'], data_extractive['summary'], avg = True)

{'rouge-1': {'r': 0.269939719749675,
  'p': 0.21149304166732436,
  'f': 0.2122847138502574},
 'rouge-2': {'r': 0.1309201425273782,
  'p': 0.09356821272510599,
  'f': 0.0958279849515522},
 'rouge-l': {'r': 0.24726011296112638,
  'p': 0.19511596105546958,
  'f': 0.19518279124559362}}