In [129]:
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import nltk
from nltk.tag.stanford import StanfordNERTagger

# Language name handed to the sumy Tokenizer, Stemmer and stop-word loader below.
LANGUAGE = "english"
# Number of sentences requested from the summarizer in the single-file example.
SENTENCES_COUNT = 20


# Summarization with SUMY Library

In [None]:
# Alternative input: parse a web page instead of the local example file.
#url = "https://nlp.stanford.edu/software/CRF-NER.shtml"
#parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

# Parse the example document from disk.
parser = PlaintextParser.from_file("Example1.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

# Algorithm choice: TextRank is active; uncomment the LSA line to switch.
#summarizer = LsaSummarizer(stemmer)
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# Keep the selected sentence objects; the NER and keyword cells filter this list.
summary = list(summarizer(parser.document, SENTENCES_COUNT))

# Summarization with NER

In [None]:
# Locations of the Stanford NER jar and the English model.
jar = './stanford-ner-tagger/stanford-ner.jar'
model = './stanford-ner-tagger/ner-model-english.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tokenize every summary sentence up front so the tagger can be invoked once
# for the whole batch (tag_sents) instead of once per sentence (tag).
tokenized_sentences = [nltk.word_tokenize(str(sentence)) for sentence in summary]
# Skip the tagger entirely when there is nothing to tag.
tagged_sentences = ner_tagger.tag_sents(tokenized_sentences) if tokenized_sentences else []

# Keep a sentence as soon as any of its tokens carries a tag other than 'O'.
summary_with_NER = [
    sentence
    for sentence, tagged_tokens in zip(summary, tagged_sentences)
    if any(tag != 'O' for _word, tag in tagged_tokens)
]
count = len(summary_with_NER)

print(f'Number of sentences in summary:{count}')

# Summarization with Keywords

In [None]:
import RAKE
import operator
import re

# Read both input files inside context managers so the handles are closed.
with open("Example1.txt", 'r') as sample_file:
    text = sample_file.read()
with open("SmartStoplist.txt", 'r') as sample_stop:
    stop = sample_stop.read()

# Load the stop-word list (split on runs of newlines) and build the stop-word regex.
stopwordlist = RAKE.load_stop_words("SmartStoplist.txt", '[\n]+')
stopwordpattern = RAKE.build_stop_word_regex(stopwordlist)

# Single-word keyword candidates and their scores for the whole document.
sentenceList = RAKE.split_sentences(text)
words = RAKE.generate_candidate_keywords(sentenceList, stopwordpattern, minCharacters=1, maxWords=1)
word_scores = RAKE.calculate_word_scores(words)
keywordcandidates = RAKE.generate_candidate_keyword_scores(words, word_scores, minFrequency = 1)

# Highest score first.
sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)

# The ten best keywords as a set for cheap membership tests.
# NOTE(review): the comparison below is case-sensitive; if RAKE lowercases its
# candidates, capitalised tokens in a sentence will never match -- confirm.
top_keywords = {keyword for keyword, _score in sortedKeywords[0:10]}

# Keep the summary sentences that contain at least one top keyword as a token.
# Each sentence is tokenized once rather than once per keyword.
summary_with_keywords = [
    sentence
    for sentence in summary
    if not top_keywords.isdisjoint(nltk.word_tokenize(str(sentence)))
]
count = len(summary_with_keywords)

print(f'Number of sentences in summary:{count}')

# Try Coreference Resolution

In [20]:
from stanfordcorenlp import StanfordCoreNLP
import json

# Starts the CoreNLP backend from the local distribution directory.
nlp = StanfordCoreNLP('./stanford-corenlp-full-2018-10-05/')

props = {'annotators': 'coref', 'pipelineLanguage': 'en'}

try:
    # Small hand-written example to exercise the coref annotator.
    text = 'Barack Obama was born in Hawaii.  He is the president of that island. Obama was elected in 2008.'
    result = json.loads(nlp.annotate(text, properties=props))

    mentions = result['corefs']

    # Annotate the parsed document one sentence at a time and print the text of
    # every mention in each coreference chain, one blank line per sentence.
    for paragraph in parser.document.paragraphs:
        for sentence in paragraph.sentences:
            # str(sentence) is the public string form used by the other cells.
            result = json.loads(nlp.annotate(str(sentence), properties=props))
            mentions = result['corefs']
            if mentions:
                for chain in mentions.values():
                    for item in chain:
                        print(item['text'])
                print()
finally:
    # Always shut the backend down, even if an annotation call fails.
    nlp.close()

# Experiment with CNN/Daily Mail Dataset

In [215]:
import os
import re

# Corpus directories, one per news source.
CNN_PATH = './cnn/'
DM_PATH = './dailymail/'

# Alphabetically ordered file names of the corpus in use (Daily Mail is active).
#listdir = sorted(os.listdir(CNN_PATH))
listdir = os.listdir(DM_PATH)
listdir.sort()

# Filled by the loading cell: cleaned texts and their reference entries.
dataset, reference = [], []

In [216]:
# Contraction rewrites, applied in this order before apostrophes are stripped.
# "won't"/"can't" come first so the generic "n't" rule cannot leave "wo"/"ca".
_CONTRACTION_RULES = (
    (r"([wW])on't", r"\1ill not"),
    (r"([cC])an't", r"\1an not"),
    (r"cannot", "can not "),
    (r"[wW]hat's", "what is"),
    (r"'ve ", " have "),
    (r"n't", " not "),
    (r"[iI]'m", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)

# Punctuation characters that are kept but surrounded by spaces.
_SPACED_PUNCTUATION = re.compile(r"([+\-/\\=^.,?!&|;()])")


def clean_text(text):
    """Normalize raw text into space-separated tokens.

    Steps, in order:
      1. expand common English contractions (e.g. "can't" -> "can not",
         "won't" -> "will not", "I'm" -> "i am");
      2. drop remaining single and double quotes;
      3. put spaces around each of ``+ - / \\ = ^ . , ? ! & | ; ( )``;
      4. collapse every run of whitespace (including newlines) to one space
         and strip the ends.

    Args:
        text: the input string.

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Quotes carry no token of their own: replace them with a space.
    text = text.replace("'", " ").replace('"', " ")

    # One pass over all spaced punctuation; the group reference re-inserts the
    # matched character, so a backslash needs no hand-written escape.
    text = _SPACED_PUNCTUATION.sub(r" \1 ", text)

    # split() with no argument also swallows newlines and tabs.
    return ' '.join(text.split())

In [217]:
# Single pass over the sorted listing: files whose name ends in 't' are cleaned
# and appended to `dataset`; files whose name ends in 'm' are cleaned and
# appended to `reference`, wrapped in two list levels.
for filename in listdir:
    suffix = filename[-1]
    if suffix not in ('t', 'm'):
        continue
    with open(DM_PATH+filename) as f:
        text = clean_text(f.read())
    if suffix == 't':
        dataset.append(text)
    else:
        reference.append([[text]])

# LSA / TextRank

In [218]:
stemmer = Stemmer(LANGUAGE)

# Algorithm choice: LSA is active; uncomment the TextRank line to switch.
summarizer = LsaSummarizer(stemmer)
#summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# For each of the first 1000 texts, ask the summarizer for 3 sentences and
# store them as a list of strings.
summary = []
for text in dataset[:1000]:
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summ = [str(sentence) for sentence in summarizer(parser.document, 3)]
    summary.append(summ)

# Considering NER in summarization

In [None]:
stemmer = Stemmer(LANGUAGE)

# Algorithm choice: TextRank is active; uncomment the LSA line to switch.
#summarizer = LsaSummarizer(stemmer)
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# Ask the summarizer for 5 sentences per text; sentence objects are kept as-is.
summaries = []
for text in dataset:
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summaries.append(list(summarizer(parser.document, 5)))

# Locations of the Stanford NER jar and the English model.
jar = './stanford-ner-tagger/stanford-ner.jar'
model = './stanford-ner-tagger/ner-model-english.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

summary_with_NER = []
# The loop variable is deliberately NOT called `summary`: that name holds the
# list scored by the ROUGE cell and must not be overwritten here.
for count, doc_summary in enumerate(summaries, start=1):
    print(count)  # progress indicator
    # Tag all sentences of this document in one tagger call (tag_sents)
    # instead of one call per sentence (tag).
    tokenized_sentences = [nltk.word_tokenize(str(sentence)) for sentence in doc_summary]
    tagged_sentences = ner_tagger.tag_sents(tokenized_sentences) if tokenized_sentences else []
    # Keep a sentence as soon as any of its tokens carries a tag other than 'O'.
    summ_with_NER = [
        str(sentence)
        for sentence, tagged_tokens in zip(doc_summary, tagged_sentences)
        if any(tag != 'O' for _word, tag in tagged_tokens)
    ]
    # NOTE(review): each entry is wrapped in an extra list, unlike `summary`
    # built by the LSA / TextRank cell -- confirm the consumer expects this shape.
    summary_with_NER.append([summ_with_NER])

# Considering Keywords in Summarization

In [None]:
import RAKE
import operator
import re

# Read the stop-word file inside a context manager so the handle is closed.
with open("SmartStoplist.txt", 'r') as sample_stop:
    stop = sample_stop.read()

# Load the stop-word list (split on runs of newlines) and build the stop-word regex.
stopwordlist = RAKE.load_stop_words("SmartStoplist.txt", '[\n]+')
stopwordpattern = RAKE.build_stop_word_regex(stopwordlist)

stemmer = Stemmer(LANGUAGE)

# Algorithm choice: TextRank is active; uncomment the LSA line to switch.
#summarizer = LsaSummarizer(stemmer)
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

# Ask the summarizer for 5 sentences per text; sentence objects are kept as-is.
summaries = []
for text in dataset:
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summaries.append(list(summarizer(parser.document, 5)))

summary_with_keywords = []
# The loop variable is deliberately NOT called `summary`: that name holds the
# list scored by the ROUGE cell and must not be overwritten here.
for count, (doc_summary, text) in enumerate(zip(summaries, dataset), start=1):
    print(count)  # progress indicator

    # Single-word keyword candidates for this text, highest score first.
    sentenceList = RAKE.split_sentences(text)
    words = RAKE.generate_candidate_keywords(sentenceList, stopwordpattern, minCharacters=1, maxWords=1)
    word_scores = RAKE.calculate_word_scores(words)
    keywordcandidates = RAKE.generate_candidate_keyword_scores(words, word_scores, minFrequency = 1)
    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)

    # The ten best keywords as a set for cheap membership tests.
    # NOTE(review): the comparison below is case-sensitive; if RAKE lowercases
    # its candidates, capitalised tokens will never match -- confirm.
    top_keywords = {keyword for keyword, _score in sortedKeywords[0:10]}

    # Keep sentences containing at least one top keyword as a token; each
    # sentence is tokenized once rather than once per keyword.
    summ_with_keywords = [
        str(sentence)
        for sentence in doc_summary
        if not top_keywords.isdisjoint(nltk.word_tokenize(str(sentence)))
    ]
    # NOTE(review): each entry is wrapped in an extra list, unlike `summary`
    # built by the LSA / TextRank cell -- confirm the consumer expects this shape.
    summary_with_keywords.append([summ_with_keywords])

# Evaluate with the automatic evaluation metric ROUGE

In [None]:
from pythonrouge.pythonrouge import Pythonrouge

# Use exactly as many references as there are system summaries, so the slice
# follows whatever cap the summarization cell applied instead of repeating it.
references_for_summary = reference[:len(summary)]

# ROUGE-1..4 and ROUGE-L, recall only, with stemming and stop-word removal.
rouge = Pythonrouge(summary_file_exist=False,
                    summary=summary, reference=references_for_summary,
                    n_gram=4, ROUGE_SU4=False, ROUGE_L=True,recall_only=True, 
                    stemming=True, stopwords=True,word_level=True,use_cf=False, 
                    cf=95, scoring_formula='average',p=0.5)
score = rouge.calc_score()
print(score)