-
Notifications
You must be signed in to change notification settings - Fork 0
/
summary.py
107 lines (88 loc) · 3.31 KB
/
summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
import os
# from __future__ import absolute_import
# from __future__ import division, print_function, unicode_literals
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
# from sumy.summarizers.edmundson import EdmundsonSummarizer as Summarizer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
# from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.evaluation.rouge import rouge_n, rouge_l_sentence_level
# Language name handed to sumy's Tokenizer, Stemmer and get_stop_words.
LANGUAGE = "english"
# Number of sentences in each generated summary; also the K of the first-K baseline.
SENTENCES_COUNT = 4
# Sampling divisor: only the first len(files) / SPLIT directory entries are scored.
SPLIT = 20
def evaluate(geneSen, refSen):
    """Score a generated summary against a reference summary with ROUGE.

    Args:
        geneSen: sentences of the generated (candidate) summary.
        refSen: sentences of the reference summary.

    Returns:
        A 3-tuple ``(rouge_1, rouge_2, rouge_l)`` holding the results of
        ``rouge_n`` with n=1, ``rouge_n`` with n=2 and
        ``rouge_l_sentence_level``, in that order.
    """
    unigram_score = rouge_n(geneSen, refSen, 1)
    bigram_score = rouge_n(geneSen, refSen, 2)
    lcs_score = rouge_l_sentence_level(geneSen, refSen)
    return unigram_score, bigram_score, lcs_score
def _summ_score(storyName, highlightName):
    """Summarize one story and score the summary against its highlights.

    The story file is parsed as plain text, summarized down to
    ``SENTENCES_COUNT`` sentences by the module-level ``Summarizer``
    (configured with a stemmer and stop words for ``LANGUAGE``), and the
    result is compared with the sentences of the highlight file.

    Args:
        storyName: path of the story file to summarize.
        highlightName: path of the file holding the reference highlights.

    Returns:
        The ``(rouge_1, rouge_2, rouge_l)`` tuple produced by ``evaluate``.

    Raises:
        Exception: whatever ``evaluate`` raises; the story path and the
            error are printed first, then the exception is re-raised.
    """
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    geneSen = summarizer(parser.document, SENTENCES_COUNT)

    reference = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE))
    refSen = reference.document.sentences

    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Say which story failed before propagating. Single-argument
        # print(...) behaves the same under Python 2 and Python 3.
        print(storyName)
        print(e)
        # Bare raise keeps the original traceback; `raise e` would reset it
        # to this line under Python 2.
        raise
def _firstK_score(storyName, highlightName):
    """Score the first-K-sentences baseline for one story.

    The "summary" is simply the first ``SENTENCES_COUNT`` sentences of the
    parsed story; it is compared with the sentences of the highlight file.

    Args:
        storyName: path of the story file.
        highlightName: path of the file holding the reference highlights.

    Returns:
        The ``(rouge_1, rouge_2, rouge_l)`` tuple produced by ``evaluate``.

    Raises:
        Exception: whatever ``evaluate`` raises; the story path and the
            error are printed first, then the exception is re-raised.
    """
    story = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    geneSen = story.document.sentences[:SENTENCES_COUNT]

    reference = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE))
    refSen = reference.document.sentences

    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        # Say which story failed before propagating. Single-argument
        # print(...) behaves the same under Python 2 and Python 3.
        print(storyName)
        print(e)
        # Bare raise keeps the original traceback; `raise e` would reset it
        # to this line under Python 2.
        raise
def summarizer_score(method="summ"):
    """Print average ROUGE scores over a sample of the CNN and DM stories.

    For each source ("cnn", "dm") the first ``len(files) // SPLIT`` entries
    of ``../stories/final_<source>/`` are taken; entries that do not end in
    ``.story`` are skipped. Each remaining story is scored against
    ``../stories/highlights_<source>/<name>.highlight.txt`` and the mean
    ROUGE-1, ROUGE-2 and ROUGE-L values are printed.

    NOTE: ``os.listdir`` returns entries in arbitrary order, so which files
    end up in the sample depends on the platform / file system.

    Args:
        method: ``"summ"`` scores the configured summarizer via
            ``_summ_score``; any other value scores the first-K baseline
            via ``_firstK_score``.

    Returns:
        None. All results are written to standard output.
    """
    print(method)
    score_fn = _summ_score if method == "summ" else _firstK_score

    for source in ["cnn", "dm"]:
        r1 = 0
        r2 = 0
        rl = 0
        count = 0
        story_dir = "../stories/final_" + source + "/"
        highlight_dir = "../stories/highlights_" + source + "/"
        storyFiles = os.listdir(story_dir)

        # Floor division keeps the sample size an int on Python 3 as well.
        sample_size = len(storyFiles) // SPLIT
        print(sample_size)

        for storyName in storyFiles[:sample_size]:
            if not storyName.endswith(".story"):
                continue
            # Drop only the trailing "story" (the dot stays). str.strip("story")
            # would strip the characters s/t/o/r/y from BOTH ends and mangle
            # names that happen to start with one of them.
            highlightName = storyName[:-len("story")] + "highlight.txt"
            try:
                rouge = score_fn(story_dir + storyName,
                                 highlight_dir + highlightName)
            except Exception:
                # Best effort: the scoring helpers print the failure they hit
                # in evaluate(); any story that cannot be scored is left out
                # of the averages. KeyboardInterrupt / SystemExit are not
                # Exception subclasses and therefore still propagate.
                continue
            r1 += rouge[0]
            r2 += rouge[1]
            rl += rouge[2]
            count += 1

        if count:
            averages = [total / float(count) for total in (r1, r2, rl)]
        else:
            # Nothing was scored: report NaN instead of dividing by zero.
            averages = [float("nan")] * 3

        print("Source = " + str(source))
        print("Count = " + str(count))
        print("Rouge-1 = " + str(averages[0]))
        print("Rouge-2 = " + str(averages[1]))
        print("Rouge-L = " + str(averages[2]))
if __name__ == "__main__":
    # Script entry point: evaluate the configured summarizer ("summ" is the
    # default method of summarizer_score, spelled out here for clarity).
    summarizer_score(method="summ")