ROUGE-N added #69

Open · wants to merge 4 commits into master
18 changes: 14 additions & 4 deletions nlgeval/__init__.py
@@ -33,7 +33,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(), "METEOR"),
(Rouge(), "ROUGE_L"),
(Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]),
(Cider(), "CIDEr")
]
for scorer, method in scorers:
@@ -98,7 +98,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(), "METEOR"),
(Rouge(), "ROUGE_L"),
(Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]),
(Cider(), "CIDEr")
]
for scorer, method in scorers:
@@ -151,7 +151,7 @@ class NLGEval(object):
# Overlap
'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4',
'METEOR',
'ROUGE_L',
'ROUGE_1', 'ROUGE_2', 'ROUGE_L',
'CIDEr',

# Skip-thought
@@ -210,8 +210,18 @@ def load_scorers(self):

if 'METEOR' not in self.metrics_to_omit:
self.scorers.append((Meteor(), "METEOR"))

if 'ROUGE_L' not in self.metrics_to_omit:
self.scorers.append((Rouge(), "ROUGE_L"))
omit_rouge_i = False
for i in range(1, 2 + 1):
if 'ROUGE_{}'.format(i) in self.metrics_to_omit:
omit_rouge_i = True
if i > 1:
self.scorers.append((Rouge(i - 1), ['ROUGE_{}'.format(j) for j in range(1, i)] + ["ROUGE_L"]))
break
if not omit_rouge_i:
self.scorers.append((Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]))

if 'CIDEr' not in self.metrics_to_omit:
self.scorers.append((Cider(), "CIDEr"))

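As a reading aid for the new ROUGE branch in load_scorers above, here is a rough sketch of how the omission logic appears to behave, based only on reading the diff (illustrative, not part of the PR; NLGEval and metrics_to_omit already exist in the library):

from nlgeval import NLGEval

# Nothing omitted: the loop falls through to the final append, so the scorer
# list gets (Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]) and the returned
# dict gains ROUGE_1 and ROUGE_2 alongside ROUGE_L.
n_all = NLGEval()

# Omitting ROUGE_2 stops the loop at i == 2 and appends
# (Rouge(1), ["ROUGE_1", "ROUGE_L"]) instead.
n_rouge1 = NLGEval(metrics_to_omit=['ROUGE_2'])

# Omitting ROUGE_1 breaks out before any Rouge scorer is appended, so ROUGE_2
# and ROUGE_L are dropped as well; note that omitting 'ROUGE_L' on its own is
# no longer handled by this branch.
n_none = NLGEval(metrics_to_omit=['ROUGE_1'])
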
99 changes: 94 additions & 5 deletions nlgeval/pycocoevalcap/rouge/rouge.py
@@ -8,6 +8,7 @@
# Author : Ramakrishna Vedantam <vrama91@vt.edu>

import numpy as np
import itertools
import pdb

def my_lcs(string, sub):
@@ -33,19 +34,97 @@ def my_lcs(string, sub):

return lengths[len(string)][len(sub)]

def _get_ngrams(n, text):
"""TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py

Calculates n-grams.
Args:
n: which n-grams to calculate
text: An array of tokens
Returns:
A set of n-grams
"""
ngram_set = set()
text_length = len(text)
max_index_ngram_start = text_length - n
for i in range(max_index_ngram_start + 1):
ngram_set.add(tuple(text[i:i + n]))
return ngram_set

def _split_into_words(sentences):
"""TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py

Splits multiple sentences into words and flattens the result"""
return list(itertools.chain(*[_.split(" ") for _ in sentences]))

def _get_word_ngrams(n, sentences):
"""TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py

Calculates word n-grams for multiple sentences.
"""
assert len(sentences) > 0
assert n > 0

words = _split_into_words(sentences)
return _get_ngrams(n, words)

def rouge_n(evaluated_sentences, reference_sentences, n=2):
""" TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py

Computes ROUGE-N of two text collections of sentences.
Source: http://research.microsoft.com/en-us/um/people/cyl/download/
papers/rouge-working-note-v1.3.1.pdf
Args:
evaluated_sentences: The sentences that have been picked by the summarizer
reference_sentences: The sentences from the reference set
n: Size of ngram. Defaults to 2.
Returns:
A tuple (f1, precision, recall) for ROUGE-N
Raises:
ValueError: raises exception if a param has len <= 0
"""
if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
raise ValueError("Collections must contain at least 1 sentence.")

evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
reference_ngrams = _get_word_ngrams(n, reference_sentences)
reference_count = len(reference_ngrams)
evaluated_count = len(evaluated_ngrams)

# Gets the overlapping ngrams between evaluated and reference
overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
overlapping_count = len(overlapping_ngrams)

# Handle edge case. This isn't mathematically correct, but it's good enough
if evaluated_count == 0:
precision = 0.0
else:
precision = overlapping_count / evaluated_count

if reference_count == 0:
recall = 0.0
else:
recall = overlapping_count / reference_count

f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

# return overlapping_count / reference_count
return f1_score, precision, recall

class Rouge():
'''
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set

'''
def __init__(self):
def __init__(self, n=2):
# vrama91: updated the value below based on discussion with Hovey
self.beta = 1.2
self._n = n

def calc_score(self, candidate, refs):
"""
Compute ROUGE-L score given one candidate and references for an image
:param candidate: str : candidate sentence to be evaluated
:param candidate: list of str : candidate sentence to be evaluated
:param refs: list of str : COCO reference sentences for the particular image to be evaluated
:returns score: int (ROUGE-L score for the candidate evaluated against references)
"""
@@ -54,6 +133,12 @@ def calc_score(self, candidate, refs):
prec = []
rec = []

# Compute ROUGE-n scores
rouge_n_scores = []
for n in range(1, self._n + 1):
f_score, _, _ = rouge_n(candidate, refs, n)
rouge_n_scores.append(f_score)

# split into tokens
token_c = candidate[0].split(" ")

@@ -72,7 +157,7 @@
score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
else:
score = 0.0
return score
return rouge_n_scores + [score]

def compute_score(self, gts, res):
"""
@@ -98,8 +183,12 @@ def compute_score(self, gts, res):
assert(type(ref) is list)
assert(len(ref) > 0)

average_score = np.mean(np.array(score))
return average_score, np.array(score)
score_type = []
for s_idx, s_type in enumerate(score[0]):
score_type.append([s[s_idx] for s in score])

average_score = [np.mean(np.array(s)) for s in score_type]
return average_score, [np.array(s) for s in score_type]

def method(self):
return "Rouge"
12 changes: 6 additions & 6 deletions nlgeval/tests/test_nlgeval.py
@@ -28,7 +28,7 @@ def test_compute_metrics_oo(self):
self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5)
self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
self.assertEqual(11, len(scores))
self.assertEqual(13, len(scores))

scores = n.compute_metrics(ref_list=[
[
@@ -56,7 +56,7 @@ def test_compute_metrics_oo(self):
self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)
self.assertEqual(11, len(scores))
self.assertEqual(13, len(scores))

# Non-ASCII tests.
scores = n.compute_individual_metrics(ref=["Test en français.",
@@ -73,7 +73,7 @@ def test_compute_metrics_oo(self):
self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5)
self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5)
self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5)
self.assertEqual(11, len(scores))
self.assertEqual(13, len(scores))

scores = n.compute_individual_metrics(ref=["テスト"],
hyp="テスト")
@@ -83,7 +83,7 @@ def test_compute_metrics_oo(self):
self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
self.assertEqual(11, len(scores))
self.assertEqual(13, len(scores))

def test_compute_metrics_omit(self):
n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilairty'])
@@ -99,7 +99,7 @@ def test_compute_metrics_omit(self):
self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
self.assertEqual(7, len(scores))
self.assertEqual(9, len(scores))

def test_compute_metrics(self):
# The example from the README.
@@ -118,4 +118,4 @@ def test_compute_metrics(self):
self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5)
self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5)
self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5)

Member:
Thanks for the contribution!
Would you add some tests for the value of the ROUGE metrics?

Author:
Sorry, but I will not:

  • The code added is from another repository, not my code.
  • The scores are slightly different from pyrouge.
  • The goal of this PR is to give a quick and approximate way to get ROUGE-N scores. It should not be merged into the main branch, but kept open here.
  • For a real ROUGE-N score, someone needs to add the official Perl script ROUGE-1.5.5... I don't have time for this now :/

Member:
The other repo's code seems to be Apache-licensed; I'm not sure we can merge it, particularly without including their license. I'm not too worried about slightly different values as long as we're clear in the docs about the methods used. Are you aware of where the differences might come from?

Member:
We could at least test that the values are within some reasonable bounds.

Author:
I tried comparing these results with the results from the rouge package, but the scores are different (and not only by a few digits...). Not only are the ROUGE-N values different, but also the existing ROUGE-L.

This package is also Apache-licensed. Not sure if we can just use it without including their license (without modifying their code).

self.assertEqual(11, len(scores))
self.assertEqual(13, len(scores))
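
For context on the comparison the author mentions, a minimal check against the external package might look like this (a hypothetical sketch; it assumes the separately installed rouge package and its get_scores API, which returns per-metric f/p/r dicts, plus the import path of the patched module):

from rouge import Rouge as PyRouge  # the Apache-licensed package discussed above
from nlgeval.pycocoevalcap.rouge.rouge import rouge_n

hyp = "the cat sat on the mat"
ref = "the cat is on the mat"

# ROUGE-2 from this PR's approximate, set-based implementation.
f1, _, _ = rouge_n([hyp], [ref], n=2)

# ROUGE-2 from the external package; the gap between the two numbers is what
# the thread above is discussing.
external_f1 = PyRouge().get_scores(hyp, ref)[0]['rouge-2']['f']
print(f1, external_f1)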