In [1]:
!pip install nltk
!pip install rouge-score
!pip install transformers
!pip install bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=95f300684f4ce1ba9b830fdfe9a2b30bdf3f35c2c9815dd58f2eb54478ebdb45
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [2]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import BERTScorer
import pprint
import math

# Make sure the NLTK data packages this notebook relies on are present locally
for nltk_resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)


# Scorer objects are costly to construct (BERTScorer loads a transformer model),
# so each one is built on first use and then shared by every scoreCalculator call.
_SCORER_CACHE = {}


def _get_rouge_scorer():
    """Return the shared ROUGE-1/2/L scorer (stemming enabled), creating it on first use."""
    if "rouge" not in _SCORER_CACHE:
        _SCORER_CACHE["rouge"] = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        )
    return _SCORER_CACHE["rouge"]


def _get_bert_scorer():
    """Return the shared bert-base-uncased BERTScorer, creating it on first use."""
    if "bert" not in _SCORER_CACHE:
        _SCORER_CACHE["bert"] = BERTScorer(
            model_type='bert-base-uncased', lang="en", rescale_with_baseline=True
        )
    return _SCORER_CACHE["bert"]


def scoreCalculator(referenceText, hypothesisText, smoothing_function=None, verbose=True):
    """Compute BLEU, METEOR, ROUGE-1/2/L and BERTScore for one hypothesis/reference pair.

    Args:
        referenceText: The reference text. Must be a ``str``.
        hypothesisText: The text to evaluate. Any non-string value (for example a
            NaN read from an empty CSV cell) is reported and scored as ``""``.
        smoothing_function: Optional smoothing callable forwarded to NLTK's
            ``sentence_bleu``. ``None`` (the default) keeps unsmoothed BLEU, which
            is 0 whenever some n-gram order has no overlap.
        verbose: When true (the default), print a "Score Summary" of every metric.

    Returns:
        dict: Metric name -> value rounded to two decimals, with the keys
        ``BLEU``, ``METEOR``, ``ROUGE{1,2,L}_{precision,recall,F1}`` and
        ``BERT_{precision,recall,F1}``. BERTScore values are baseline-rescaled.

    Raises:
        TypeError: If ``referenceText`` is not a string.
    """
    if not isinstance(referenceText, str):
        # Fail here with a readable message instead of deep inside the tokenizer.
        raise TypeError(
            f"referenceText must be a str, got {type(referenceText).__name__}"
        )
    if not isinstance(hypothesisText, str):
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        print(
            f"Non-string hypothesis ({type(hypothesisText).__name__}) entered; "
            "scoring it as an empty string"
        )
        hypothesisText = ""

    # BLEU and METEOR operate on token lists; ROUGE and BERTScore take raw text.
    hypothesis_tokens = word_tokenize(hypothesisText)
    reference_tokens = word_tokenize(referenceText)

    bleu = sentence_bleu(
        [reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function
    )
    meteor = meteor_score([reference_tokens], hypothesis_tokens)
    rouge = _get_rouge_scorer().score(referenceText, hypothesisText)
    # BERTScorer.score takes (candidates, references) and returns one-element tensors here.
    bert_p, bert_r, bert_f1 = _get_bert_scorer().score([hypothesisText], [referenceText])

    scoreDict = {
        "BLEU": round(bleu, 2),
        "METEOR": round(meteor, 2),
    }
    for rouge_key, label in (("rouge1", "ROUGE1"), ("rouge2", "ROUGE2"), ("rougeL", "ROUGEL")):
        rouge_result = rouge[rouge_key]
        scoreDict[f"{label}_precision"] = round(rouge_result.precision, 2)
        scoreDict[f"{label}_recall"] = round(rouge_result.recall, 2)
        scoreDict[f"{label}_F1"] = round(rouge_result.fmeasure, 2)
    scoreDict["BERT_precision"] = round(bert_p.item(), 2)
    scoreDict["BERT_recall"] = round(bert_r.item(), 2)
    scoreDict["BERT_F1"] = round(bert_f1.item(), 2)

    if verbose:
        print("Score Summary:")
        for key, value in scoreDict.items():
            print(f"{key}: {value}")

    return scoreDict


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
import pandas as pd

# Load the generated summaries and the reference tables for both language pairs,
# reading the four CSV files in the order the target names are listed.
ES_hypothesis_df, ES_reference_df, JP_hypothesis_df, JP_reference_df = map(
    pd.read_csv,
    (
        "summarization_BART_ES.csv",
        "csvFiles/papers_EStoEN.csv",
        "summarization_BART_JP.csv",
        "csvFiles/papers_JPtoEN.csv",
    ),
)

In [None]:
# Reference texts come from the 'contribution_translated' column of each reference table
ES_referenceText, JP_referenceText = (
    reference_df['contribution_translated']
    for reference_df in (ES_reference_df, JP_reference_df)
)

# Hypothesis texts: the 'BART' and 'BART_ft' columns of each hypothesis table
ES_hypothesis_BART, ES_hypothesis_BART_ft = (
    ES_hypothesis_df[column] for column in ('BART', 'BART_ft')
)
JP_hypothesis_BART, JP_hypothesis_BART_ft = (
    JP_hypothesis_df[column] for column in ('BART', 'BART_ft')
)

In [None]:
def evaluator(referenceSeries, hypothesisSeries):
    """Score every reference/hypothesis pair and collect the results in a DataFrame.

    Pairs are matched by position (first with first, second with second, ...),
    regardless of any index labels the inputs carry.

    Args:
        referenceSeries: Sized iterable of reference texts (e.g. a pandas Series or list).
        hypothesisSeries: Sized iterable of hypothesis texts of the same length.

    Returns:
        pandas.DataFrame: One row per pair, one column per key of the dict
        returned by ``scoreCalculator``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(referenceSeries) == len(hypothesisSeries), (
        f"reference/hypothesis length mismatch: "
        f"{len(referenceSeries)} != {len(hypothesisSeries)}"
    )

    # zip() walks both inputs positionally. Indexing a pandas Series with `series[i]`
    # is a label lookup, which breaks (or mispairs rows) when the index is not 0..n-1.
    score_rows = [
        scoreCalculator(reference, hypothesis)
        for reference, hypothesis in zip(referenceSeries, hypothesisSeries)
    ]

    return pd.DataFrame(score_rows)

In [None]:
# Score both BART variants against the references: Japanese set first, then Spanish
df_JP_BART, df_JP_BART_ft = (
    evaluator(JP_referenceText, jp_hypothesis)
    for jp_hypothesis in (JP_hypothesis_BART, JP_hypothesis_BART_ft)
)

df_ES_BART, df_ES_BART_ft = (
    evaluator(ES_referenceText, es_hypothesis)
    for es_hypothesis in (ES_hypothesis_BART, ES_hypothesis_BART_ft)
)


Score Summary:
BLEU: 0.1
METEOR: 0.3
ROUGE1_precision: 0.65
ROUGE1_recall: 0.38
ROUGE1_F1: 0.48
ROUGE2_precision: 0.31
ROUGE2_recall: 0.18
ROUGE2_F1: 0.22
ROUGEL_precision: 0.49
ROUGEL_recall: 0.29
ROUGEL_F1: 0.36
BERT_precision: 0.48
BERT_recall: 0.44
BERT_F1: 0.46


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.26
ROUGE1_precision: 0.25
ROUGE1_recall: 0.43
ROUGE1_F1: 0.32
ROUGE2_precision: 0.08
ROUGE2_recall: 0.14
ROUGE2_F1: 0.1
ROUGEL_precision: 0.18
ROUGEL_recall: 0.3
ROUGEL_F1: 0.22
BERT_precision: 0.27
BERT_recall: 0.39
BERT_F1: 0.33


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.12
ROUGE1_precision: 0.08
ROUGE1_recall: 0.3
ROUGE1_F1: 0.12
ROUGE2_precision: 0.01
ROUGE2_recall: 0.05
ROUGE2_F1: 0.02
ROUGEL_precision: 0.06
ROUGEL_recall: 0.26
ROUGEL_F1: 0.1
BERT_precision: 0.16
BERT_recall: 0.23
BERT_F1: 0.19


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.22
ROUGE1_precision: 0.32
ROUGE1_recall: 0.34
ROUGE1_F1: 0.33
ROUGE2_precision: 0.0
ROUGE2_recall: 0.0
ROUGE2_F1: 0.0
ROUGEL_precision: 0.19
ROUGEL_recall: 0.21
ROUGEL_F1: 0.2
BERT_precision: 0.31
BERT_recall: 0.37
BERT_F1: 0.34
Score Summary:
BLEU: 0.07
METEOR: 0.33
ROUGE1_precision: 0.25
ROUGE1_recall: 0.45
ROUGE1_F1: 0.33
ROUGE2_precision: 0.1
ROUGE2_recall: 0.18
ROUGE2_F1: 0.13
ROUGEL_precision: 0.22
ROUGEL_recall: 0.38
ROUGEL_F1: 0.28
BERT_precision: 0.33
BERT_recall: 0.46
BERT_F1: 0.39


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.11
ROUGE1_precision: 0.32
ROUGE1_recall: 0.23
ROUGE1_F1: 0.26
ROUGE2_precision: 0.11
ROUGE2_recall: 0.08
ROUGE2_F1: 0.09
ROUGEL_precision: 0.29
ROUGEL_recall: 0.2
ROUGEL_F1: 0.24
BERT_precision: 0.43
BERT_recall: 0.43
BERT_F1: 0.43
Score Summary:
BLEU: 0.1
METEOR: 0.52
ROUGE1_precision: 0.27
ROUGE1_recall: 0.7
ROUGE1_F1: 0.39
ROUGE2_precision: 0.16
ROUGE2_recall: 0.42
ROUGE2_F1: 0.23
ROUGEL_precision: 0.17
ROUGEL_recall: 0.45
ROUGEL_F1: 0.25
BERT_precision: 0.3
BERT_recall: 0.58
BERT_F1: 0.42
Score Summary:
BLEU: 0.08
METEOR: 0.33
ROUGE1_precision: 0.33
ROUGE1_recall: 0.48
ROUGE1_F1: 0.39
ROUGE2_precision: 0.12
ROUGE2_recall: 0.17
ROUGE2_F1: 0.14
ROUGEL_precision: 0.19
ROUGEL_recall: 0.28
ROUGEL_F1: 0.23
BERT_precision: 0.32
BERT_recall: 0.32
BERT_F1: 0.32


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.38
ROUGE1_precision: 0.28
ROUGE1_recall: 0.46
ROUGE1_F1: 0.35
ROUGE2_precision: 0.13
ROUGE2_recall: 0.22
ROUGE2_F1: 0.16
ROUGEL_precision: 0.21
ROUGEL_recall: 0.33
ROUGEL_F1: 0.25
BERT_precision: 0.34
BERT_recall: 0.39
BERT_F1: 0.36


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.19
ROUGE1_precision: 0.14
ROUGE1_recall: 0.26
ROUGE1_F1: 0.18
ROUGE2_precision: 0.02
ROUGE2_recall: 0.04
ROUGE2_F1: 0.03
ROUGEL_precision: 0.1
ROUGEL_recall: 0.19
ROUGEL_F1: 0.13
BERT_precision: 0.23
BERT_recall: 0.2
BERT_F1: 0.22
Score Summary:
BLEU: 0.43
METEOR: 0.72
ROUGE1_precision: 0.82
ROUGE1_recall: 0.71
ROUGE1_F1: 0.76
ROUGE2_precision: 0.57
ROUGE2_recall: 0.5
ROUGE2_F1: 0.53
ROUGEL_precision: 0.76
ROUGEL_recall: 0.67
ROUGEL_F1: 0.71
BERT_precision: 0.78
BERT_recall: 0.76
BERT_F1: 0.77


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.25
ROUGE1_precision: 0.21
ROUGE1_recall: 0.47
ROUGE1_F1: 0.29
ROUGE2_precision: 0.08
ROUGE2_recall: 0.17
ROUGE2_F1: 0.11
ROUGEL_precision: 0.16
ROUGEL_recall: 0.37
ROUGEL_F1: 0.23
BERT_precision: 0.24
BERT_recall: 0.36
BERT_F1: 0.3


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.19
ROUGE1_precision: 0.15
ROUGE1_recall: 0.3
ROUGE1_F1: 0.2
ROUGE2_precision: 0.02
ROUGE2_recall: 0.05
ROUGE2_F1: 0.03
ROUGEL_precision: 0.11
ROUGEL_recall: 0.22
ROUGEL_F1: 0.14
BERT_precision: 0.2
BERT_recall: 0.2
BERT_F1: 0.2
Score Summary:
BLEU: 0.16
METEOR: 0.47
ROUGE1_precision: 0.38
ROUGE1_recall: 0.45
ROUGE1_F1: 0.41
ROUGE2_precision: 0.15
ROUGE2_recall: 0.18
ROUGE2_F1: 0.16
ROUGEL_precision: 0.24
ROUGEL_recall: 0.28
ROUGEL_F1: 0.25
BERT_precision: 0.49
BERT_recall: 0.51
BERT_F1: 0.5
Score Summary:
BLEU: 0.18
METEOR: 0.44
ROUGE1_precision: 0.63
ROUGE1_recall: 0.41
ROUGE1_F1: 0.5
ROUGE2_precision: 0.39
ROUGE2_recall: 0.25
ROUGE2_F1: 0.3
ROUGEL_precision: 0.63
ROUGEL_recall: 0.41
ROUGEL_F1: 0.5
BERT_precision: 0.59
BERT_recall: 0.51
BERT_F1: 0.55


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.16
ROUGE1_precision: 0.33
ROUGE1_recall: 0.28
ROUGE1_F1: 0.3
ROUGE2_precision: 0.09
ROUGE2_recall: 0.08
ROUGE2_F1: 0.08
ROUGEL_precision: 0.24
ROUGEL_recall: 0.2
ROUGEL_F1: 0.22
BERT_precision: 0.39
BERT_recall: 0.43
BERT_F1: 0.41


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.23
ROUGE1_precision: 0.17
ROUGE1_recall: 0.35
ROUGE1_F1: 0.23
ROUGE2_precision: 0.05
ROUGE2_recall: 0.11
ROUGE2_F1: 0.07
ROUGEL_precision: 0.1
ROUGEL_recall: 0.2
ROUGEL_F1: 0.13
BERT_precision: 0.18
BERT_recall: 0.27
BERT_F1: 0.23


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.25
ROUGE1_precision: 0.49
ROUGE1_recall: 0.35
ROUGE1_F1: 0.41
ROUGE2_precision: 0.08
ROUGE2_recall: 0.06
ROUGE2_F1: 0.07
ROUGEL_precision: 0.23
ROUGEL_recall: 0.17
ROUGEL_F1: 0.19
BERT_precision: 0.38
BERT_recall: 0.28
BERT_F1: 0.33


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.19
ROUGE1_precision: 0.23
ROUGE1_recall: 0.25
ROUGE1_F1: 0.24
ROUGE2_precision: 0.04
ROUGE2_recall: 0.04
ROUGE2_F1: 0.04
ROUGEL_precision: 0.15
ROUGEL_recall: 0.17
ROUGEL_F1: 0.16
BERT_precision: 0.25
BERT_recall: 0.35
BERT_F1: 0.3


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.37
ROUGE1_precision: 0.39
ROUGE1_recall: 0.44
ROUGE1_F1: 0.41
ROUGE2_precision: 0.23
ROUGE2_recall: 0.27
ROUGE2_F1: 0.25
ROUGEL_precision: 0.26
ROUGEL_recall: 0.3
ROUGEL_F1: 0.28
BERT_precision: 0.52
BERT_recall: 0.6
BERT_F1: 0.56


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.18
ROUGE1_precision: 0.22
ROUGE1_recall: 0.24
ROUGE1_F1: 0.23
ROUGE2_precision: 0.0
ROUGE2_recall: 0.0
ROUGE2_F1: 0.0
ROUGEL_precision: 0.17
ROUGEL_recall: 0.18
ROUGEL_F1: 0.18
BERT_precision: 0.25
BERT_recall: 0.22
BERT_F1: 0.23


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.14
ROUGE1_precision: 0.41
ROUGE1_recall: 0.29
ROUGE1_F1: 0.34
ROUGE2_precision: 0.08
ROUGE2_recall: 0.05
ROUGE2_F1: 0.06
ROUGEL_precision: 0.31
ROUGEL_recall: 0.21
ROUGEL_F1: 0.25
BERT_precision: 0.35
BERT_recall: 0.31
BERT_F1: 0.33
Score Summary:
BLEU: 0.06
METEOR: 0.21
ROUGE1_precision: 0.47
ROUGE1_recall: 0.31
ROUGE1_F1: 0.38
ROUGE2_precision: 0.08
ROUGE2_recall: 0.05
ROUGE2_F1: 0.06
ROUGEL_precision: 0.32
ROUGEL_recall: 0.21
ROUGEL_F1: 0.25
BERT_precision: 0.41
BERT_recall: 0.29
BERT_F1: 0.34
Score Summary:
BLEU: 0.28
METEOR: 0.68
ROUGE1_precision: 0.41
ROUGE1_recall: 0.79
ROUGE1_F1: 0.54
ROUGE2_precision: 0.33
ROUGE2_recall: 0.67
ROUGE2_F1: 0.44
ROUGEL_precision: 0.35
ROUGEL_recall: 0.68
ROUGEL_F1: 0.46
BERT_precision: 0.49
BERT_recall: 0.75
BERT_F1: 0.61


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.38
ROUGE1_precision: 0.38
ROUGE1_recall: 0.44
ROUGE1_F1: 0.41
ROUGE2_precision: 0.08
ROUGE2_recall: 0.09
ROUGE2_F1: 0.08
ROUGEL_precision: 0.23
ROUGEL_recall: 0.26
ROUGEL_F1: 0.24
BERT_precision: 0.3
BERT_recall: 0.34
BERT_F1: 0.32


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.24
ROUGE1_precision: 0.3
ROUGE1_recall: 0.38
ROUGE1_F1: 0.33
ROUGE2_precision: 0.04
ROUGE2_recall: 0.05
ROUGE2_F1: 0.04
ROUGEL_precision: 0.21
ROUGEL_recall: 0.27
ROUGEL_F1: 0.24
BERT_precision: 0.32
BERT_recall: 0.33
BERT_F1: 0.32


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.22
ROUGE1_precision: 0.35
ROUGE1_recall: 0.39
ROUGE1_F1: 0.37
ROUGE2_precision: 0.08
ROUGE2_recall: 0.09
ROUGE2_F1: 0.09
ROUGEL_precision: 0.18
ROUGEL_recall: 0.2
ROUGEL_F1: 0.19
BERT_precision: 0.4
BERT_recall: 0.45
BERT_F1: 0.43
Score Summary:
BLEU: 0.04
METEOR: 0.16
ROUGE1_precision: 0.5
ROUGE1_recall: 0.25
ROUGE1_F1: 0.33
ROUGE2_precision: 0.1
ROUGE2_recall: 0.05
ROUGE2_F1: 0.07
ROUGEL_precision: 0.33
ROUGEL_recall: 0.17
ROUGEL_F1: 0.22
BERT_precision: 0.39
BERT_recall: 0.33
BERT_F1: 0.36
Score Summary:
BLEU: 0.14
METEOR: 0.5
ROUGE1_precision: 0.35
ROUGE1_recall: 0.75
ROUGE1_F1: 0.48
ROUGE2_precision: 0.2
ROUGE2_recall: 0.43
ROUGE2_F1: 0.27
ROUGEL_precision: 0.25
ROUGEL_recall: 0.54
ROUGEL_F1: 0.35
BERT_precision: 0.42
BERT_recall: 0.56
BERT_F1: 0.48
Score Summary:
BLEU: 0.05
METEOR: 0.27
ROUGE1_precision: 0.31
ROUGE1_recall: 0.37
ROUGE1_F1: 0.34
ROUGE2_precision: 0.09
ROUGE2_recall: 0.1
ROUGE2_F1: 0.1
ROUGEL_precision: 0.14
ROUGEL_recall: 0.16
RO

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.11
ROUGE1_precision: 0.33
ROUGE1_recall: 0.21
ROUGE1_F1: 0.26
ROUGE2_precision: 0.03
ROUGE2_recall: 0.02
ROUGE2_F1: 0.02
ROUGEL_precision: 0.22
ROUGEL_recall: 0.14
ROUGEL_F1: 0.17
BERT_precision: 0.31
BERT_recall: 0.24
BERT_F1: 0.28
Score Summary:
BLEU: 0.04
METEOR: 0.22
ROUGE1_precision: 0.62
ROUGE1_recall: 0.31
ROUGE1_F1: 0.41
ROUGE2_precision: 0.11
ROUGE2_recall: 0.05
ROUGE2_F1: 0.07
ROUGEL_precision: 0.41
ROUGEL_recall: 0.21
ROUGEL_F1: 0.28
BERT_precision: 0.38
BERT_recall: 0.27
BERT_F1: 0.33
Score Summary:
BLEU: 0.28
METEOR: 0.68
ROUGE1_precision: 0.45
ROUGE1_recall: 0.74
ROUGE1_F1: 0.56
ROUGE2_precision: 0.37
ROUGE2_recall: 0.61
ROUGE2_F1: 0.46
ROUGEL_precision: 0.29
ROUGEL_recall: 0.47
ROUGEL_F1: 0.36
BERT_precision: 0.3
BERT_recall: 0.6
BERT_F1: 0.43


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.19
ROUGE1_precision: 0.27
ROUGE1_recall: 0.26
ROUGE1_F1: 0.27
ROUGE2_precision: 0.03
ROUGE2_recall: 0.03
ROUGE2_F1: 0.03
ROUGEL_precision: 0.15
ROUGEL_recall: 0.15
ROUGEL_F1: 0.15
BERT_precision: 0.32
BERT_recall: 0.31
BERT_F1: 0.32


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.14
ROUGE1_precision: 0.27
ROUGE1_recall: 0.24
ROUGE1_F1: 0.26
ROUGE2_precision: 0.0
ROUGE2_recall: 0.0
ROUGE2_F1: 0.0
ROUGEL_precision: 0.12
ROUGEL_recall: 0.11
ROUGEL_F1: 0.12
BERT_precision: 0.23
BERT_recall: 0.21
BERT_F1: 0.22


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.21
ROUGE1_precision: 0.39
ROUGE1_recall: 0.27
ROUGE1_F1: 0.32
ROUGE2_precision: 0.1
ROUGE2_recall: 0.07
ROUGE2_F1: 0.08
ROUGEL_precision: 0.26
ROUGEL_recall: 0.18
ROUGEL_F1: 0.21
BERT_precision: 0.42
BERT_recall: 0.36
BERT_F1: 0.39


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.14
ROUGE1_precision: 0.44
ROUGE1_recall: 0.23
ROUGE1_F1: 0.3
ROUGE2_precision: 0.03
ROUGE2_recall: 0.02
ROUGE2_F1: 0.02
ROUGEL_precision: 0.22
ROUGEL_recall: 0.12
ROUGEL_F1: 0.15
BERT_precision: 0.34
BERT_recall: 0.32
BERT_F1: 0.33
Score Summary:
BLEU: 0.17
METEOR: 0.45
ROUGE1_precision: 0.38
ROUGE1_recall: 0.62
ROUGE1_F1: 0.47
ROUGE2_precision: 0.21
ROUGE2_recall: 0.35
ROUGE2_F1: 0.26
ROUGEL_precision: 0.28
ROUGEL_recall: 0.46
ROUGEL_F1: 0.34
BERT_precision: 0.44
BERT_recall: 0.54
BERT_F1: 0.49


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Score Summary:
BLEU: 0.0
METEOR: 0.19
ROUGE1_precision: 0.42
ROUGE1_recall: 0.22
ROUGE1_F1: 0.29
ROUGE2_precision: 0.08
ROUGE2_recall: 0.04
ROUGE2_F1: 0.05
ROUGEL_precision: 0.19
ROUGEL_recall: 0.1
ROUGEL_F1: 0.13
BERT_precision: 0.34
BERT_recall: 0.18
BERT_F1: 0.25


In [None]:
# Write each score table to its own CSV file in the working directory
for csv_name, score_frame in {
    "scores_JP_BART.csv": df_JP_BART,
    "scores_JP_BART_ft.csv": df_JP_BART_ft,
    "scores_ES_BART.csv": df_ES_BART,
    "scores_ES_BART_ft.csv": df_ES_BART_ft,
}.items():
    score_frame.to_csv(csv_name)
