In [1]:
import pandas as pd
import numpy as np

In [2]:
# Install SentencePiece for subword tokenization and text normalization in NLP tasks.
!pip install sentencepiece

# Install rouge-score for evaluating the quality of text summarization and generation using the ROUGE metric.
!pip install rouge-score


# Upgrade or install Accelerate for optimizing numerical computations, leveraging hardware acceleration techniques.
!pip install accelerate -U

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=27799f9f31e49214758feddaa4de0625ab1aa183654dc041d2dfffdf1c16602b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting 

In [4]:
df_processed = pd.read_csv("/content/Preprocessed_summarized_Data.csv", encoding="utf-8", on_bad_lines="skip")


In [5]:
# Drop nulls
df_processed = df_processed.dropna()

# Reset the index after dropping null records
df_processed = df_processed.reset_index(drop=True)

In [6]:
# check if there are nulls
df_processed.isna().sum()

title      0
content    0
summary    0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_output, test_output= train_test_split(
    df_processed['content'], df_processed['summary'], test_size=0.1)  # 10% for validation

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model_fine_tuned = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
def convert_to_list(data):
  return data.tolist()

# convert the pandas series to list
train_input_list = convert_to_list(train_input)
test_input_list = convert_to_list(test_input)
train_output_list = convert_to_list(train_output)
test_output_list = convert_to_list(test_output)

In [10]:
# Tokenize input and output
train_input_vector = tokenizer(train_input_list, truncation=True, padding='longest', max_length=1024, return_tensors="pt")
test_input_vector = tokenizer(test_input_list, truncation=True, padding='longest', max_length=1024, return_tensors="pt")
train_output_vector = tokenizer(train_output_list, truncation=True, padding='longest', max_length=256, return_tensors="pt")
test_output_vector = tokenizer(test_output_list, truncation=True, padding='longest', max_length=256, return_tensors="pt")

In [11]:
import torch

class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SummaryDataset(train_input_vector, train_output_vector)
test_dataset = SummaryDataset(test_input_vector, test_output_vector)

In [12]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=600,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)
device = torch.device("cuda")
model_fine_tuned.to(device)

# Define trainer
trainer = Trainer(
    model=model_fine_tuned,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


Step,Training Loss
10,8.4565
20,8.2395
30,8.2311
40,7.8809
50,7.7186
60,7.6222
70,7.1579
80,6.8517
90,6.4568
100,5.8843


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


TrainOutput(global_step=1200, training_loss=1.2093640654037396, metrics={'train_runtime': 79.1573, 'train_samples_per_second': 15.16, 'train_steps_per_second': 15.16, 'total_flos': 324820323532800.0, 'train_loss': 1.2093640654037396, 'epoch': 600.0})

In [13]:
results = trainer.evaluate()
print(results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx]).clone().detach()


{'eval_loss': 1.7069361209869385, 'eval_runtime': 0.04, 'eval_samples_per_second': 49.959, 'eval_steps_per_second': 49.959, 'epoch': 600.0}


In [14]:
# import pickle
# import torch
# torch.save(model.state_dict(), '/content/drive/MyDrive/Project Data/T5/model_fine_tuned.pth')

In [15]:
def generate_summaries(model, tokenizer, texts, device):
    model.to(device)
    summaries = []

    for text in texts:
        encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
        # encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

        # generated_ids = model.generate(encoded_input['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
        generated_ids = model.generate(encoded_input['input_ids'], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
        summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries

In [16]:
reference_summaries = [tokenizer.decode(labels, skip_special_tokens=True) for labels in test_output_vector['input_ids']]
generated_summaries = generate_summaries(model_fine_tuned, tokenizer, test_input_list, device)

In [17]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, gen) for ref, gen in zip(reference_summaries, generated_summaries)]

In [18]:
average_scores = {
    'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
    'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
    'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
}

print("Average ROUGE Scores:", average_scores)

Average ROUGE Scores: {'rouge1': 0.5041149264270005, 'rouge2': 0.382714352271274, 'rougeL': 0.41582776239999203}


output_dir='./results',
    num_train_epochs=300,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10


Average ROUGE Scores: {'rouge1': 0.3447868437943897, 'rouge2': 0.10205764136222227, 'rougeL': 0.20290599148283378}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=450,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

Average ROUGE Scores: {'rouge1': 0.3414091880435084, 'rouge2': 0.09026412976736842, 'rougeL': 0.19948729516211314}

 output_dir='./results',
    num_train_epochs=600,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
    
Average ROUGE Scores: {'rouge1': 0.348289331529748, 'rouge2': 0.10268646350536016, 'rougeL': 0.20383393472217354}

In [19]:
texts_to_summarize = "Research in machine translation (MT) depends heavily on the evaluation of its results. Especially for the development of an MT system, an evaluation measure is needed which reliably assesses the quality of MT output. Such a measure will help analyze the strengths and weaknesses of different translation systems or different versions of the same system by comparing output at the sentence level. In most applications of MT, understandability for humans in terms of readability as well as semantical correctness should be the evaluation criterion. But as human evaluation is tedious and cost-intensive, automatic evaluation measures are used in most MT research tasks. A high correlation between these automatic evaluation measures and human evaluation is thus desirable. State-of-the-art measures such as BLEU (Papineni et al., 2002) or NIST (Doddington, 2002) aim at measuring the translation quality rather on the document level1 than on the level of single sentences. They are thus not well-suited for sentence-level evaluation. The introduction of smoothing (Lin and Och, 2004) solves this problem only partially. In this paper, we will present a new automatic error measure for MT ‚Äì the CDER ‚Äì which is designed for assessing MT quality on the sentence level. It is based on edit distance ‚Äì such as the well-known word error rate (WER) ‚Äì but allows for reordering of blocks. Nevertheless, by defining reordering costs, the ordering of the words in a sentence is still relevant for the measure. In this, the new measure differs significantly from the position independent error rate (PER) by (Tillmann et al., 1997). Generally, finding an optimal solution for such a reordering problem is NP hard, as is shown in (Lopresti and Tomkins, 1997). In previous work, researchers have tried to reduce the complexity, for example by restricting the possible permutations on the block-level, or by approximation or heuristics during the calculation. Nevertheless, most of the resulting algorithms still have high run times and are hardly applied in practice, or give only a rough approximation. An overview of some better-known measures can be found in Section 3.1. In contrast to this, our new measure can be calculated very efficiently. This is achieved by requiring complete and disjoint coverage of the blocks only for the reference sentence, and not for the candidate translation. We will present an algorithm which computes the new error measure in quadratic time. The new evaluation measure will be investigated and compared to state-of-the-art methods on two translation tasks. The correlation with human assessment will be measured for several different statistical MT systems. We will see that the new measure significantly outperforms the existing approaches. As a further improvement, we will introduce word dependent substitution costs. This method will be applicable to the new measure as well as to established measures like WER and PER. Starting from the observation that the substitution of a word with a similar one is likely to affect translation quality less than the substitution with a completely different word, we will show how the similarity of words can be accounted for in automatic evaluation measures. This paper is organized as follows: In Section 2, we will present the state of the art in MT evaluation and discuss the problem of block reordering. Section 3 will introduce the new error measure CDER and will show how it can be calculated efficiently. The concept of worddependent substitution costs will be explained in Section 4. In Section 5, experimental results on the correlation of human judgment with the CDER and other well-known evaluation measures will be presented. Section 6 will conclude the paper and give an outlook on possible future work. In MT ‚Äì as opposed to other natural language processing tasks like speech recognition ‚Äì there is usually more than one correct outcome of a task. In many cases, alternative translations of a sentence differ from each other mostly by the ordering of blocks of words. Consequently, an evaluation measure for MT should be able to detect and allow for block reordering. Nevertheless, a higher ‚Äúamount‚Äù of reordering between a candidate translation and a reference translation should still be reflected in a worse evaluation score. In other words, the more blocks there are to be reordered between reference and candidate sentence, the higher we want the measure to evaluate the distance between these sentences. State-of-the-art evaluation measures for MT penalize movement of blocks rather severely: ngram based scores such as BLEU or NIST still yield a high unigram precision if blocks are reordered. For higher-order n-grams, though, the precision drops. As a consequence, this affects the overall score significantly. WER, which is based on Levenshtein distance, penalizes the reordering of blocks even more heavily. It measures the distance by substitution, deletion and insertion operations for each word in a relocated block. PER, on the other hand, ignores the ordering of the words in the sentences completely. This often leads to an overly optimistic assessment of translation quality. The approach we pursue in this paper is to extend the Levenshtein distance by an additional operation, namely block movement. The number of blocks in a sentence is equal to the number of gaps among the blocks plus one. Thus, the block movements can equivalently be expressed as long jump operations that jump over the gaps between two blocks. The costs of a long jump are constant. The blocks are read in the order of one of the sentences. These long jumps are combined with the ‚Äúclassical‚Äù Levenshtein edit operations, namely insertion, deletion, substitution, and the zero-cost operation identity. The resulting long jump distance dLJ gives the minimum number of operations which are necessary to transform the candidate sentence into the reference sentence. Like the Levenshtein distance, the long jump distance can be depicted using an alignment grid as shown in Figure 1: Here, each grid point corresponds to a pair of inter-word positions in candidate and reference sentence, respectively. dLJ is the minimum cost of a path between the lower left (first) and the upper right (last) alignment grid point which covers all reference and candidate words. Deletions and insertions correspond to horizontal and vertical edges, respectively. Substitutions and identity operations correspond to diagonal edges. Edges between arbitrary grid points from the same row correspond to long jump operations. It is easy to see that dLJ is symmetrical. In the example, the best path contains one deletion edge, one substitution edge, and three long jump edges. Therefore, the long jump distance between the sentences is five. In contrast, the best Levenshtein path contains one deletion edge, four identity and five consecutive substitution edges; the Levenshtein distance between the two sentences is six. The effect of reordering on the BLEU measure is even higher in this example: Whereas 8 of the 10 unigrams from the candidate sentence can be found in the reference sentence, this holds for only 4 bigrams, and 1 trigram. Not a single one of the 7 candidate four-grams occurs in the reference sentence. (Lopresti and Tomkins, 1997) showed that finding an optimal path in a long jump alignment grid is an NP-hard problem. Our experiments showed that the calculation of exact long jump distances becomes impractical for sentences longer than 20 words. A possible way to achieve polynomial runtime is to restrict the number of admissible block permutations. This has been implemented by (Leusch et al., 2003) in the inversion word error rate. Alternatively, a heuristic or approximative distance can be calculated, as in GTM by (Turian et al., 2003). An implementation of both approaches at the same time can be found in TER by (Snover et al., 2005). In this paper, we will present another approach which has a suitable run-time, while still maintaining completeness of the calculated measure. The idea of the proposed method is to drop some restrictions on the alignment path. The long jump distance as well as the Levenshtein distance require both reference and candidate translation to be covered completely and disjointly. When extending the metric by block movements, we drop this constraint for the candidate translation. That is, only the words in the reference sentence have to be covered exactly once, whereas those in the candidate sentence can be covered zero, one, or multiple times. Dropping the constraints makes an efficient computation of the distance possible. We drop the constraints for the candidate sentence and not for the reference sentence because we do not want any information contained in the reference to be omitted. Moreover, the reference translation will not contain unnecessary repetitions of blocks. The new measure ‚Äì which will be called CDER in the following ‚Äì can thus be seen as a measure oriented towards recall, while measures like BLEU are guided by precision. The CDER is based on the CDCD distance2 introduced in (Lopresti and Tomkins, 1997). The authors show there that the problem of finding the optimal solution can be solved in O(I2 ¬∑ L) time, where I is the length of the candidate sentence and L the length of the reference sentence. Within this paper, we will refer to this distance as dCD . In the next subsection, we will show how it can be computed in O(I ¬∑ L) time using a modification of the Levenshtein algorithm. We also studied the reverse direction of the described measure; that is, we dropped the coverage constraints for the reference sentence instead of the candidate sentence. Additionally, the maximum of both directions has been considered as distance measure. The results in Section 5.2 will show that the measure using the originally proposed direction has a significantly higher correlation with human evaluation \n than the other directions. Our algorithm for calculating dCD is based on the dynamic programming algorithm for the Levenshtein distance (Levenshtein, 1966). The Levenshtein distance dLev(eI1, ÀúeL ÔøΩ between two strings eI1 and ÀúeL1 can be calculated in constant time if the Levenshtein distances of the substrings, dLev(eI‚àí1 is stored in an I x L table. This auxiliary quantity can then be calculated recursively from DLev(i ‚àí 1, l), DLev(i, l ‚àí 1), and DLev(i ‚àí 1, l ‚àí 1). Consequently, the Levenshtein distance can be calculated in time O(I ¬∑ L). This algorithm can easily be extended for the calculation of dCD as follows: Again we define an auxiliary quantity D(i, l) as Insertions, deletions, and substitutions are handled the same way as in the Levenshtein algorithm. Now assume that an optimal dCD path has been found: Then, each long jump edge within 2C stands for cover and D for disjoint. We adopted this notion for our measures. this path will always start at a node with the lowest D value in its row3. Consequently, we use the following modification of the Levenshtein recursion: where Œ¥ is the Kronecker delta. Figure 2 shows the possible predecessors of a grid point. The calculation of D(i, l) requires all values of D(i', l) to be known, even for i' > i. Thus, the calculation takes three steps for each l: i0 There is always an optimal dCD alignment path that does not contain any deletion edges, because each deletion can be replaced by a long jump, at the same costs. This is different for a dLJ path, because here each candidate word must be covered exactly once. Assume now that the candidate sentence consists of I words and the reference sentence consists of L words, with I > L. Then, at most L candidate words can be covered by substitution or identity edges. Therefore, the remaining candidate words (at least I ‚àí L) must be covered by deletion edges. This means that at least I ‚àíL deletion edges will be found in any dLJ path, which leads to dLJ ‚àí dCD ‚â• I ‚àí L in this case. Consequently, the length difference between the two sentences gives us a useful miscoverage penalty lplen: This penalty is independent of the dCD alignment path. Thus, an optimal dCD alignment path is optimal for dCD + lplen as well. Therefore the search algorithm in Section 3.2 will find the optimum for this sum. Absolute Miscoverage Let coverage(i) be the number of substitution, identity, and deletion edges that cover a candidate word ei in a dCD path. If we had a complete and disjoint alignment for the candidate word (i.e., a dLJ path), coverage(i) would be 1 for each i. In general this is not the case. We can use the absolute miscoverage as a penalty lpmisc for dCD: Each of these steps can be done in time O(I). Therefore, this algorithm calculates dCD in time O(I ¬∑ L) and space O(I). As the CDER does not penalize candidate translations which are too long, we studied the use of a length penalty or miscoverage penalty. This determines the difference in sentence lengths between candidate and reference. Two definitions of such a penalty have been studied for this work. This miscoverage penalty is not independent of the alignment path. Consequently, the proposed search algorithm will not necessarily find an optimal solution for the sum of dCD and lpmisc. The idea behind the absolute miscoverage is that one can construct a valid ‚Äì but not necessarily optimal ‚Äì dLJ path from a given dCD path. This procedure is illustrated in Figure 3 and takes place in two steps: 1. For each block of over-covered candidate words, replace the aligned substitution and/or identity edges by insertion edges; move the long jump at the beginning of the block accordingly. 2. For each block of under-covered candidate words, add the corresponding number of deletion edges; move the long jump at the beginning of the block accordingly. This also shows that there cannot be4 a polynomial time algorithm that calculates the minimum of dCD + lpmisc for arbitrary pairs of sentences, because this minimum is equal to dLJ. With these miscoverage penalties, inexpensive lower and upper bounds for dLJ can be calculated, because the following inequality holds: All automatic error measures which are based on the edit distance (i.e. WER, PER, and CDER) apply fixed costs for the substitution of words. However, this is counter-intuitive, as replacing a word with another one which has a similar meaning will rarely change the meaning of a sentence significantly. On the other hand, replacing the same word with a completely different one probably will. Therefore, it seems advisable to make substitution costs dependent on the semantical and/or syntactical dissimilarity of the words. To avoid awkward case distinctions, we assume that a substitution cost function cSUB for two words e, eÀú meets the following requirements: 3. The costs of substituting a word e by eÀú are always equal or lower than those of deleting e and then inserting Àúe. In short, cSUB ‚â§ 2. Under these conditions the algorithms for WER and CDER can easily be modified to use word-dependent substitution costs. For example, the only necessary modification in the CDER algorithm in Equation 1 is to replace 1 ‚àí Œ¥(e, Àúe) by cSUB(e, Àúe). For the PER, it is no longer possible to use a linear time algorithm in the general case. Instead, a modification of the Hungarian algorithm (Knuth, 1993) can be used. The question is now how to define the worddependent substitution costs. We have studied two different approaches. A pragmatic approach is to compare the spelling of the words to be substituted with each other. The more similar the spelling is, the more similar we consider the words to be, and the lower we want the substitution costs between them. In English, this works well with similar tenses of the same verb, or with genitives or plurals of the same noun. Nevertheless, a similar spelling is no guarantee for a similar meaning, because prefixes such as ‚Äúmis-‚Äù, ‚Äúin-‚Äù, or ‚Äúun-‚Äù can change the meaning of a word significantly. An obvious way of comparing the spelling is the Levenshtein distance. Here, words are compared on character level. To normalize this distance into a range from 0 (for identical words) to 1 (for completely different words), we divide the absolute distance by the length of the Levenshtein alignment path. Another character-based substitution cost function we studied is based on the common prefix length of both words. In English, different tenses of the same verb share the same prefix; which is usually the stem. The same holds for different cases, numbers and genders of most nouns and adjectives. However, it does not hold if verb prefixes are changed or removed. On the other hand, the common prefix length is sensitive to critical prefixes such as ‚Äúmis-‚Äù for the same reason. Consequently, the common prefix length, normalized by the average length of both words, gives a reasonable measure for the similarity of two words. To transform the normalized common prefix length into costs, this fraction is then subtracted from 1. More sophisticated methods could be considered for word-dependent substitution costs as well. Examples of such methods are the introduction of information weights as in the NIST measure or the comparison of stems or synonyms, as in METEOR (Banerjee and Lavie, 2005). The different evaluation measures were assessed experimentally on data from the Chinese‚ÄìEnglish and the Arabic‚ÄìEnglish task of the NIST 2004 evaluation workshop (Przybocki, 2004). In this evaluation campaign, 4460 and 1735 candidate translations, respectively, generated by different research MT systems were evaluated by human judges with regard to fluency and adequacy. Four reference translations are provided for each candidate translation. Detailed corpus statistics are listed in Table 2. For the experiments in this study, the candidate translations from these tasks were evaluated using different automatic evaluation measures. Pearson‚Äôs correlation coefficient r between automatic evaluation and the sum of fluency and adequacy was calculated. As it could be arguable whether Pearson‚Äôs r is meaningful for categorical data like human MT evaluation, we have also calculated Kendall‚Äôs correlation coefficient T. Because of the high number of samples (= sentences, 4460) versus the low number of categories (= outcomes of adequacy+fluency, 9), we calculated T separately for each source sentence. These experiments showed that Kendall‚Äôs T reflects the same tendencies as Pearson‚Äôs r regarding the ranking of the evaluation measures. But only the latter allows for an efficient calculation of confidence intervals. Consequently, figures of T are omitted in this paper. Due to the small number of samples for evaluation on system level (10 and 5, respectively), all correlation coefficients between automatic and human evaluation on system level are very close to 1. Therefore, they do not show any significant differences for the different evaluation measures. Additional experiments on data from the NIST 2002 and 2003 workshops and from the IWSLT 2004 evaluation workshop confirm the findings from the NIST 2004 experiments; for the sake of clarity they are not included here. All correlation coefficients presented here were calculated for sentence level evaluation. For comparison with state-of-the-art evaluation measures, we have also calculated the correlation between human evaluation and WER and BLEU, which were both measures of choice in several international MT evaluation campaigns. Furthermore, we included TER (Snover et al., 2005) as a recent heuristic block movement measure in some of our experiments for comparison with our measure. As the BLEU score is unsuitable for sentence level evaluation in its original definition, BLEU-S smoothing as described by (Lin and Och, 2004) is performed. Additionally, we added sentence boundary symbols for BLEU, and a different reference length calculation scheme for \n TER, because these changes improved the correlation between human evaluation and the two automatic measures. Details on this have been described in (Leusch et al., 2005). Table 3 presents the correlation of BLEU, WER, and CDER with human assessment. It can be seen that CDER shows better correlation than BLEU and WER on both corpora. On the Chinese‚ÄìEnglish task, the smoothed BLEU score has a higher sentence-level correlation than WER. However, this is not the case for the Arabic‚Äì English task. So none of these two measures is superior to the other one, but they are both outperformed by CDER. If the direction of CDER is reversed (i.e, the CD constraints are required for the candidate instead of the reference, such that the measure has precision instead of recall characteristics), the correlation with human evaluation is much lower. Additionally we studied the use of the maximum of the distances in both directions. This has a lower correlation than taking the original CDER, as Table 3 shows. Nevertheless, the maximum still performs slightly better than BLEU and WER. The problem of how to avoid a preference of overly long candidate sentences by CDER remains unsolved, as can be found in Table 4: Each of the proposed penalties infers a significant decrease of correlation between the (extended) CDER and human evaluation. Future research will aim at finding a suitable length penalty. Especially if CDER is applied in system development, such a penalty will be needed, as preliminary optimization experiments have shown. WER: the correlation with human judgment is increased by about 2% absolute on both language pairs. The Levenshtein-based substitution costs are better suited for WER than the scheme based on common prefix length. For CDER, there is hardly any difference between the two methods. Experiments on five more corpora did not give any significant evidence which of the two substitution costs correlates better with human evaluation. But as the prefix-based substitution costs improved correlation more consistently across all corpora, we employed this method in our next experiment. An interesting topic in MT evaluation research is the question whether a linear combination of two MT evaluation measures can improve the correlation between automatic and human evaluation. Particularly, we expected the combination of CDER and PER to have a significantly higher correlation with human evaluation than the measures alone. CDER (as opposed to PER) has the ability to reward correct local ordering, whereas PER (as opposed to CDER) penalizes overly long candidate sentences. The two measures were combined with linear interpolation. In order to determine the weights, we performed data analysis on seven different corpora. The result was consistent across all different data collections and language pairs: a linear combination of about 60% CDER and 40% PER has a significantly higher correlation with human evaluation than each of the measures alone. For the two corpora studied here, the results of the combination can be found in Table 6: On the Chinese‚ÄìEnglish task, there is an additional gain of more than 1% absolute in correlation over CDER alone. The combined error measure is the best method in both cases. The last line in Table 6 shows the 95%confidence interval for the correlation. We see that the new measure CDER, combined with PER, has a significantly higher correlation with human evaluation than the existing measures BLEU, TER, and WER on both corpora. We presented CDER, a new automatic evaluation measure for MT, which is based on edit distance extended by block movements. CDER allows for reordering blocks of words at constant cost. Unlike previous block movement measures, CDER can be exactly calculated in quadratic time. Experimental evaluation on two different translation tasks shows a significantly improved correlation with human judgment in comparison with state-of-the-art measures such as BLEU. Additionally, we showed how word-dependent substitution costs can be applied to enhance the new error measure as well as existing approaches. The highest correlation with human assessment was achieved through linear interpolation of the new CDER with PER. Future work will aim at finding a suitable length penalty for CDER. In addition, more sophisticated definitions of the word-dependent substitution costs will be investigated. Furthermore, it will be interesting to see how this new error measure affects system development: We expect it to allow for a better sentence-wise error analysis. For system optimization, preliminary experiments have shown the need for a suitable length penalty. This material is partly based upon work supported by the Defense Advanced Research Projects Agency (DARPA) under Contract No. HR001106-C-0023, and was partly funded by the European Union under the integrated project TC-STAR ‚Äì Technology and Corpora for Speech to Speech Translation"


In [20]:
inputs = tokenizer(texts_to_summarize, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
# Move inputs to the same device as the model
inputs = inputs.to(model_fine_tuned.device)

# Generate summaries
summary_ids = model_fine_tuned.generate(inputs['input_ids'], max_length=256, num_beams=4, early_stopping=True)

# Decode generated summaries back to text
summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]


for i, summary in enumerate(summaries):
    print(f"Summary {i+1}:\n{summary}\n")

Summary 1:
We will present a new automatic error measure for MT  the CDER  which is designed for assessing MT quality on the sentence level. It is based on edit distance  such as the well-known word error rate (WER)  but allows for reordering of blocks. In addition, by defining reordering costs, the ordering of the words in a sentence is still relevant for the measure. In this paper, we will present the state of the art in MT evaluation and discuss



In [20]:
We will present a new automatic error measure for MT  the CDER  which is designed for assessing MT quality on the sentence level.
It is based on edit distance  such as the well-known word error rate (WER)  but allows for reordering of blocks.
In addition, by defining reordering costs, the ordering of the words in a sentence is still relevant for the measure.
In this paper, we will present the state of the art in MT evaluation and discuss
