
# Flash Cards



*   Muhammed Görkem KOLA - 2200765032 
*   İlkim İclal Aydoğan - 21992814


## Reading Data

In [None]:
!pip install transformers
!pip install datasets
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import the 'wandb' module and log in to the W&B server
import wandb
wandb.init(project="flash-cards", entity="colomar")

# Set the WANDB_PROJECT environment variable to 'flash-cards'
%env WANDB_PROJECT=flash-cards

env: WANDB_PROJECT=flash-cards


In [None]:
# Import the 'torch', 'datasets', and 'transformers' modules
import torch
from datasets import load_dataset, load_metric, list_metrics, Dataset, concatenate_datasets, Sequence, Features, Value
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollator, T5ForConditionalGeneration, T5TokenizerFast, TrainingArguments, Trainer
import json

# Import the 'Dict', 'List', and 'Optional' types from the 'typing' module
from typing import Dict, List, Optional

# Import the 'dataclasses' and 'field' modules
import dataclasses
from dataclasses import dataclass, field

# Import the 'logging', 'os', and 'sys' modules
import logging
import os
import sys

# Import the 'numpy', 'torch', and 'wandb' modules
import numpy as np
import torch
import wandb

# Import the 'notebook_login' function from the 'huggingface_hub' module
from huggingface_hub import notebook_login

# Import the 'drive' function from the 'google.cplab' package
from google.colab import drive

In [None]:
def read_data(path: str):
    """
    Read data in the SQuAD v2 format from a file and create a dataset object from it.
    Args:
    - path: the path to the file containing the data
    Returns:
    - A dataset object containing the data
    """
    # Check if the file exists and is readable
    if not os.path.isfile(path) or not os.access(path, os.R_OK):
        raise FileNotFoundError(f"File '{path}' not found or not readable")
    
    # Open the file and read the data as a JSON object
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
        data = data["data"]
        
        # Initialize an empty list to store the dictionaries representing the examples
        dicts = []
        
        # Iterate over the data points in the JSON object
        for dt in data:
            # Extract the 'paragraphs' and 'title' fields from the current data point
            paragraphs, title = dt.values()
            
            # Iterate over the paragraphs in the current data point
            for x in paragraphs[:5]:
                # Extract the 'context' and 'qas' fields from the current paragraph
                context, qas = x.values()
                
                # Iterate over the questions in the current paragraph, up to a maximum of 10
                for aiq in qas[:5]:
                    # Extract the 'answers', '_id', and 'question' fields from the current question
                    answers, _id, question = aiq.values()
                    
                    # Initialize a dictionary for storing the answers
                    ans = {
                        "answer_start":[],
                        "text":[],
                    }
                    
                    # Iterate over the answers and append the text and starting position of each answer to the corresponding lists in 'ans'
                    for a in answers:
                        ans["text"].append(a["text"])
                        ans["answer_start"].append(a["answer_start"])
                    
                    # Create a dictionary representing the current example and append it to the list of dictionaries
                    dicts.append({
                        "id":str(_id),
                        "title":title,
                        "context":context,
                        "question":question,
                        "answers": ans
                    })
        
        # Create a dataset object from the list of dictionaries using the 'from_list' method
        data = Dataset.from_list(dicts)
        return data

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the current working directory to '/content/drive/MyDrive/dataset'
%cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Read the training data from the 'train.json' file and create a dataset object from it
Htrain_data = read_data("train.json")

# Read the validation data from the 'dev.json' file and create a dataset object from it
Hvalid_data = read_data("dev.json")

In [None]:
# Harvesting Train and Validation datasets
Htrain_data, Hvalid_data

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 128619
 }), Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 1666
 }))

In [None]:
# Concatenate the 'Strain_data' and 'Htrain_data' datasets and assign the resulting dataset to the 'train_dataset' variable
train_dataset = concatenate_datasets([Htrain_data])

# Concatenate the 'Svalid_data' and 'Hvalid_data' datasets and assign the resulting dataset to the 'valid_dataset' variable
valid_dataset = concatenate_datasets([Hvalid_data])

In [None]:
# Train and Validation datasets
train_dataset, valid_dataset

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 128619
 }), Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 1666
 }))

### TOKENIZE DATA

In [None]:
# Create a T5 model using the 't5-small' pre-trained model and assign it to the 'model' variable
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Create a T5 tokenizer using the 't5-small' pre-trained model and assign it to the 'tokenizer' variable
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Set the value of the 'sep_token' attribute of the tokenizer to '<sep>'
tokenizer.sep_token = '<sep>'

# Add the '<sep>' token to the vocabulary of the tokenizer
tokenizer.add_tokens(['<sep>'])

# Resize the token embeddings of the model to match the size of the vocabulary of the tokenizer
model.resize_token_embeddings(len(tokenizer))

Embedding(32101, 768)

In [None]:
def convert_to_features(example_batch):
    """
    Encode a batch of examples as input IDs and attention masks using a tokenizer.
    Args:
    - example_batch: a batch of examples, where each example is a dictionary with 'context' and 'question' fields
    Returns:
    - A dictionary with input IDs, attention masks, labels, and decoder attention masks
    """
    # Encode the 'context' field of each example in the batch using the tokenizer
    input_encodings = tokenizer.batch_encode_plus(example_batch['context'],
                                                  max_length=512,
                                                  add_special_tokens=True,
                                                  truncation=True,
                                                  pad_to_max_length=True)
    
    # Encode the 'question' field of each example in the batch using the tokenizer
    target_encodings = tokenizer.batch_encode_plus(example_batch['question'],
                                                   max_length=64,
                                                   add_special_tokens=True,
                                                   truncation=True, pad_to_max_length=True)

    # Combine the input IDs and attention masks into a dictionary
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
    """
    Append the end-of-sequence token to the 'context' and 'question' fields of an example.
    Args:
    - example: an example, represented as a dictionary with 'context' and 'question' fields
    Returns:
    - The modified example
    """
    # Append the end-of-sequence token to the 'context' and 'question' fields
    example['context'] = example['context'] + " </s>"
    example['question'] = example['question'] + " </s>"
    return example

def add_special_tokens(example):
    """
    Replace the "{sep_token}" placeholder in the 'question' field of an example with the special token '<sep>'.
    Args:
    - example: an example, represented as a dictionary with a 'question' field
    Returns:
    - The modified example
    """
    # Replace the "{sep_token}" placeholder with the special token '<sep>'
    example['question'] = example['question'].replace("{sep_token}", '<sep>')
    return example

In [None]:
# Apply the add_eos_examples function to each example in the train_dataset,
# modifying the examples by adding the "</s>" suffix to the "context" and "question" fields
train_dataset = train_dataset.map(add_eos_examples)

# Apply the add_special_tokens function to each example in the train_dataset,
# modifying the examples by replacing the "{sep_token}" placeholder in the "question" field with "<sep>"
train_dataset = train_dataset.map(add_special_tokens)

# Apply the convert_to_features function to each example in the train_dataset,
# converting the "context" and "question" fields of each example to tokenized tensors using the T5 tokenizer
# and returning a dictionary of input features for the T5 model
train_dataset = train_dataset.map(convert_to_features, batched=True)

  0%|          | 0/128619 [00:00<?, ?ex/s]

  0%|          | 0/128619 [00:00<?, ?ex/s]

  0%|          | 0/129 [00:00<?, ?ba/s]



In [None]:
# Apply the add_eos_examples function to each example in the valid_dataset,
# modifying the examples by adding the "</s>" suffix to the "context" and "question" fields
valid_dataset = valid_dataset.map(add_eos_examples)

# Apply the add_special_tokens function to each example in the valid_dataset,
# modifying the examples by replacing the "{sep_token}" placeholder in the "question" field with "<sep>"
valid_dataset = valid_dataset.map(add_special_tokens)

# Apply the convert_to_features function to each example in the valid_dataset,
# converting the "context" and "question" fields of each example to tokenized tensors using the T5 tokenizer
# and returning a dictionary of input features for the T5 model
valid_dataset = valid_dataset.map(convert_to_features, batched=True)

  0%|          | 0/1666 [00:00<?, ?ex/s]

  0%|          | 0/1666 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Define a list of column names to remove from the dataset
columns_removed = ['id', 'title', 'context', 'question', 'answers']

# Define a list of column names to keep in the dataset
columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

In [None]:
# Remove the specified columns from the 'train_dataset' dataset
train_dataset = train_dataset.remove_columns(columns_removed)

# Remove the specified columns from the 'valid_dataset' dataset
valid_dataset = valid_dataset.remove_columns(columns_removed)

### SAVE DATA

In [None]:
# Set the format of the 'train_dataset' dataset to 'torch'
train_dataset.set_format(type='torch')

# Set the format of the 'valid_dataset' dataset to 'torch'
valid_dataset.set_format(type='torch')

In [None]:
# Train and Validation datasets
train_dataset, valid_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
     num_rows: 128619
 }), Dataset({
     features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
     num_rows: 1666
 }))

In [None]:
# Change the current working directory to the root directory of the filesystem
%cd ../../../../..

/


In [None]:
# Save the 'train_dataset' dataset to a file called 'train_dataset.pt'
torch.save(train_dataset, 'train_dataset.pt')

# Save the 'valid_dataset' dataset to a file called 'valid_dataset.pt'
torch.save(valid_dataset, 'valid_dataset.pt')

In [None]:
# Log in to the Hub
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


## FINE TUNING

In [None]:
# A dataclass for formatting the data in a way that is expected by the model's forward method
@dataclass
class T2TDataCollator():
    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
        A dictionary of tensors
        """
        # Stack the 'input_ids' from each example in the batch
        input_ids = torch.stack([example['input_ids'] for example in batch])
        
        # Stack the 'labels' from each example in the batch, replacing any 0 values with -100
        lm_labels = torch.stack([example['labels'] for example in batch])
        lm_labels[lm_labels == 0] = -100
        
        # Stack the 'attention_mask' from each example in the batch
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        
        # Stack the 'decoder_attention_mask' from each example in the batch
        decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

        # Return a dictionary of the stacked tensors
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': lm_labels,
            'decoder_attention_mask': decoder_attention_mask
        }


In [None]:
# Specify the directory where the model and training artifacts will be saved
output_dir="./models"

# Specify the number of examples per device in a single training batch
per_device_train_batch_size=4

# Specify the number of examples per device in a single evaluation batch
per_device_eval_batch_size=4

# Specify the number of gradient accumulation steps, which determine the actual batch size
# The actual batch size is equal to per_device_train_batch_size * gradient_accumulation_steps * num_devices
gradient_accumulation_steps=16

# Specify the learning rate used in the training process
learning_rate=1e-4

# Specify the number of training epochs
# An epoch is a single pass through the entire training dataset
num_train_epochs=3

# Specify how often to log training progress
logging_steps=100

# Specify the name of the run
# This is useful for distinguishing different runs in tools like Weights & Biases
run_name="flash-cards"

# Specify the evaluation strategy
# Possible values include "epoch" (evaluate after each epoch) and "steps" (evaluate after a certain number of steps)
evaluation_strategy="steps"

# Specify how often to save the model
save_steps=500

# Specify a service to report training progress to
report_to="wandb"

# Specify whether to push the model to the model hub
push_to_hub=True

# Specify the model ID to use when pushing the model to the model hub
push_to_hub_model_id="flash-cards"

In [None]:
training_args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    learning_rate = learning_rate,
    num_train_epochs = num_train_epochs,
    logging_steps = logging_steps,
    run_name = run_name,
    evaluation_strategy = evaluation_strategy,
    save_steps = save_steps,
    report_to = report_to,
    push_to_hub = push_to_hub,
    push_to_hub_model_id = push_to_hub_model_id
  )

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices


In [None]:
# Initialize our Trainer
trainer = Trainer(
    # Specify the model to train
    model=model,

    # Specify the training arguments
    args=training_args,

    # Specify the training dataset
    train_dataset=train_dataset,

    # Specify the validation dataset
    eval_dataset=valid_dataset,

    # Specify the data collator
    data_collator=T2TDataCollator()
)

# Start training
trainer.train()

# When training is done, push the fine-tuned model to the model hub
trainer.push_to_hub("flash-cards")

# Finish the run in Weights & Biases
wandb.finish()

/content/drive/MyDrive/Colab Notebooks/./models is already a clone of https://huggingface.co/ilkimayd/flash-cards. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 128619
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 6027
  Number of trainable parameters = 222882816
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

## TESTING MODEL

In [None]:
# downloading the model
hfmodel = T5ForConditionalGeneration.from_pretrained("ilkimayd/flash-cards")

In [None]:
def question_generate(input_string, **generator_args):
  # Set default generator arguments
  default_generator_args = {
    "max_length": 256,  # maximum number of tokens to generate
    "num_beams": 4,  # number of beams to use when generating text
    "length_penalty": 1.5,  # length penalty to use when generating text
    "no_repeat_ngram_size": 3,  # size of n-grams that the model should not repeat
    "early_stopping": True,  # whether to stop generating text as soon as max length is reached
  }
  
  # Update default generator arguments with provided generator arguments
  generator_args = {**default_generator_args, **generator_args}
  
  # Modify input string to indicate that we want to generate questions
  input_string = "generate questions: " + input_string + " </s>"
  
  # Encode input string as input tokens
  input_ids = tokenizer.encode(input_string, return_tensors="pt",max_length=512,truncation=True)
  
  # Generate text based on input tokens and generator arguments
  res = hfmodel.generate(input_ids, **generator_args)
  
  # Decode and split generated text to obtain list of questions
  output = tokenizer.batch_decode(res, skip_special_tokens=True)
  output = [item.split("<sep>") for item in output]
  
  # Return list of questions as output
  return output

In [None]:
def run_model(dataset):
  # Generate questions for each data point in the dataset
  generated_questions = [question_generate(data["context"]) for data in dataset]
  
  # Return list of generated questions
  return generated_questions

In [None]:
# Read the test data from the 'test.json' file and create a dataset object from it
test_data = read_data("/content/drive/MyDrive/Colab Notebooks/test.json")

In [None]:
question_generate("The Turkish War of Independence (19 May 1919 – 24 July 1923) was a series of military campaigns waged by the Turkish National Movement after parts of the Ottoman Empire were occupied and partitioned following its defeat in World War I. ") 

[['when was the turkish war of independence?']]

In [None]:
# Generate questions for each data point in the test dataset
predictions = run_model(test_data)

# The predictions variable now contains a list of generated questions

In [None]:
predictions

[[['where is the library housed?']],
 [['where is the library housed?']],
 [['where is the library housed?']],
 [['where is the library housed?']],
 [['where is the library housed?']],
 [['how many languages are in the library of congress?']],
 [['how many languages are in the library of congress?']],
 [['how many languages are in the library of congress?']],
 [['when did the library of congress move to washington university?']],
 [['when did the library of congress move to washington university?']],
 [['when did the library of congress move to washington university?']],
 [['when did the library of congress move to washington university?']],
 [['when did the library of congress move to washington university?']],
 [['in what year did another fire struck the library?']],
 [['in what year did another fire struck the library?']],
 [['when did the construction of a separate library begin?']],
 [['when did the construction of a separate library begin?']],
 [['what is the czech republic?']],


### BLEU SCORE

The BLEU (Bilingual Evaluation Understudy) score is a metric used to evaluate the quality of machine-generated translations. It compares the machine-generated translation to one or more reference translations, and calculates a score based on the number of matching n-grams (sequences of n words) between the two. The BLEU score is a standard metric used in the field of natural language processing (NLP), and is often used to compare different machine translation models or to tune the hyperparameters of a machine translation model.


In [None]:
# Import the sentence_bleu function from the bleu_score module of the nltk.translate package
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Compute BLEU scores using different n-grams as the basis for comparison
one_gram_score = np.mean([sentence_bleu(predicted[0], real, weights=(1, 0, 0, 0)) for predicted, real in zip(predictions, test_data["question"])])
two_gram_score = np.mean([sentence_bleu(predicted[0], real, weights=(0, 1, 0, 0)) for predicted, real in zip(predictions,test_data["question"])])
three_gram_score = np.mean([sentence_bleu(predicted[0], real, weights=(0, 0, 1, 0)) for predicted, real in zip(predictions, test_data["question"])])
four_gram_score = np.mean([sentence_bleu(predicted[0], real, weights=(0, 0, 0, 1)) for predicted, real in zip(predictions, test_data["question"])])
cumulative_score = np.mean([sentence_bleu(predicted[0], real, weights=(.25, .25, .25, .25)) for predicted, real in zip(predictions, test_data["question"])])

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Print BLEU scores
print("1-gram BLEU score:", one_gram_score)
print("2-gram BLEU score:", two_gram_score)
print("3-gram BLEU score:", three_gram_score)
print("4-gram BLEU score:", four_gram_score)
print("Cumulative BLEU score:", cumulative_score)

1-gram BLEU score: 0.6547282818746354
2-gram BLEU score: 0.4429703665280654
3-gram BLEU score: 0.3506871171835451
4-gram BLEU score: 0.3129274884989012
Cumulative BLEU score: 0.401903069083785


### METEOR SCORE

The METEOR score is a metric used to evaluate the quality of machine-generated translations. It compares the machine-generated translation to one or more reference translations, and calculates a score based on the degree of word overlap between the two. The METEOR score is sensitive to word order, and takes into account both exact and stemmed word matches. It is a standard metric used in the field of natural language processing (NLP), and is often used to compare different machine translation models or to tune the hyperparameters of a machine translation model.


In [None]:
# Import the meteor_score function from the bleu_score module of the nltk.translate.meteor_score package
from nltk.translate.meteor_score import meteor_score

# Import the word_tokenize function from the bleu_score module of the nltk.tokenize package
from nltk.tokenize import word_tokenize

In [None]:
# Import the nltk package
import nltk

# Download necessary nltk resources
nltk.download("punkt")
nltk.download('omw-1.4')
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Compute average METEOR score for generated questions
meteor = np.mean([meteor_score([word_tokenize(predicted[0][0])], word_tokenize(real)) for predicted, real in zip(predictions, test_data["question"])])

# The meteor variable now contains the average METEOR score for the generated questions

In [None]:
print("METEOR score:", meteor)


METEOR score: 0.42133954040124955


### ROUGE SCORE

The ROUGE score is a metric used to evaluate the quality of machine-generated translations. It compares the machine-generated translation to one or more reference translations, and calculates a score based on the degree of word overlap between the two. The ROUGE score is sensitive to word order, and takes into account both exact and stemmed word matches. It is a standard metric used in the field of natural language processing (NLP), and is often used to compare different machine translation models or to tune the hyperparameters of a machine translation model.


In [None]:
# Install the rouge-score package
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=2f13b15cc45b6dd2488b226f6509f2ea86f0380134d9ccd5572409dd3e8bfbb8
  Stored in directory: /root/.cache/pip/wheels/24/55/6f/ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# Import the RougeScorer class from the rouge_scorer module of the rouge_score package
from rouge_score.rouge_scorer import RougeScorer

In [None]:
# Create an instance of the RougeScorer class with the 'rouge1' and 'rougeL' metrics
scorer = RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [None]:
# Compute ROUGE scores for generated questions and reference questions
scores = [scorer.score(predicted[0][0], real) for predicted, real in zip(predictions[::], test_data["question"])]

# The scores variable now contains a list of dictionaries, each dictionary containing the ROUGE scores for a pair of generated question and reference question


In [None]:
# Calculate the mean ROUGE-1 and ROUGE-L scores
rouge1_score = np.mean([score["rouge1"] for score in scores])
rougeL_score = np.mean([score["rougeL"] for score in scores])

In [None]:
print("ROUGE1 score:", rouge1_score)
print("ROUGEL score:", rougeL_score)

ROUGE1 score: 0.44872763702552915
ROUGEL score: 0.4356910909618465


### Precision, Recall and F1 Scores

Precision, recall, and F1 score are evaluation metrics used to measure the quality of machine-generated translations. They are commonly used in the field of natural language processing (NLP).

- **Precision** is a measure of the fraction of the machine-generated translation that is correct. It is calculated as the number of correctly translated words divided by the total number of words in the machine-generated translation. A higher precision score indicates a higher quality translation.
- **Recall** is a measure of the fraction of the reference translation that is correctly translated. It is calculated as the number of correctly translated words divided by the total number of words in the reference translation. A higher recall score indicates a higher quality translation.
- **F1 score** is the harmonic mean of precision and recall, and is a measure of the overall quality of the machine-generated translation. It is calculated as the harmonic mean of precision and recall, using the following formula:
  F1 = 2 * (precision * recall) / (precision + recall)
  A higher F1 score indicates a higher quality translation.

In general, a machine-generated translation with high precision will have fewer errors, but may not cover all the content of the reference translation. On the other hand, a machine-generated translation with high recall will cover more of the content of the reference translation, but may have more errors. The F1 score takes both precision and recall into account and provides a balance between the two.


In [None]:
# Compute mean F1, recall, and precision for the ROUGE-1 metric
rouge1_f1 = np.mean([score["rouge1"].fmeasure for score in scores])
rouge1_recall = np.mean([score["rouge1"].recall for score in scores])
rouge1_precision = np.mean([score["rouge1"].precision for score in scores])

# Compute mean F1, recall, and precision for the ROUGE-L metric
rougeL_f1 = np.mean([score["rougeL"].fmeasure for score in scores])
rougeL_recall = np.mean([score["rougeL"].recall for score in scores])
rougeL_precision = np.mean([score["rougeL"].precision for score in scores])

# The rouge1_f1, rouge1_recall, rouge1_precision, rougeL_f1, rougeL_recall, and rougeL_precision variables now contain the mean F1, recall, and precision for the ROUGE-1 and ROUGE-L metrics, respectively

In [None]:
print("Mean F1 score (ROUGE-1):", rouge1_f1)
print("Mean recall (ROUGE-1):", rouge1_recall)
print("Mean precision (ROUGE-1):", rouge1_precision)

print("Mean F1 score (ROUGE-L):", rougeL_f1)
print("Mean recall (ROUGE-L):", rougeL_recall)
print("Mean precision (ROUGE-L):", rougeL_precision)

Mean F1 score (ROUGE-1): 0.4421757434058688
Mean recall (ROUGE-1): 0.45132812122415145
Mean precision (ROUGE-1): 0.45267904644656737
Mean F1 score (ROUGE-L): 0.42933161977817785
Mean recall (ROUGE-L): 0.4381981737369261
Mean precision (ROUGE-L): 0.4395434793704354


In [None]:
test_data

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1587
})