In [1]:
By Matthew White

Citations: Matthew Wattson, https://keras.io/guides/keras_nlp/transformer_pretraining/

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nyt-news/nyt_all.csv
/kaggle/input/nyt-news/nyt_val.csv
/kaggle/input/nyt-news/nyt_train.csv
/kaggle/input/nyt-news/nyt_test.csv


In [2]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=94f7c89b6a0addd79fa46a43fd5ae911911a8c454a7876159c3b84bd082f3f6a
  Stored in directory: /root/.cache/pip/wheels/84/ac/6b/38096e3c5bf1dc87911e3585875e21a3ac610348e740409c76
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

In [3]:
!pip install keras-nlp

Collecting keras-nlp
  Downloading keras_nlp-0.3.1-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.1/151.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tensorflow-text
  Downloading tensorflow_text-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tensorboard<2.7,>=2.6.0
  Downloading tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting typing-extensions<3.11,>=3.7
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Collecting numpy
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m66.7 MB/s[0

In [4]:
import os
import logging

import nltk
import numpy as np
import keras_nlp
import tensorflow as tf
from tensorflow import keras

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

Establish some starting parameters where we can come back and change them easily to try different hyperparameters.

In [5]:
# The percentage of the dataset you want to split as train and test
TRAIN_TEST_SPLIT = 0.1

MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 128  # Maximum length of the output by the model
BATCH_SIZE = 8  # Batch-size for training our model
LEARNING_RATE = 2e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

Loading our data set into train, validation, and test

In [6]:
from datasets import load_dataset

raw_datasets = load_dataset('csv', data_files={'train':'/kaggle/input/nyt-news/nyt_train.csv','validation':'/kaggle/input/nyt-news/nyt_val.csv', 'test':'/kaggle/input/nyt-news/nyt_test.csv'},delimiter='~')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a8f2af29a5ec2f9a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a8f2af29a5ec2f9a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'maintext'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'maintext'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'maintext'],
        num_rows: 1000
    })
})


In [8]:
print(raw_datasets['train'][0])

{'Unnamed: 0': 0, 'title': 'On Sundays, Foursquare Co-founder Goes Online, Then Out for a Walk', 'maintext': 'NIBBLE OF NEW YORK I don’t have a set schedule. But we go on these long walks. We just kind of snake our way through the Lower East Side, parts of the East Village, through SoHo, come up through Chelsea. And we bring people with us, and it’s a way to introduce people to new things. It’s about trying to do as much as we can in one day, but in bite-sized increments. Like we’ll stop at La Esquina and get a horchata. You’ll walk down Orchard Street and stop at four different galleries. We’ll go to Hester Street and each have one dumpling. You might have seven appetizers over the course of the day and stop and get two or three things to drink. And get to check out a little art on the way, and just try to find things we haven’t found before. PATH AND DEVIATIONS Sometimes we’ll do that for five or six hours. The path is always similar: walk down Avenue B, end up on Clinton Street, wal

In [9]:
#raw_datasets = raw_datasets.train_test_split(
#train_size=TRAIN_TEST_SPLIT, test_size=TRAIN_TEST_SPLIT
#)

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

tokenizing the dataset with the tokenizer from the model we will be re-training

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=512)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [11]:
if MODEL_CHECKPOINT in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [12]:
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["maintext"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["title"], max_length=MAX_TARGET_LENGTH, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [13]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

 Here we define our 'model'. We are using a model that already exists so we are really just creating that then setting up the training parameters so we can re-train it.

In [14]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

2022-12-06 12:28:11.137370: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 12:28:11.138584: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 12:28:11.139320: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 12:28:11.141452: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

2022-12-06 12:28:30.853988: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-12-06 12:28:32.475298: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 65798144 exceeds 10% of free system memory.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [16]:
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)
generation_dataset = (
    tokenized_datasets["test"]
    .shuffle()
    .select(list(range(200)))
    .to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [17]:
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [18]:
import keras_nlp

rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    # We will print only the F1 score, you can use other aggregation metrics as well
    result = {"RougeL": result["f1_score"]}

    return result

Finally we fit the t5 model to our data set

In [19]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)

2022-12-06 12:29:01.788111: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)




<keras.callbacks.History at 0x7f9020673510>

# Example 1:

In [28]:
print(raw_datasets["test"][0]["maintext"])

A Bronx teenager who the police say stabbed his mother three times in the chest was charged with murder yesterday. The man, Brandon Elliott, 19, was charged in the attack, which occurred Tuesday on East 224th Street in the Williamsbridge section. He lived with his mother, Bridget Johnson, 42. She was taken to Our Lady of Mercy Medical Center, where she died last night, the police said. Tina Kelley (NYT)


Generated title from model

In [42]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(
    raw_datasets["test"][0]["maintext"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)

Your max_length is set to 128, but you input_length is only 97. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
2022-12-06 16:15:10.742308: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 65798144 exceeds 10% of free system memory.
2022-12-06 16:15:11.169733: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 65798144 exceeds 10% of free system memory.
2022-12-06 16:15:11.579521: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 65798144 exceeds 10% of free system memory.


[{'summary_text': 'Brandon Elliott, 19, stabbed his mother three times in the chest .'}]

Actual title

In [43]:
print(raw_datasets["test"][0]["title"])

Metro Briefing | New York: Bronx: Teenager Charged In Murder


!pip install evaluate

Here is the rouge score of our model generated title!

In [55]:
import evaluate
rouge = evaluate.load('rouge')
predictions = ["Brandon Elliott, 19, stabbed his mother three times in the chest ."]
references = ["A Bronx teenager who the police say stabbed his mother three times in the chest was charged with murder yesterday. The man, Brandon Elliott, 19, was charged in the attack, which occurred Tuesday on East 224th Street in the Williamsbridge section. He lived with his mother, Bridget Johnson, 42. She was taken to Our Lady of Mercy Medical Center, where she died last night, the police said. Tina Kelley (NYT)"]
results = rouge.compute(predictions=predictions,references=references)
print(results)

{'rouge1': 0.271604938271605, 'rouge2': 0.22784810126582278, 'rougeL': 0.19753086419753085, 'rougeLsum': 0.19753086419753085}


In [49]:
def evaluate_baseline(maintext,title, metric):
    summaries = summarizer(maintext)
    return metric.compute(predictions=summaries, references=title)

 # Example 2:

In [29]:
print(raw_datasets["test"][1]["maintext"])

SAN ANTONIO, Texas — When the lights went out Monday night in the Alazán-Apache housing project in San Antonio — which stands in one of the city’s poorest ZIP codes — the traffic signals in the neighborhood flickered off and storekeepers pulled down their shutters. For residents, there was little left to do but huddle under blankets and hope that their children wouldn’t fall ill. “I need to take my kids somewhere to keep them warm. I don’t know where,” said Ricardo Cruz, 42, who lives at the Alazán-Apache Courts with his wife and five children, between 5 and 13 years old, and who has been without electricity since 7 p.m. Monday. While the rolling blackouts in Texas have left some 4 million residents without power in brutally cold weather, experts and community groups say that many marginalized communities were the first to be hit with power outages, and if history serves as a guide, could be among the last to be reconnected. This is particularly perilous, they say, given that low-incom

Generated title from model

In [30]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(
    raw_datasets["test"][1]["maintext"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)

[{'summary_text': 'Alazán-Apache Housing Project in San Antonio'}]

Actual title

In [32]:
print(raw_datasets["test"][1]["title"])

Texas Blackouts Hit Minority Neighborhoods Especially Hard


Here is the rouge score of our model generated title!

In [56]:
import evaluate
rouge = evaluate.load('rouge')
predictions = ["Alazán-Apache Housing Project in San Antonio"]
references = ["SAN ANTONIO, Texas — When the lights went out Monday night in the Alazán-Apache housing project in San Antonio — which stands in one of the city’s poorest ZIP codes — the traffic signals in the neighborhood flickered off and storekeepers pulled down their shutters. For residents, there was little left to do but huddle under blankets and hope that their children wouldn’t fall ill. “I need to take my kids somewhere to keep them warm. I don’t know where,” said Ricardo Cruz, 42, who lives at the Alazán-Apache Courts with his wife and five children, between 5 and 13 years old, and who has been without electricity since 7 p.m. Monday. While the rolling blackouts in Texas have left some 4 million residents without power in brutally cold weather, experts and community groups say that many marginalized communities were the first to be hit with power outages, and if history serves as a guide, could be among the last to be reconnected. This is particularly perilous, they say, given that low-income households can lack the financial resources to flee to safety or to rebound after the disruption."]
results = rouge.compute(predictions=predictions,references=references)
print(results)

{'rouge1': 0.0792079207920792, 'rouge2': 0.07, 'rougeL': 0.0792079207920792, 'rougeLsum': 0.0792079207920792}


**Trying to push this model to hugging face but it is not accepting my authorization. Doesn't matter too much, not part of the assignment.**

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
model.push_to_hub("t5-small_nytnews")
tokenizer.push_to_hub("t5-small_nytnews")

RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-638f383e-42e51d0f30ccd114386545a8)

Repository Not Found for url: https://huggingface.co/api/repos/create.
Please make sure you specified the correct `repo_id` and `repo_type`.
If the repo is private, make sure you are authenticated.
Unauthorized - Unauthorized

# Trying to train another model **(extra!)**

In [19]:
# The percentage of the dataset you want to split as train and test
TRAIN_TEST_SPLIT = 0.1

MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 128  # Maximum length of the output by the model
BATCH_SIZE = 8  # Batch-size for training our model
LEARNING_RATE = 2e-5  # Learning-rate for training our model
MAX_EPOCHS = 3  # Maximum number of epochs we will train the model for

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

In [20]:
if MODEL_CHECKPOINT in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [21]:
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["maintext"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["title"], max_length=MAX_TARGET_LENGTH, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=512)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [23]:
from datasets import load_dataset

raw_datasets = load_dataset('csv', data_files={'train':'/kaggle/input/nyt-news/nyt_train.csv','validation':'/kaggle/input/nyt-news/nyt_val.csv', 'test':'/kaggle/input/nyt-news/nyt_test.csv'},delimiter='~')

  0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

2022-12-06 15:51:26.966674: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 65798144 exceeds 10% of free system memory.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [27]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [28]:
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)
generation_dataset = (
    tokenized_datasets["test"]
    .shuffle()
    .select(list(range(200)))
    .to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [29]:
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [30]:
import keras_nlp

rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    # We will print only the F1 score, you can use other aggregation metrics as well
    result = {"RougeL": result["f1_score"]}

    return result

In [31]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)

Epoch 1/3


OverflowError: out of range integral type conversion attempted

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(
    raw_datasets["test"][0]["maintext"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)