--------------------------------------
**Author**: Gunnvant

**Description**: datasets preprocessing pipeline

--------------------------------------

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
# Checkpoint name passed to both `AutoTokenizer.from_pretrained` and
# `AutoModelForSeq2SeqLM.from_pretrained` further down.
ckpt = "t5-small"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the CSV through the generic "csv" loader. No split is requested, so all
# rows end up in a single `train` split (see the output below).
raw_dataset = load_dataset("csv",data_files="../processed_summary.csv")

Downloading data files: 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 4505.16it/s]
Extracting data files: 100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 290.12it/s]
Generating train split: 2225 examples [00:00, 20077.08 examples/s]


In [3]:
# Inspect the first row of the `train` split to see the available fields.
raw_dataset['train'][0]

{'Text': 'UK economy facing \'major risks\'\n\nThe UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said.\n\nThe group\'s quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.\n\nManufacturers\' domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year.\n\n"Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said. "These results reinforce our concern over the se

### Preprocessing Steps

- Use the dataset statistics to choose `max_input_length`, the token length at which input texts are truncated
- Choose `max_target_length` for the summaries in the same way, from the training data
- Tokenize the texts and the summaries. Create a new field `labels` holding the `input_ids` produced by tokenizing the summaries

See [notebook](./02.ipynb)

In [4]:
# Token budgets used as `max_length` when tokenizing; longer sequences are
# truncated. Values were chosen from the data statistics (see the linked notebook).
max_input_length = 723   # cap for the prefixed input text
max_target_length = 313  # cap for the reference summary

In [5]:
# Tokenizer that belongs to the `ckpt` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(ckpt)

In [6]:
# Task prefix prepended to every input document before tokenization.
prefix = "summarize: "
def preprocess(examples):
    """Tokenize a batch of rows into model inputs and labels.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)`; the "Text" and
            "Summary" columns are read as lists of strings.

    Returns:
        The tokenizer output for the prefixed texts, with an extra "labels"
        entry holding the `input_ids` of the tokenized summaries.

    No padding is applied here on purpose. Padding inside `map` would fill the
    labels with the pad token id, which the loss does not ignore, and would pad
    every row to the longest row of its map batch. Padding is left to
    `DataCollatorForSeq2Seq`, which pads each training batch dynamically and
    fills labels with -100 so padded positions are excluded from the loss.
    """
    inputs = [prefix + doc for doc in examples["Text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(examples['Summary'], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

In [8]:
# Load the pretrained seq2seq model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)
# The collator needs the model *object* (not the checkpoint name) so it can
# call `model.prepare_decoder_input_ids_from_labels` when building batches.
# Passing the string silently disabled that step.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
# Display the module tree of the loaded model.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

### Evaluation

Use the ROUGE score to compare generated summaries against the reference summaries.

In [9]:
import evaluate
# ROUGE metric object; its `.compute(predictions=..., references=...)` is
# called by the baseline evaluation and by `compute_metrics` below.
rouge_score = evaluate.load("rouge")

In [10]:
from nltk.tokenize import sent_tokenize
def three_sentence_summary(text):
    """Return the first three sentences of `text`, one sentence per line."""
    sentences = sent_tokenize(text)
    leading = sentences[:3]
    return "\n".join(leading)

In [11]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Object whose "Text" and "Summary" columns are read by key.
        metric: Object exposing `compute(predictions=..., references=...)`.

    Returns:
        Whatever `metric.compute` returns for the baseline summaries.
    """
    baseline_summaries = []
    for document in dataset["Text"]:
        baseline_summaries.append(three_sentence_summary(document))
    reference_summaries = dataset["Summary"]
    return metric.compute(
        predictions=baseline_summaries,
        references=reference_summaries,
    )

In [12]:
# Baseline ROUGE for the first-three-sentences heuristic.
# TODO: score on a held-out validation split instead of the training data.
score = evaluate_baseline(raw_dataset["train"], rouge_score)

In [13]:
# Show the baseline ROUGE scores.
score

{'rouge1': 0.44729677794630696,
 'rouge2': 0.35164807984363006,
 'rougeL': 0.3422369099766631,
 'rougeLsum': 0.4052121651347588}

In [14]:
# Create the tokenized dataset: `preprocess` is applied to batches of rows
# (batched=True), so it receives lists of texts and summaries.
tokenized_dataset = raw_dataset.map(preprocess,batched=True)

Map: 100%|████████████████████████████████████████████████| 2225/2225 [00:02<00:00, 846.34 examples/s]


In [23]:
# Drop the raw string columns so only the tokenized fields are left.
# Only columns that are still present are removed, so this cell can be re-run
# safely (removing an already-removed column would raise).
stale_columns = [
    name for name in ("Text", "Summary")
    if name in tokenized_dataset["train"].column_names
]
if stale_columns:
    tokenized_dataset = tokenized_dataset.remove_columns(stale_columns)

In [24]:
# Confirm which features remain after dropping the raw columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2225
    })
})

### Train the model

In [17]:
from transformers import Seq2SeqTrainingArguments

# Knobs a reader is most likely to tune.
batch_size, num_train_epochs = 8, 1
# Number of whole batches in the training split, so the training loss is
# logged roughly once per epoch.
logging_steps = len(tokenized_dataset["train"]) // batch_size

training_kwargs = {
    # Output and checkpointing
    "output_dir": "finetuned-amazon-en-es",
    "save_total_limit": 3,
    "push_to_hub": False,
    # Optimisation
    "learning_rate": 5.6e-5,
    "weight_decay": 0.01,
    "num_train_epochs": num_train_epochs,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    # Evaluation and logging
    "evaluation_strategy": "epoch",
    "predict_with_generate": True,
    "logging_steps": logging_steps,
}
args = Seq2SeqTrainingArguments(**training_kwargs)

In [18]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for generated summaries.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays, as handed
            over by the trainer during evaluation.

    Returns:
        Dict mapping each ROUGE key to its score multiplied by 100 and rounded
        to four decimals.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is used as padding/ignore index and cannot be decoded, so swap it
    # for the pad token id in both predictions and labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode token ids back into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # `evaluate`'s ROUGE returns plain floats in [0, 1] (see the baseline
    # output above), not objects with `.mid.fmeasure`; scale to percentages.
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
# Take the first two tokenized training examples as a tiny batch for manually
# checking the collator.
features = [tokenized_dataset["train"][i] for i in range(2)]
# Left disabled; uncomment to inspect the collated batch.
#data_collator(features)

In [26]:
from transformers import Seq2SeqTrainer

In [28]:
# `args` requests evaluation every epoch, which requires an `eval_dataset`.
# The CSV only provides a `train` split, so hold out 10% of it (fixed seed for
# reproducibility). `tokenized_dataset` itself is left untouched so this cell
# can be re-run.
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration in `args`.
trainer.train()