In [36]:
# pip install -q transformers datasets rouge_score

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

train_df = pd.read_csv("ie_train_data.csv")
test_df = pd.read_csv("ie_test_data.csv")

print("size of train:", train_df.shape)
print("size of test:", test_df.shape)

train_dict = {"text":[], "summary":[]}
for summary, text in zip(train_df['rps'], train_df['sentence']):
    train_dict['text'].append(text)
    train_dict['summary'].append(eval(summary)[0])        

test_dict = {"text":[], "summary":[]}
for summary, text in zip(test_df['rps'], test_df['sentence']):
    test_dict['text'].append(text)
    test_dict['summary'].append(eval(summary)[0])


dataset_train = Dataset.from_dict(train_dict)
dataset_test = Dataset.from_dict(test_dict)

dataset = DatasetDict({
    'train': dataset_train, 
    'test': dataset_test
    })

dataset

size of train: (602, 3)
size of test: (313, 3)


DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 602
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 313
    })
})

In [2]:
from transformers import AutoTokenizer

model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cuda')

print("Model initialized")

Model initialized


In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.001,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [8]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 602
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 380


Epoch,Training Loss,Validation Loss
1,No log,0.797604
2,No log,0.42313
3,No log,0.353685
4,No log,0.330607
5,No log,0.324485


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313


TrainOutput(global_step=380, training_loss=0.6992067035875822, metrics={'train_runtime': 38.6887, 'train_samples_per_second': 77.8, 'train_steps_per_second': 9.822, 'total_flos': 46740502609920.0, 'train_loss': 0.6992067035875822, 'epoch': 5.0})

In [9]:
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="results", tokenizer="results")

print("Model loaded")

In [18]:
from tqdm.notebook import tqdm

generated_summary = []
reference_summary = []
for index in tqdm(range(len(test_dict['text']))):
    # print("Text:", test_dict['text'][index])
    # print("Org Summary:", test_dict['summary'][index])
    # print("T5 Summary:", summarizer(test_dict['text'][index], min_length=1, max_length=10))
    generated_summary.append(summarizer(test_dict['text'][index], min_length=1, 
                                        max_length=min(10, len(test_dict['text'][index])))[0]['summary_text'])
    reference_summary.append(test_dict['summary'][index])

  0%|          | 0/313 [00:00<?, ?it/s]

Your max_length is set to 10, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but you input_length is only 8. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


In [19]:
from datasets import load_metric

rouge_score = load_metric("rouge")
scores = rouge_score.compute(predictions=[generated_summary], references=[reference_summary])

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
f1s = dict((rn, round(scores[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
print("F1s:", f1s)
recalls = dict((rn, round(scores[rn].mid.recall * 100, 2)) for rn in rouge_names)
print("Recalls:", recalls)
precisions = dict((rn, round(scores[rn].mid.precision * 100, 2)) for rn in rouge_names)
print("Precisions:", precisions)

F1s: {'rouge1': 75.12, 'rouge2': 55.04, 'rougeL': 64.86, 'rougeLsum': 64.86}
Recalls: {'rouge1': 75.93, 'rouge2': 55.63, 'rougeL': 65.56, 'rougeLsum': 65.56}
Precisions: {'rouge1': 74.33, 'rouge2': 54.46, 'rougeL': 64.18, 'rougeLsum': 64.18}


* T5-small

```
F1s: {'rouge1': 75.12, 'rouge2': 55.04, 'rougeL': 64.86, 'rougeLsum': 64.86}
Recalls: {'rouge1': 75.93, 'rouge2': 55.63, 'rougeL': 65.56, 'rougeLsum': 65.56}
Precisions: {'rouge1': 74.33, 'rouge2': 54.46, 'rougeL': 64.18, 'rougeLsum': 64.18
```

In [14]:
!zip -r results.zip results/

updating: results/ (stored 0%)
  adding: results/pytorch_model.bin (deflated 16%)
  adding: results/tokenizer.json (deflated 74%)
  adding: results/tokenizer_config.json (deflated 83%)
  adding: results/runs/ (stored 0%)
  adding: results/runs/Jun28_16-14-56_5d13485ffe6f/ (stored 0%)
  adding: results/runs/Jun28_16-14-56_5d13485ffe6f/1656432898.928341/ (stored 0%)
  adding: results/runs/Jun28_16-14-56_5d13485ffe6f/1656432898.928341/events.out.tfevents.1656432898.5d13485ffe6f.439.3 (deflated 62%)
  adding: results/runs/Jun28_16-14-56_5d13485ffe6f/events.out.tfevents.1656432898.5d13485ffe6f.439.2 (deflated 60%)
  adding: results/runs/Jun28_16-33-08_5d13485ffe6f/ (stored 0%)
  adding: results/runs/Jun28_16-33-08_5d13485ffe6f/events.out.tfevents.1656433988.5d13485ffe6f.918.0 (deflated 60%)
  adding: results/runs/Jun28_16-33-08_5d13485ffe6f/1656433988.9050965/ (stored 0%)
  adding: results/runs/Jun28_16-33-08_5d13485ffe6f/1656433988.9050965/events.out.tfevents.1656433988.5d13485ffe6f.918.1 