In [2]:
pip install -q transformers datasets rouge_score

[K     |████████████████████████████████| 4.4 MB 31.9 MB/s 
[K     |████████████████████████████████| 362 kB 52.2 MB/s 
[K     |████████████████████████████████| 596 kB 70.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 53.6 MB/s 
[K     |████████████████████████████████| 101 kB 11.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 44.5 MB/s 
[K     |████████████████████████████████| 212 kB 76.8 MB/s 
[K     |████████████████████████████████| 140 kB 73.8 MB/s 
[K     |████████████████████████████████| 127 kB 75.0 MB/s 
[K     |████████████████████████████████| 94 kB 3.1 MB/s 
[K     |████████████████████████████████| 271 kB 77.2 MB/s 
[K     |████████████████████████████████| 144 kB 74.3 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

train_df = pd.read_csv("ie_train_data.csv")
test_df = pd.read_csv("ie_test_data.csv")

print("size of train:", train_df.shape)
print("size of test:", test_df.shape)

train_dict = {"text":[], "summary":[]}
for summary, text in zip(train_df['rps'], train_df['sentence']):
    train_dict['text'].append(text)
    train_dict['summary'].append(eval(summary)[0])        

test_dict = {"text":[], "summary":[]}
for summary, text in zip(test_df['rps'], test_df['sentence']):
    test_dict['text'].append(text)
    test_dict['summary'].append(eval(summary)[0])


dataset_train = Dataset.from_dict(train_dict)
dataset_test = Dataset.from_dict(test_dict)

dataset = DatasetDict({
    'train': dataset_train, 
    'test': dataset_test
    })

dataset

size of train: (602, 3)
size of test: (313, 3)


DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 602
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 313
    })
})

In [2]:
from transformers import AutoTokenizer

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cuda')

print("Model initialized")

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Model initialized


In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.001,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [8]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 602
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 380


Epoch,Training Loss,Validation Loss
1,No log,0.286507
2,No log,0.216794
3,No log,0.196322
4,No log,0.194851
5,No log,0.194423


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 313


TrainOutput(global_step=380, training_loss=0.3346403824655633, metrics={'train_runtime': 87.8261, 'train_samples_per_second': 34.272, 'train_steps_per_second': 4.327, 'total_flos': 210304559001600.0, 'train_loss': 0.3346403824655633, 'epoch': 5.0})

In [9]:
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="results", tokenizer="results")

print("Model loaded")

In [11]:
from tqdm.notebook import tqdm

generated_summary = []
reference_summary = []
for index in tqdm(range(len(test_dict['text']))):
    # print("Text:", test_dict['text'][index])
    # print("Org Summary:", test_dict['summary'][index])
    # print("T5 Summary:", summarizer(test_dict['text'][index], min_length=1, max_length=10))
    generated_summary.append(summarizer(test_dict['text'][index], min_length=1, 
                                        max_length=min(10, len(test_dict['text'][index])))[0]['summary_text'])
    reference_summary.append(test_dict['summary'][index])

  0%|          | 0/313 [00:00<?, ?it/s]

Your max_length is set to 10, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but you input_length is only 8. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


In [12]:
from datasets import load_metric

rouge_score = load_metric("rouge")
scores = rouge_score.compute(predictions=[generated_summary], references=[reference_summary])

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
f1s = dict((rn, round(scores[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
print("F1s:", f1s)
recalls = dict((rn, round(scores[rn].mid.recall * 100, 2)) for rn in rouge_names)
print("Recalls:", recalls)
precisions = dict((rn, round(scores[rn].mid.precision * 100, 2)) for rn in rouge_names)
print("Precisions:", precisions)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

F1s: {'rouge1': 79.56, 'rouge2': 62.95, 'rougeL': 71.42, 'rougeLsum': 71.42}
Recalls: {'rouge1': 77.5, 'rouge2': 61.31, 'rougeL': 69.57, 'rougeLsum': 69.57}
Precisions: {'rouge1': 81.73, 'rouge2': 64.67, 'rougeL': 73.37, 'rougeLsum': 73.37}


* T5-base

```
F1s: {'rouge1': 79.56, 'rouge2': 62.95, 'rougeL': 71.42, 'rougeLsum': 71.42}
Recalls: {'rouge1': 77.5, 'rouge2': 61.31, 'rougeL': 69.57, 'rougeLsum': 69.57}
Precisions: {'rouge1': 81.73, 'rouge2': 64.67, 'rougeL': 73.37, 'rougeLsum': 73.37}
```

In [13]:
!zip -r results.zip results/

  adding: results/ (stored 0%)
  adding: results/config.json (deflated 63%)
  adding: results/special_tokens_map.json (deflated 86%)
  adding: results/tokenizer_config.json (deflated 83%)
  adding: results/pytorch_model.bin (deflated 11%)
  adding: results/training_args.bin (deflated 47%)
  adding: results/runs/ (stored 0%)
  adding: results/runs/Jun29_17-11-05_db932393cabf/ (stored 0%)
  adding: results/runs/Jun29_17-11-05_db932393cabf/1656522671.6238956/ (stored 0%)
  adding: results/runs/Jun29_17-11-05_db932393cabf/1656522671.6238956/events.out.tfevents.1656522671.db932393cabf.159.1 (deflated 62%)
  adding: results/runs/Jun29_17-11-05_db932393cabf/events.out.tfevents.1656522671.db932393cabf.159.0 (deflated 60%)
  adding: results/tokenizer.json (deflated 74%)
