In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    pairwise,
)
from torch.utils.data import DataLoader

import torch
import pandas as pd

import evaluate
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

from utils.data_preprocessing import preprocess_data, split_data, split_data_all

%load_ext autoreload
%autoreload 2

In [2]:
topics, opinions, conclusions = preprocess_data()

In [3]:
data = topics.merge(conclusions[['topic_id', 'text']], on='topic_id', suffixes=('_topic', '_conclusion'))

# Group opinions by 'topic_id' and concatenate their texts
# opinions_grouped = opinions.groupby('topic_id')['text'].apply(lambda texts: ' '.join(texts)).reset_index()
opinions_grouped = opinions.groupby('topic_id')['text'].apply(lambda texts: '\n'.join(text.strip() for text in texts)).reset_index()

data = data.merge(opinions_grouped, on='topic_id', how='left')
data.rename(columns={'text_topic': 'topic_text', 'text_conclusion': 'target_text', 'text': 'opinions_text'}, inplace=True)

data['opinions_text'] = data['opinions_text'].fillna('')

data['input_text'] = 'summarize: ' + data['topic_text'] + ' ' + data['opinions_text']

data = data[['input_text', 'target_text']]
data = data.dropna(subset=['input_text', 'target_text'])
data = data[data['input_text'].str.strip() != '']
data = data[data['target_text'].str.strip() != '']

In [4]:
dataset = Dataset.from_pandas(data)

In [5]:
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [6]:
model_name = 't5-small'

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
max_input_length = 512
max_target_length = 150

def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [9]:
# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2656 [00:00<?, ? examples/s]



Map:   0%|          | 0/664 [00:00<?, ? examples/s]

In [10]:
rouge_metric = evaluate.load('rouge')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # print(result)
    # print(result.items())

    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result['gen_len'] = np.mean(prediction_lens)

    # beam_predictions = model.generate(
    #     input_ids,
    #     max_length=max_target_length,
    #     num_beams=5,
    #     early_stopping=True
    # )

    # result['gen_len'] = np.mean([len(pred) for pred in beam_predictions])

    return {k: round(v, 4) for k, v in result.items()}

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./saved_models/summarization',
    eval_strategy='steps',
    eval_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # gradient_accumulation_steps=2,
    num_train_epochs=2,
    save_steps=300,
    warmup_steps=100,
    learning_rate=5e-5,
    # weight_decay=0.001,
    optim="adamw_torch",
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=0,
    # save_strategy="no",
    load_best_model_at_end=True,
    predict_with_generate=True,
    generation_max_length=40,
    generation_num_beams=5,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
50,10.3068,7.155865,0.281,0.0827,0.1932,0.1932,38.9593
100,3.6223,1.755919,0.0,0.0,0.0,0.0,0.0
150,1.8553,1.645716,0.0,0.0,0.0,0.0,0.0
200,1.7216,1.59126,0.0003,0.0002,0.0003,0.0003,0.0587
250,1.7612,1.55885,0.0685,0.0254,0.0488,0.0487,7.8253
300,1.6115,1.544973,0.1618,0.0596,0.1148,0.1154,18.994
350,1.6898,1.5316,0.2846,0.0993,0.1991,0.1996,34.6867
400,1.5425,1.528818,0.2905,0.1012,0.2037,0.2042,35.3509
450,1.6249,1.523454,0.299,0.1037,0.21,0.2104,36.6009
500,1.6232,1.521338,0.3084,0.1079,0.2169,0.2172,37.4669


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=664, training_loss=2.460550210562097, metrics={'train_runtime': 1179.3869, 'train_samples_per_second': 4.504, 'train_steps_per_second': 0.563, 'total_flos': 718935649419264.0, 'train_loss': 2.460550210562097, 'epoch': 2.0})

In [13]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.5178840160369873, 'eval_rouge1': 0.3084, 'eval_rouge2': 0.1081, 'eval_rougeL': 0.2175, 'eval_rougeLsum': 0.2178, 'eval_gen_len': 37.5633, 'eval_runtime': 73.1439, 'eval_samples_per_second': 9.078, 'eval_steps_per_second': 0.574, 'epoch': 2.0}


In [14]:
trainer.save_model('saved_models/summarization/')
tokenizer.save_pretrained('saved_models/summarization/')

('saved_models/summarization/tokenizer_config.json',
 'saved_models/summarization/special_tokens_map.json',
 'saved_models/summarization/spiece.model',
 'saved_models/summarization/added_tokens.json')

In [19]:
model = T5ForConditionalGeneration.from_pretrained('saved_models/summarization')
tokenizer = T5Tokenizer.from_pretrained(model_name)

model.eval()

def summarize(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)
    input_ids = input_ids
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=150,
        num_beams=5,
        early_stopping=True
    )
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

topic_id_example = topics['text'].iloc[1]
# input_text = generate_input_text(topic_id_example)
input_text = 'summarize: ' + topic_id_example
if input_text:
    conclusion = summarize(input_text)
    print("Generated Conclusion:\n", conclusion)

Generated Conclusion:
 with so many things in this world that few people agree on this is a nice change to see in regards the removal of so many cars that few people agree on this is a nice change to see in regards to the removal of so many cars


In [26]:
topic_id_example = topics['text'].iloc[8]
# input_text = generate_input_text(topic_id_example)
input_text = 'summarize: ' + topic_id_example
if input_text:
    conclusion = summarize(input_text)
    print("Generated Conclusion:\n", conclusion)

Generated Conclusion:
 there are many sides to people giving up their cars some people are truly happy and some are not it may not be that bad i mean how did people manige before cars were even invented
