# Fine-tune FLAN-T5 for CG Classification

## 1. Setup Development Environment


In [None]:
!pip install pytesseract transformers==4.28.1 datasets evaluate rouge-score nltk tensorboard py7zr

# from huggingface_hub import notebook_login
# notebook_login()

### Connect to Drive

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 2. Load and prepare dataset

In [4]:
import pickle

f = open("/content/drive/MyDrive/Corpus/CG_Corpus/cg_3to1_2previous_event_selection.dat", "rb")
dataset = pickle.load(f)
f.close()
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 970
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 325
    })
})

### Preprocess

In [5]:
import pandas as pd
from datasets import Dataset

train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])
train_df['Bel(A)'] = train_df['Bel(A)'].astype(str)
test_df['Bel(A)'] = test_df['Bel(A)'].astype(str)
train_df['CG(A)'] = train_df['CG(A)'].astype(str)
test_df['CG(A)'] = test_df['CG(A)'].astype(str)
train_df['Bel(B)'] = train_df['Bel(B)'].astype(str)
test_df['Bel(B)'] = test_df['Bel(B)'].astype(str)
train_df['CG(B)'] = train_df['CG(B)'].astype(str)
test_df['CG(B)'] = test_df['CG(B)'].astype(str)
dataset['train'] = Dataset.from_pandas(train_df)
dataset['test'] = Dataset.from_pandas(test_df)
dataset['train'] = dataset['train'].shuffle()
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 970
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 325
    })
})

In [6]:
train_df = dataset['train'].to_pandas()
train_df = train_df[train_df['CG(A)'] != '0']
dataset['train'] = Dataset.from_pandas(train_df)
dataset['train'] = dataset['train'].remove_columns('__index_level_0__')

test_df = dataset['test'].to_pandas()
test_df = test_df[test_df['CG(A)'] != '0']
dataset['test'] = Dataset.from_pandas(test_df)
dataset['test'] = dataset['test'].remove_columns('__index_level_0__')
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 889
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 311
    })
})

In [7]:
dataset['train'][20]

{'Speaker': 'A',
 'Sentence_Number': 60,
 'Sentence': 'None',
 'Event': 'Previous Sentences: A asks B if B is now not smoking B is now not smoking B is smoking A did a few cigarettes when B has been back too B did a few cigarettes \nTarget Sentence: B is back',
 'Target_Event': 'B is back',
 'Bel(A)': '1',
 'Bel(B)': '1',
 'CG(A)': '2',
 'CG(B)': '2'}

In [10]:
from random import randrange

sample = dataset['train'][randrange(len(dataset["train"]))]
print(f"Input Event without Context: \n{sample['Target_Event']}\n")
print(f"Bel(A)={sample['Bel(A)']},Bel(B)={sample['Bel(B)']}")
print(f"CG(A)={sample['CG(A)']},CG(B)={sample['CG(B)']}")

Input Event without Context: 
Norm and B live in the suburbs

Bel(A)=1,Bel(B)=1
CG(A)=1,CG(B)=1


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

(…)-base/resolve/main/tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…)flan-t5-base/resolve/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

(…)ase/resolve/main/special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [12]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["Target_Event"], truncation=True), batched=True, remove_columns=['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["CG(A)"], truncation=True), batched=True, remove_columns=['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Max source length: 69


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Max target length: 2


In [13]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["Bel(A)=" + sample['Bel(A)'][i] + ",Bel(B)=" + sample['Bel(B)'][i] + "\n\nTarget_Event:\n" + sample['Target_Event'][i] for i in range(len(sample["Target_Event"]))]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["CG(A)"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## 3. Fine-tune and evaluate FLAN-T5

In [14]:
from transformers import AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

(…)le/flan-t5-base/resolve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

(…)base/resolve/main/generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

We want to evaluate our model during training. The `Trainer` supports evaluation during training by providing a `compute_metrics`.
The most commonly used metrics to evaluate summarization task is [rogue_score](https://en.wikipedia.org/wiki/ROUGE_(metric)) short for Recall-Oriented Understudy for Gisting Evaluation). This metric does not behave like the standard accuracy: it will compare a generated summary against a set of reference summaries

We are going to use `evaluate` library to evaluate the `rogue` score.

In [15]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("f1")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, average='macro')
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Before we can start training is to create a `DataCollator` that will take care of padding our inputs and labels. We will use the `DataCollatorForSeq2Seq` from the 🤗 Transformers library.

In [16]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


The last step is to define the hyperparameters (`TrainingArguments`) we want to use for our training. We are leveraging the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer` to automatically push our checkpoints, logs and metrics during training into a repository.

In [17]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-event-extraction"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-4,

    num_train_epochs=12,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=300,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    # push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

We can start our training by using the `train` method of the `Trainer`.

In [18]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Gen Len
1,0.2617,0.453375,64.3475,2.0
2,0.1394,0.325897,64.3475,2.0
3,0.1488,0.184356,78.668,2.0
4,0.0889,0.256827,72.4351,2.0
5,0.0587,0.326245,76.9514,2.0
6,0.0363,0.475434,76.9004,2.0
7,0.0176,0.482817,75.2182,2.0
8,0.0139,0.541313,75.8861,2.0
9,0.0028,0.63006,75.546,2.0
10,0.0012,0.675355,75.8861,2.0


TrainOutput(global_step=1344, training_loss=0.064286159889196, metrics={'train_runtime': 589.2993, 'train_samples_per_second': 18.103, 'train_steps_per_second': 2.281, 'total_flos': 1027264420675584.0, 'train_loss': 0.064286159889196, 'epoch': 12.0})

Nice, we have trained our model. 🎉 Lets run evaluate the best model again on the test set.


In [19]:
trainer.evaluate()

{'eval_loss': 0.7026482820510864,
 'eval_f1': 75.8861,
 'eval_gen_len': 2.0,
 'eval_runtime': 6.6945,
 'eval_samples_per_second': 46.456,
 'eval_steps_per_second': 5.826,
 'epoch': 12.0}

## 4. Run Inference and Classification Report


In [20]:
from tqdm.auto import tqdm
results_dict = {'Speaker': [], 'Sentence_Number': [], 'Sentence': [], 'Event': [], 'Target_Event': [], 'Bel(A)': [], 'Bel(B)': [], 'PCG(A)': [], 'PCG(B)': [], 'CG(A)': [], 'CG(B)': [],}

samples_number = len(dataset['test'])
progress_bar = tqdm(range(samples_number))
predictions_list = []
labels_list = []
for i in range(samples_number):
  text = f"Bel(A)={dataset['test']['Bel(A)'][i]},Bel(B)={dataset['test']['Bel(B)'][i]}\n\nTarget_Event:\n{dataset['test']['Target_Event'][i]}"
  inputs = tokenizer.encode_plus(text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  predictions_list.append(prediction)
  labels_list.append(dataset['test']['CG(A)'][i])

  results_dict['Speaker'].append(dataset['test']['Speaker'][i])
  results_dict['Sentence_Number'].append(dataset['test']['Sentence_Number'][i])
  results_dict['Sentence'].append(text)
  results_dict['Event'].append(dataset['test']['Event'][i])
  results_dict['Target_Event'].append(dataset['test']['Target_Event'][i])
  results_dict['Bel(A)'].append(dataset['test']['Bel(A)'][i])
  results_dict['Bel(B)'].append(dataset['test']['Bel(B)'][i])
  results_dict['PCG(A)'].append(prediction)
  results_dict['PCG(B)'].append(prediction)
  results_dict['CG(A)'].append(dataset['test']['CG(A)'][i])
  results_dict['CG(B)'].append(dataset['test']['CG(B)'][i])

  progress_bar.update(1)

  0%|          | 0/311 [00:00<?, ?it/s]

In [21]:
from sklearn.metrics import classification_report

report = classification_report(labels_list, predictions_list, zero_division=0)
print(report)

              precision    recall  f1-score   support

           1       0.92      0.98      0.95       245
           2       0.64      0.23      0.34        30
           3       0.97      1.00      0.99        36

    accuracy                           0.91       311
   macro avg       0.84      0.74      0.76       311
weighted avg       0.90      0.91      0.89       311



In [22]:
results_df = pd.DataFrame.from_dict(results_dict)

In [23]:
results_df

Unnamed: 0,Speaker,Sentence_Number,Sentence,Event,Target_Event,Bel(A),Bel(B),PCG(A),PCG(B),CG(A),CG(B)
0,B,1,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nB took the...",Previous Sentences: \nTarget Sentence: B took ...,B took the kids to Jill's,1,1,1,1,1,1
1,B,1,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nThe kids s...",Previous Sentences: B took the kids to Jill's ...,The kids spent two days at Jill's,1,1,1,1,1,1
2,B,1,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nB guesses ...",Previous Sentences: B took the kids to Jill's ...,B guesses Jill couldn't take the kids,1,1,1,1,1,1
3,B,1,"Bel(A)=3,Bel(B)=3\n\nTarget_Event:\nJill could...",Previous Sentences: B took the kids to Jill's ...,Jill couldn't take the kids,3,3,1,1,1,1
4,B,1,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nThe kid's ...",Previous Sentences: B took the kids to Jill's ...,The kid's mom and dad came,1,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
306,A,144,"Bel(A)=3,Bel(B)=3\n\nTarget_Event:\nA's sons n...",Previous Sentences: A asks B if B knows what A...,A's sons never treat one another like A and B'...,3,3,1,1,1,1
307,B,145,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nB doesn't ...",Previous Sentences: B doesn't know what A said...,B doesn't think B's kids will be like A and B'...,1,1,1,1,1,1
308,B,145,"Bel(A)=3,Bel(B)=3\n\nTarget_Event:\nB's kids w...",Previous Sentences: B doesn't know what A said...,B's kids will be like A and B's mom and dad,3,3,1,1,1,1
309,A,146,"Bel(A)=1,Bel(B)=1\n\nTarget_Event:\nA and B's ...",Previous Sentences: A said A hopes that A's so...,A and B's dad just looked at A,1,1,1,1,1,1


In [24]:
results_df.to_csv('results.csv')

### Using Bel(A) trained model for test on Bel(B)

we should remove Bel(B)=0 for calculate correct classification reports

In [25]:
test_df = dataset['test'].to_pandas()
test_df = test_df[test_df['CG(B)'] != '0']
dataset['test'] = Dataset.from_pandas(test_df)
dataset['test'] = dataset['test'].remove_columns('__index_level_0__')

In [26]:
from tqdm.auto import tqdm

samples_number = len(dataset['test'])
progress_bar = tqdm(range(samples_number))
predictions_list = []
labels_list = []
for i in range(samples_number):
  text = f"Bel(A)={dataset['test']['Bel(A)'][i]},Bel(B)={dataset['test']['Bel(B)'][i]}\n\nTarget_Event:\n{dataset['test']['Target_Event'][i]}"
  inputs = tokenizer.encode_plus(text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  predictions_list.append(prediction)
  labels_list.append(dataset['test']['CG(B)'][i])

  progress_bar.update(1)

  0%|          | 0/310 [00:00<?, ?it/s]

In [27]:
from sklearn.metrics import classification_report

report = classification_report(labels_list, predictions_list, zero_division=0)
print(report)

              precision    recall  f1-score   support

           1       0.92      0.98      0.95       245
           2       0.64      0.25      0.36        28
           3       1.00      1.00      1.00        37

    accuracy                           0.92       310
   macro avg       0.85      0.74      0.77       310
weighted avg       0.90      0.92      0.90       310

