## 1. Setup Development Environment



In [None]:
!pip install pytesseract googletrans==3.1.0a0 transformers==4.28.1 datasets evaluate rouge-score sentence_transformers nltk tensorboard py7zr --upgrade

In [None]:
from huggingface_hub import notebook_login

notebook_login()

## Connect to Drive

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Load Dataset

In [7]:
def create_train_test(dataset):
    questions, answers = [], []
    for record in dataset['data']:
        title = record['title']
        paragraph = record['paragraphs'][0]
        context = paragraph['context']
        for q in paragraph['qas']:
            question = q['question']
            try: short_answer = q['answers'][0]['text']
            except: short_answer = "نامشخص"
            questions.append(question)
            answers.append(f"Title: {title}\nShort Answer: {short_answer}\nLong Answer: {context}")
    df = pd.DataFrame({'question': questions, 'answer': answers})
    return df

In [8]:
import json

f = open('/content/drive/MyDrive/Corpus/PersianQA/pqa_train.json')
train_set = json.load(f)
f = open('/content/drive/MyDrive/Corpus/PersianQA/pqa_test.json')
test_set = json.load(f)

df_train = create_train_test(train_set)
df_test = create_train_test(test_set)

In [9]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
})

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 9008
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 930
    })
})

In [None]:
# num_train_samples = 1000
# num_test_samples = 100

# train_dataset = dataset['train'].shuffle(seed=42).select(range(num_train_samples))
# test_dataset = dataset['test'].shuffle(seed=42).select(range(num_test_samples))

# dataset = DatasetDict({
#     "train": train_dataset,
#     "test": test_dataset
# })
# dataset

In [11]:
import googletrans
from googletrans import Translator

translator = Translator()

def translated_text(input_text):
    translated = translator.translate(input_text, src='fa', dest='en')
    return translated.text

In [18]:
from tqdm import tqdm

def translated_df(dataset):
    questions, answers, questions_en, answers_en = [], [], [], []
    for i in tqdm(range(len(dataset['question'])), desc="Translating..."):
        questions.append(dataset['question'][i])
        answers.append(dataset['answer'][i])
        questions_en.append(translated_text(dataset['question'][i]))
        answers_en.append(translated_text(dataset['answer'][i]))
    df = pd.DataFrame({'question': questions, 'answer': answers, 'question_en': questions_en, 'answer_en': answers_en})

In [20]:
df_train = translated_df(dataset['train'])
df_test = translated_df(dataset['test'])
dataset = DatasetDict({"train": Dataset.from_pandas(df_train), "test": Dataset.from_pandas(df_test)})

Translating...: 100%|██████████| 9008/9008 [50:00<00:00,  3.00it/s]
Translating...: 100%|██████████| 930/930 [03:03<00:00,  5.07it/s]


AttributeError: 'NoneType' object has no attribute 'columns'

In [None]:
dataset['train'][65]

{'title': 'عروسک',
 'context': 'عروسک اسباب\u200cبازی کودکان و گاهی نوجوانان است و معمولاً شبیه انسان ساخته می\u200cشود. عروسک به قوه تخیل کودکان کمک می\u200cکند بگونه\u200cای که هنگام بازی، کودک آن را فردی زنده تصور کرده و برای آن شخصیت و اعمال متصور می\u200cشود. با توجه به لزوم ایجاد یک پل ارتباطی بین بزرگسال و کودک که توسط آن بزرگسال بتواند هم کودک را بشناسد و هم دنیای خویش را به او معرفی کند، ساخت و ارائه ی اسباب\u200cبازی به کودک یکی از مؤثرترین راهکارهایی است که با درگیر کردن احساسات کودک می\u200cتواند وی را به واکاوی و جستجو در مفاهیم دنیای بزرگسالان وادار کند. از سوی دیگر کودکان در بازی\u200cهای خود استعدادشان را در زمینهٔ تجربه و قدرت تخیل\u200cشان به نمایش می\u200cگذارند. گاهی از عروسک\u200cها برای نمایش بجای انسان\u200cها استفاده می\u200cشود که به آن\u200cها عروسک خیمه\u200cشب\u200cبازی یا عروسک نمایشی گفته می\u200cشود. استفاده دختربچه ها از عروسک به مراتب بیشتر از پسرهاست و این برگرفته از حس مادرپنداری آنان در آینده است. آنچنان که اشاره گردید عروسک ها پلی ارتباطی میان دنیای

In [None]:
from random import randrange


sample = dataset['train'][randrange(len(dataset["train"]))]
print(f"question: \n{sample['question']}\n---------------")
print(f"title: \n{sample['title']}\n---------------")
print(f"answer: \n{sample['answer']}\n---------------")
print(f"context: \n{sample['context']}\n---------------")

question: 
صومِعِه محل زندگی چه کسانی است؟
---------------
title: 
صومعه
---------------
answer: 
راهب‌ها و راهبه‌ها تحت سرپرستی یک رئیس مرد یا زن در آن زندگی می‌کنند
---------------
context: 
صومِعِه سازه‌ای است که راهب‌ها و راهبه‌ها تحت سرپرستی یک رئیس مرد یا زن در آن زندگی می‌کنند. در قرون وسطی صومعه‌های بسیاری در سراسر اروپا ساخته شدند. بعضی از صومعه‌ها کلیساهای زیبایی داشتند که زیر نظر آنها اداره می‌شدند. یک صومعه اغلب دارای یک محوطه باز با تالار بزرگ، ایوان سرپوشیده برای قدم زدن، مطالعه و تفکر راهبان و یک خوابگاه برای استراحت آن‌ها بود. درون صومعه آشپزخانه، اصطبل، انبار و اتاق میهمان نیز وجود داشت و درون دیوارهای آن باغچه‌های صیفی جات ایجاد شده بودند. راهبها غذای خود را در سالن بزرگ غذاخوری صرف می‌کردند. گاهی اوقات راهب‌ها و راهبه‌ها غذای خود را در سکوت کامل صرف می‌کردند و در حین غذا خوردن به آیاتی از کتاب مقدس گوش می‌دادند. آنها هر نوع کاری از جمله نظافت، آشپزی، نجاری، کشاورزی و پرورش زنبور عسل انجام می‌دادند. بعضی از صومعه‌ها به خاطر محصولاتشان مانند دارو و پنیر معروف بودند. را

To train our model we need to convert our inputs (text) to token IDs. This is done by a 🤗 Transformers Tokenizer. If you are not sure what this means check out [chapter 6](https://huggingface.co/course/chapter6/1?fw=tf) of the Hugging Face Course.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)


before we can start training we need to preprocess our data. Abstractive Summarization is a text2text-generation task. This means our model will take a text as input and generate a summary as output. For this we want to understand how long our input and output will be to be able to efficiently batch our data.

In [None]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["question"], truncation=True), batched=True, remove_columns=["question", "title", "answer", "context"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), batched=True, remove_columns=["question", "title", "answer", "context"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Max source length: 35


Map:   0%|          | 0/1100 [00:00<?, ? examples/s]

Max target length: 126


In [None]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["Question(Persian): " + item for item in sample["question"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["question", "title", "answer", "context"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## 3. Fine-tune and evaluate FLAN-T5

After we have processed our dataset, we can start training our model. Therefore we first need to load our [FLAN-T5](https://huggingface.co/models?search=flan-t5) from the Hugging Face Hub. In the example we are using a instance with a NVIDIA V100 meaning that we will fine-tune the `base` version of the model.
_I plan to do a follow-up post on how to fine-tune the `xxl` version of the model using Deepspeed._


In [None]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

We want to evaluate our model during training. The `Trainer` supports evaluation during training by providing a `compute_metrics`.
The most commonly used metrics to evaluate summarization task is [rogue_score](https://en.wikipedia.org/wiki/ROUGE_(metric)) short for Recall-Oriented Understudy for Gisting Evaluation). This metric does not behave like the standard accuracy: it will compare a generated summary against a set of reference summaries

We are going to use `evaluate` library to evaluate the `rogue` score.

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Before we can start training is to create a `DataCollator` that will take care of padding our inputs and labels. We will use the `DataCollatorForSeq2Seq` from the 🤗 Transformers library.

In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


The last step is to define the hyperparameters (`TrainingArguments`) we want to use for our training. We are leveraging the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer` to automatically push our checkpoints, logs and metrics during training into a repository.

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-event-extraction-train-on-3-test-on-1new"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=2,
    # logging & evaluation strategies
    # logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    # hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

We can start our training by using the `train` method of the `Trainer`.

In [None]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.2912,0.323156,0.0,0.0,0.0,0.0,19.0
2,0.2645,0.309138,0.0,0.0,0.0,0.0,19.0
3,0.2614,0.315873,0.0,0.0,0.0,0.0,19.0
4,0.2611,0.324281,0.0,0.0,0.0,0.0,19.0
5,0.2603,0.320544,0.0,0.0,0.0,0.0,19.0
6,0.2561,0.313153,0.0,0.0,0.0,0.0,19.0
7,0.2557,0.312134,0.0,0.0,0.0,0.0,19.0
8,0.2535,0.315113,0.0,0.0,0.0,0.0,19.0


TrainOutput(global_step=504, training_loss=0.2629631700969878, metrics={'train_runtime': 644.1421, 'train_samples_per_second': 12.42, 'train_steps_per_second': 0.782, 'total_flos': 427973345280000.0, 'train_loss': 0.2629631700969878, 'epoch': 8.0})

Nice, we have trained our model. 🎉 Lets run evaluate the best model again on the test set.


In [None]:
trainer.evaluate()

{'eval_loss': 0.3151134252548218,
 'eval_rouge1': 0.0,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.0,
 'eval_rougeLsum': 0.0,
 'eval_gen_len': 19.0,
 'eval_runtime': 4.8268,
 'eval_samples_per_second': 20.718,
 'eval_steps_per_second': 1.45,
 'epoch': 8.0}

In [None]:
# # Save our tokenizer and create model card
# tokenizer.save_pretrained(repository_id)
# trainer.create_model_card()
# # Push the results to the hub
# trainer.push_to_hub()

## Test And Evaluate on Rouge and SBERT

### Rouge Score

In [None]:
from rouge_score import rouge_scorer

def calculate_rouge_score(reference, candidate):
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', ], use_stemmer=True)
  scores = scorer.score(reference, candidate)
  return scores['rougeL']

In [None]:
samples_number = len(dataset['test'])

SUM = 0
for sample in dataset['test']:
  TEXT = "Question(Persian): " + sample['question']
  ground_truth = sample['answer']
  inputs = tokenizer(TEXT, return_tensors="pt").to('cuda')
  outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=512)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  rouge = calculate_rouge_score(ground_truth, prediction)
  SUM += rouge[2] # rougeL fmeasure

  print(TEXT)
  print("prediction: ", prediction)
  print("ground_truth: ", ground_truth)

rouge_avg = SUM/samples_number
print(f"\nRougeL average on test set with {samples_number} samples: {rouge_avg}")

Question: حجیم ترین غده بدن انسان چیست؟
prediction:                                                                                                                                                                                                                                                                 
ground_truth:  جِگَر یا کَبِد


KeyboardInterrupt: 

### SBERT Score

In [None]:
from sentence_transformers import SentenceTransformer, util

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

def calculate_sbert_score(sentences1, sentences2):
    # Compute embedding for both lists
    embeddings1 = sbert_model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(sentences2, convert_to_tensor=True)

    # ompute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    output = torch.tensor([cosine_scores])
    return round(output.item(), 4)

In [None]:
samples_number = len(dataset['test'])

SUM = 0
for sample in dataset['test']:
  TEXT = "Events: " + sample['Sentence']
  ground_truth = sample['Events']
  inputs = tokenizer(TEXT, return_tensors="pt").to('cuda')
  outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=512)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  sbert_score = calculate_sbert_score(ground_truth, prediction)
  SUM += sbert_score

  if sbert_score<0.2: print(f"\n[-] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')
  if sbert_score>0.9: print(f"\n[+] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')

sbert_score_avg = SUM/samples_number
print(f"\n\n\nSBERT Score Cosine Similarity Average on test set with {samples_number} samples: {sbert_score_avg}")


[+] Sentence:Events: B: %um I took them to %uh &Jill’s and they spent two days there and I guess she couldn’t take them and their mom and dad came.  
Context:
 
ground_truth: Event1: B took the kids to Jill's
Event2: The kids spent two days at Jill's
Event3: B guesses Jill couldn't take the kids
Event4: Jill couldn't take the kids
Event5: The kid's mom and dad came
 
prediction: Event1: B took the children to Jill's Event2: The children spent two days at Jill's Event3: B guesses Jill couldn't take the children Event4: The children's mom and dad came Event5: B took the children to Jill's Event6: The children spent two days at Jill's Event7: B guesses Jill couldn't take the children Event8: The children's mom and dad came Event9: The children's mom and dad came Event10: The children's mom and dad came Event11: The children's mom and dad came Event12: The children's mom and dad came Event13: The children's mom and dad came Event14: The children's mom and dad came Event15: The children's 

## 4. Run Inference

In [None]:
TEXT = "Events: A: What kind of car do you have now?"
inputs = tokenizer.encode_plus(TEXT, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Events Extracted with Flan-T5-Base LLM:\n{prediction}")

Events Extracted with Flan-T5-Base LLM:
Event1: A asks A what kind of car A has now Event2: A has now a car


In [None]:
from random import randrange

sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"Sentence: {sample['Sentence']}\nGround truth:\n{sample['Events']}\n---------------")
TEXT = "Events: " + sample['Sentence']
inputs = tokenizer.encode_plus(TEXT, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Events Extracted with Flan-T5-Base LLM:\n{prediction}")
calculate_rouge_score(sample['Events'], prediction)

Sentence: A: Right. When do you sleep then? 
Context:
utterance1: A: What else is going on? You’re busy with work and 
utterance2: B: yeah. I work ten at night to six in the morning. &Bo &Bo’s happy because %uh he don’t have to go back to day care.  

Ground truth:
Event1: A asks B when B sleeps then
Event2: B sleeps

---------------
Events Extracted with Flan-T5-Base LLM:
Event1: A asks B when do B and Bo sleep Event2: B and Bo sleep 


Score(precision=0.6666666666666666, recall=0.9090909090909091, fmeasure=0.7692307692307692)

### Run This Block just for show all results on Test set

In [None]:
for sample in dataset['test']:
  print(f"Sentence: {sample['Sentence']}\nGround truth:\n{sample['Events']}\n")
  TEXT = "Events: " + sample['Sentence']
  inputs = tokenizer.encode_plus(TEXT, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(f"Events Extracted with Flan-T5-Base LLM:\n{prediction}")
  calculate_rouge_score(sample['Events'], prediction)
  print("-"*80)

## Save Pretrained Model

In [None]:
save_directory = "/content/drive/MyDrive/Common Ground Docs/Models/FlanT5_Event_Extraction_3_to_1_2previous"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

### load Model and Test pretrained model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(save_directory)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(save_directory)

In [None]:
pretrained_model.to('cuda')

Rouge

In [None]:
samples_number = len(dataset['test'])

SUM = 0
for sample in dataset['test']:
  TEXT = "Events: " + sample['Sentence']
  ground_truth = sample['Events']
  inputs = tokenizer(TEXT, return_tensors="pt").to('cuda')
  outputs = pretrained_model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=512)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  rouge = calculate_rouge_score(ground_truth, prediction)
  SUM += rouge[2] # rougeL fmeasure

rouge_avg = SUM/samples_number
print(f"\nRougeL average on test set with {samples_number} samples: {rouge_avg}")


RougeL average on test set with 47 samples: 0.5899351297618687


SBERT

In [None]:
samples_number = len(dataset['test'])

SUM = 0
for sample in dataset['test']:
  TEXT = "Events: " + sample['Sentence']
  ground_truth = sample['Events']
  inputs = tokenizer(TEXT, return_tensors="pt").to('cuda')
  outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=512)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  sbert_score = calculate_sbert_score(ground_truth, prediction)
  SUM += sbert_score

  if sbert_score<0.2: print(f"\n[-] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')
  if sbert_score>0.9: print(f"\n[+] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')

sbert_score_avg = SUM/samples_number
print(f"\n\n\nSBERT Score Cosine Similarity Average on test set with {samples_number} samples: {sbert_score_avg}")


[+] Sentence:Events: A: %um but although they solicit, they’re trying to solicit more throughout the, throughout the globe instead of just &Japan  
ground_truth: Event1: Although the company solicits, the company is trying to solicit more throughout the globe instead of just Japan
Event2: The company solicits
Event3: The company is trying to solicit more throughout the globe instead of just Japan
 
prediction: Event1: Although they solicit, they are trying to solicit more throughout the globe instead of just Japan Event2: They solicit more throughout the globe instead of just Japan  
similarity score: 0.914 
------------------------------

[+] Sentence:Events: A: although they’ve had most interest in &Japan.   
ground_truth: Event1: The company have had most interest in Japan
 
prediction: Event1: Although they have had most interest in Japan  
similarity score: 0.9059 
------------------------------

[+] Sentence:Events: A: And I teach probably two classes and then do administrative 