In [None]:
! pip install transformers datasets rouge-score sentence_transformers

In [None]:
# Optional Hugging Face Hub login (left disabled).
# from huggingface_hub import notebook_login

# notebook_login()

# NOTE(review): an access token was pasted here in plain text and has been
# redacted. If it was a live credential, revoke it; supply credentials at
# runtime (e.g. through the interactive notebook_login() prompt) instead.

## What is SBERT Score?

https://www.sbert.net/

SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. The initial work is described in our paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.

You can use this framework to compute sentence / text embeddings for more than 100 languages. These embeddings can then be compared, e.g. with cosine similarity, to find sentences with a similar meaning. This can be useful for semantic textual similarity, semantic search, or paraphrase mining.

The framework is based on PyTorch and Transformers and offers a large collection of pre-trained models tuned for various tasks. Further, it is easy to fine-tune your own models.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the 'all-MiniLM-L6-v2' checkpoint once; later cells reuse this
# module-level `sbert_model` whenever they need to encode text.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
# Example inputs to embed
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# A single encode() call handles the whole list
embeddings = sbert_model.encode(sentences)

# Show every sentence next to the embedding found at the same position
for position, text in enumerate(sentences):
    print("Sentence:", text)
    print("Embedding:", embeddings[position])
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173571e-02 -4.28515747e-02 -1.56286322e-02  1.40537573e-02
  3.95537876e-02  1.21796302e-01  2.94333398e-02 -3.17524038e-02
  3.54959406e-02 -7.93140233e-02  1.75878070e-02 -4.04369384e-02
  4.97259684e-02  2.54912674e-02 -7.18699992e-02  8.14968422e-02
  1.47072889e-03  4.79627214e-02 -4.50336076e-02 -9.92174670e-02
 -2.81769335e-02  6.45046011e-02  4.44670618e-02 -4.76217307e-02
 -3.52952592e-02  4.38671708e-02 -5.28565943e-02  4.33019857e-04
  1.01921454e-01  1.64072476e-02  3.26996669e-02 -3.45986858e-02
  1.21339522e-02  7.94871226e-02  4.58342955e-03  1.57778468e-02
 -9.68206488e-03  2.87626423e-02 -5.05806543e-02 -1.55793941e-02
 -2.87907012e-02 -9.62280575e-03  3.15556526e-02  2.27349252e-02
  8.71449187e-02 -3.85027975e-02 -8.84718895e-02 -8.75499751e-03
 -2.12343074e-02  2.08924208e-02 -9.02078152e-02 -5.25732376e-02
 -1.05638430e-02  2.88311411e-02 -1.61455106e-02  6.17842469e-03
 -1.23234

### Usage

https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [4]:
# Two parallel lists: sentences1[k] is compared against sentences2[k]
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Encode each list into a tensor of embeddings
embeddings1 = sbert_model.encode(sentences1, convert_to_tensor=True)
embeddings2 = sbert_model.encode(sentences2, convert_to_tensor=True)

# Cosine similarities between the two sets of embeddings
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Report only the same-index pairs, i.e. the [k][k] entries
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for k, (left, right) in enumerate(zip(sentences1, sentences2)):
    print(row_template.format(left, right, cosine_scores[k][k]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [5]:
from sentence_transformers import SentenceTransformer, util
import torch

def calculate_sbert_score(sentences1, sentences2):
    """Return the SBERT cosine similarity of two texts, rounded to 4 places.

    Both inputs are encoded with the module-level ``sbert_model`` and compared
    with ``util.cos_sim``.

    Args:
        sentences1: First text to compare (callers in this notebook pass a
            single string).
        sentences2: Second text to compare, in the same form as ``sentences1``.

    Returns:
        float: The cosine similarity rounded to four decimal places.

    Note:
        The similarity result must hold exactly one element because it is
        converted with ``.item()``; multi-sentence lists are expected to fail
        at that call rather than return a matrix.
    """
    # Compute embeddings for both inputs
    embeddings1 = sbert_model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(sentences2, convert_to_tensor=True)

    # Compute the cosine similarity. The result is already a tensor, so the
    # scalar is read from it directly instead of re-wrapping it in a new tensor.
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    return round(cosine_scores.item(), 4)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

In [7]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used in
# later cells live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pickle

# Load the pickled dataset from Drive. The context manager closes the file
# even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("/content/drive/MyDrive/Corpus/CG_Corpus/event_extraction_3to1.dat", "rb") as f:
    dataset = pickle.load(f)

In [9]:
# Display the loaded object to check its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Events'],
        num_rows: 415
    })
    test: Dataset({
        features: ['Sentence', 'Events'],
        num_rows: 146
    })
})

## Load Pretrained Model

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory on the mounted Drive holding the saved checkpoint (presumably the
# fine-tuned FLAN-T5 event-extraction model, going by the folder name).
save_directory = "/content/drive/MyDrive/Common Ground Docs/Models/FlanT5_Event_Extraction_3_to_1"
# Tokenizer and seq2seq model are both restored from that same directory.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(save_directory)

In [None]:
# Move the model to the GPU. The evaluation cells also send their tokenized
# inputs to 'cuda', so this notebook needs a CUDA runtime.
pretrained_model.to('cuda')

### Rouge Average

In [12]:
from rouge_score import rouge_scorer

def calculate_rouge_score(reference, candidate):
  """Score ``candidate`` against ``reference`` and return the ROUGE-L result.

  Args:
    reference: Ground-truth text.
    candidate: Generated text to evaluate.

  Returns:
    The ``rougeL`` entry of the scorer's result; the averaging cell reads its
    element at index 2 as the F-measure.

  NOTE(review): a later cell redefines this function name to return the whole
  score mapping instead, so re-running the ROUGE averaging cell after that
  later cell would no longer work as written.
  """
  # Only rougeL is returned, so request just that metric rather than also
  # computing rouge1 and rouge2 and discarding them.
  scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
  return scorer.score(reference, candidate)['rougeL']

In [13]:
# Average ROUGE-L F-measure of the model's generations over the test split.
test_split = dataset['test']
samples_number = len(test_split)

rouge_total = 0
for sample in test_split:
  # Prompt = "Events: " prefix + the raw sentence; reference = annotated events
  prompt = "Events: " + sample['Sentence']
  reference = sample['Events']

  encoded = tokenizer(prompt, return_tensors="pt").to('cuda')
  generated = pretrained_model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      max_length=512,
  )
  decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

  rougeL_score = calculate_rouge_score(reference, decoded)
  rouge_total += rougeL_score[2]  # rougeL fmeasure

rouge_avg = rouge_total / samples_number
print(f"\nRougeL average on test set with {samples_number} samples: {rouge_avg}")


RougeL average on test set with 146 samples: 0.487449564110134


### SBERT Average

In [14]:
from sentence_transformers import SentenceTransformer, util
import torch

def calculate_sbert_score(sentences1, sentences2):
    """Return the SBERT cosine similarity of two texts, rounded to 4 places.

    This repeats the ``calculate_sbert_score`` definition from an earlier cell
    so that this section can be run on its own. Both inputs are encoded with
    the module-level ``sbert_model`` and compared with ``util.cos_sim``.

    Args:
        sentences1: First text to compare (callers in this notebook pass a
            single string).
        sentences2: Second text to compare, in the same form as ``sentences1``.

    Returns:
        float: The cosine similarity rounded to four decimal places.

    Note:
        The similarity result must hold exactly one element because it is
        converted with ``.item()``; multi-sentence lists are expected to fail
        at that call rather than return a matrix.
    """
    # Compute embeddings for both inputs
    embeddings1 = sbert_model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(sentences2, convert_to_tensor=True)

    # Compute the cosine similarity. The result is already a tensor, so the
    # scalar is read from it directly instead of re-wrapping it in a new tensor.
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    return round(cosine_scores.item(), 4)

In [15]:
# Average SBERT cosine similarity over the test split. Samples scoring below
# 0.2 or above 0.9 are printed so they can be inspected by hand.
test_split = dataset['test']
samples_number = len(test_split)

score_total = 0
for sample in test_split:
  TEXT = "Events: " + sample['Sentence']
  ground_truth = sample['Events']

  encoded = tokenizer(TEXT, return_tensors="pt").to('cuda')
  generated = pretrained_model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      max_length=512,
  )
  prediction = tokenizer.decode(generated[0], skip_special_tokens=True)

  sbert_score = calculate_sbert_score(ground_truth, prediction)
  score_total += sbert_score

  if sbert_score<0.2:
    # Very low similarity
    print(f"\n[-] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')
  elif sbert_score>0.9:
    # Very high similarity
    print(f"\n[+] Sentence:{TEXT} \nground_truth: {ground_truth} \nprediction: {prediction} \nsimilarity score: {sbert_score}", '\n------------------------------')

sbert_score_avg = score_total / samples_number
print(f"\n\n\nSBERT Score Cosine Similarity Average on test set with {samples_number} samples: {sbert_score_avg}")


[+] Sentence:Events: B: yeah.   
ground_truth: Event1: B says yeah.  
 
prediction: Event1: B says yeah.  
similarity score: 1.0 
------------------------------

[+] Sentence:Events: A: Darn it. I thought I was going to get to see everybody.   
ground_truth: Event1: A thought B was going to get to see everybody
Event2: B was going to get to see everybody
Event3: B got to see everybody
 
prediction: Event1: A thought B was going to get to see everybody Event2: B was going to get to see everybody  
similarity score: 0.9717 
------------------------------

[+] Sentence:Events: A: What kind of car do you have?  
ground_truth: Event1: A asks B what kind of car B has
Event2: B has a car
 
prediction: Event1: A asks B what kind of car do A have Event2: A has a car  
similarity score: 0.9801 
------------------------------

[+] Sentence:Events: B: Same car.   
ground_truth: Event1: B has the same car
 
prediction: Event1: B says Same car  
similarity score: 0.9378 
---------------------------

## Create CSV File for Event Extraction Model

In [17]:
# Column name -> list of per-sample values. Every column starts as its own
# empty list and is filled by the evaluation loop before the results table
# is built.
events_dict = {
    column: []
    for column in (
        'Input Sentence',
        'Ground Truth Events',
        'FlanT5 Generated Events',
        'SBERT Similarity Score',
        'rouge1',
        'rouge2',
        'rougeL',
        'rougeLsum',
        'Difflib Similarity Score',
    )
}

In [18]:
from rouge_score import rouge_scorer

def calculate_rouge_score(reference, candidate):
  """Score ``candidate`` against ``reference`` on four ROUGE variants.

  Args:
    reference: Ground-truth text.
    candidate: Generated text to evaluate.

  Returns:
    The scorer's full result, keyed by 'rouge1', 'rouge2', 'rougeL' and
    'rougeLsum'.
  """
  metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
  return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, candidate)

In [19]:
import difflib

def cal_similarity_ratio(target_text, memory):
  """Return difflib's SequenceMatcher similarity ratio of the two inputs.

  Args:
    target_text: First sequence (a string in this notebook).
    memory: Second sequence to compare against ``target_text``.

  Returns:
    float: The value of ``SequenceMatcher.ratio()`` for the pair.
  """
  matcher = difflib.SequenceMatcher(a=target_text, b=memory)
  return matcher.ratio()

In [20]:
# Start from empty columns so that re-running this cell does not append a
# second copy of every row to events_dict.
for column_values in events_dict.values():
  column_values.clear()

# Generate events for each test sample and record every similarity metric.
for sample in dataset['test']:
  TEXT = "Events: " + sample['Sentence']
  ground_truth = sample['Events']
  inputs = tokenizer(TEXT, return_tensors="pt").to('cuda')
  outputs = pretrained_model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=512)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  sbert_score = calculate_sbert_score(ground_truth, prediction)

  # Score the pair once; every ROUGE variant is read from this single result.
  # Index 2 is the F-measure, matching the ROUGE averaging cell.
  rouge_scores = calculate_rouge_score(ground_truth, prediction)
  rouge1 = rouge_scores['rouge1'][2]
  rouge2 = rouge_scores['rouge2'][2]
  rougeL = rouge_scores['rougeL'][2]
  rougeLsum = rouge_scores['rougeLsum'][2]
  difflib_score = cal_similarity_ratio(ground_truth, prediction)

  events_dict['Input Sentence'].append(TEXT)
  events_dict['Ground Truth Events'].append(ground_truth)
  events_dict['FlanT5 Generated Events'].append(prediction)
  events_dict['SBERT Similarity Score'].append(sbert_score)
  events_dict['rouge1'].append(rouge1)
  events_dict['rouge2'].append(rouge2)
  events_dict['rougeL'].append(rougeL)
  events_dict['rougeLsum'].append(rougeLsum)
  events_dict['Difflib Similarity Score'].append(difflib_score)

In [21]:
# Build the results table: one column per events_dict key, one row per sample.
events_df = pd.DataFrame(events_dict)

In [22]:
# Display the results table for a visual check before it is exported.
events_df

Unnamed: 0,Input Sentence,Ground Truth Events,FlanT5 Generated Events,SBERT Similarity Score,rouge1,rouge2,rougeL,rougeLsum,Difflib Similarity Score
0,Events: B: %um I took them to %uh &Jill’s and ...,Event1: B took the kids to Jill's\nEvent2: The...,Event1: B took the baby and the babysitter to ...,0.8440,0.515464,0.336842,0.515464,0.494845,0.371257
1,Events: A: You mean at te- in &Texas?,Event1: A asks B if B took the kids to Texas\n...,Event1: A asks B if B means at Te- in Texas Ev...,0.7602,0.555556,0.411765,0.555556,0.555556,0.675325
2,Events: B: yeah.,Event1: B says yeah. \n,Event1: B says yeah.,1.0000,1.000000,1.000000,1.000000,1.000000,0.954545
3,Events: B: And their mom and dad drove down th...,Event1: The kids' mom and dad drove down to Te...,Event1: B's friends's mom and dad drove down t...,0.7047,0.622222,0.418605,0.577778,0.577778,0.719626
4,Events: A: So how are they getting back? Drivi...,Event1: A asks B how are the kids getting back...,Event1: A asks B how the nurses are getting ba...,0.6928,0.622222,0.372093,0.577778,0.622222,0.368852
...,...,...,...,...,...,...,...,...,...
141,Events: A: Dad [distortion] You know what I sa...,Event1: A asks B if B knows what A said to A a...,Event1: A said to dad when A's dad met A at th...,0.8231,0.515837,0.310502,0.461538,0.470588,0.085202
142,Events: B: What?,Event1: B doesn't know what A said to A and B'...,Event1: B asks A what is the baby's name Event...,0.6105,0.317460,0.032787,0.222222,0.222222,0.308300
143,"Events: A: I said I hope that my, my sons neve...",Event1: A said A hopes that A's sons never tre...,"Event1: A said A hopes that A's, her sons neve...",0.8681,0.516854,0.436782,0.516854,0.494382,0.595420
144,Events: B: True. I don’t think my kids will be...,Event1: B doesn't think B's kids will be like ...,Event1: B doesn't think B's kids will be that ...,0.8470,0.454545,0.428571,0.454545,0.454545,0.541176


In [23]:
# Write the results table to a CSV at a relative path (the current working
# directory). to_csv is called with its defaults, so the row index is written
# as well — presumably as a leading unnamed column; pass index=False if that
# column is not wanted.
events_df.to_csv('Event_Extraction_Results_3_to_1.csv')