In [5]:
#!pip install transformers
#!pip install datasets
#!pip install rouge_score
#!pip install nltk
#!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [1]:
import torch
import numpy as np
import pandas as pd
import datasets
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    AutoModel,
    RobertaModel, 
    RobertaTokenizer
)
from tabulate import tabulate
import model
import nltk
from datetime import datetime
from datasets import Dataset
import math
import warnings
warnings.filterwarnings("ignore")
import time
from data_utils import to_cuda, collate_mp, ReRankingDataset
from torch.utils.data import DataLoader
from compare_mt.rouge.rouge_scorer import RougeScorer
import time
device = 'cpu'

2022-12-10 16:13:17.397468: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [3]:
torch.cuda.is_available()

False

In [4]:
# load and preprocess dataset
# change to args parser if use py file
class my_args:
    def __init__(self,generator_name, pt_scorer_name, csv_data_name, decoder_max_len, num_cands, sum_max_len,cand_gen_batch):
        self.generator_name = generator_name
        self.csv_data_name = csv_data_name
        self.decoder_max_len = decoder_max_len # default to 30 since title are short
        self.num_cands = num_cands
        self.sum_max_len = sum_max_len
        self.pt_scorer_name = pt_scorer_name
args = my_args('tuned_t5_model', 'cache/scorer_5.bin', 'clean_covid.csv',30, 8, 30, 100)

In [5]:
# load model
model_name = args.generator_name
generator = AutoModelForSeq2SeqLM.from_pretrained(model_name)
generator_tokenizer = AutoTokenizer.from_pretrained(model_name)

encoder_max_length = 512 # default to 512
decoder_max_length = args.decoder_max_len

# read_dataset preprocess data
csv_data_name = pd.read_csv(args.csv_data_name)
data = csv_data_name[['abstract','title']].dropna()
dataset = Dataset.from_pandas(data)
train_data_txt, remain_data_txt = dataset.train_test_split(test_size=0.2).values()
val_data_txt, test_data_txt = remain_data_txt.train_test_split(test_size=0.5).values()
train_data_txt = train_data_txt.shuffle(seed = 2333).select(range(int(len(train_data_txt)/10)))
val_data_txt = val_data_txt.shuffle(seed = 2333).select(range(int(len(val_data_txt)/10)))
test_data_txt = test_data_txt.shuffle(seed = 2333).select(range(int(len(test_data_txt)/10)))
# tokenize data
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["abstract"], batch["title"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == generator_tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

test_data = test_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, generator_tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=test_data_txt.column_names,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

In [57]:
test_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8148
})

In [58]:
generator.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [59]:
def to_tensor(ids):
    return torch.tensor(ids, dtype=torch.long).to(device)
def show_a_piece_of_data(generator,generator_tokenizer,sample):
    test_article = sample['abstract']
    reference = sample['title']
    num_return_seqs=16
    input_ids = torch.tensor(generator_tokenizer(test_article)['input_ids'], dtype=torch.long).to(device)
    train_output_ids = generator.generate(input_ids,num_beams = num_return_seqs,
                                     #no_repeat_ngram_size=2,
                                     diversity_penalty=1.0,
                                     max_length = 20,
                                     num_beam_groups = num_return_seqs,
                                     num_return_sequences=num_return_seqs)
    cands = generator_tokenizer.batch_decode(train_output_ids, skip_special_tokens=True)
    print(cands)
    print('ref'+'-'*20)
    print(reference)
    print('doc'+'-'*20)
    print(test_article)

In [20]:
show_a_piece_of_data(generator,generator_tokenizer,test_data_txt[0:1])

['the university of british columbia undergraduate program mdup students and', 'impact of the covid pandemic on medical education the university of british col', 'collaboration between university of british columbia undergraduate students and faculty during the co', 'a joint response to covid disruptions in medical education', 'the covid pandemic and the medical student response team in the university of brit', 'university of british columbia undergraduate program students and faculty a joint response', 'a joint response to covid disruptions in medical education a collaboration between the university of', 'impact of the covid pandemic on medical education', 'covid and the university of british columbia undergraduate program a joint', 'covid and the university of british columbia undergraduate program', 'covid and medical student response team a joint response to the pandemic', 'a covid medical student response team a nimble organizational structure for the university', 'covid and medic

In [81]:
# load scorer_tokenizer
scorer_name = 'roberta-base'
pt_model_name = 'saved_models/scorer_10.bin'
scorer_tokenizer = RobertaTokenizer.from_pretrained(scorer_name)
scorer = model.ReRanker(scorer_name, scorer_tokenizer.pad_token_id)
scorer.load_state_dict(torch.load(pt_model_name,map_location=torch.device(device)))
scorer.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ReRanker(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [91]:
def evaluate_without_SimCLS(generator, test_data_txt, cand_num, show_results):
    rouge_scorer = RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)
    doc_txt = test_data_txt['abstract']
    doc_ids = generator_tokenizer.batch_encode_plus(doc_txt, max_length = 512, pad_to_max_length=True)['input_ids']
    doc_ids = to_tensor(doc_ids)
    doc_input_mask = doc_ids != scorer_tokenizer.pad_token_id
    doc_out = scorer.encoder(doc_ids, attention_mask=doc_input_mask)['last_hidden_state']
    doc_emb = torch.mean(doc_out,dim = 1) # average over all word embeddings
    
    ref_txt = test_data_txt['title']
    ref_ids = generator_tokenizer.batch_encode_plus(ref_txt, max_length = 512, pad_to_max_length=True)['input_ids']
    ref_ids = to_tensor(ref_ids)
    ref_input_mask = ref_ids != scorer_tokenizer.pad_token_id
    ref_out = scorer.encoder(ref_ids, attention_mask=ref_input_mask)['last_hidden_state']
    ref_emb = torch.mean(ref_out,dim = 1) 
    
    cand_id = generator.generate(doc_ids,num_beams = 16,
                                     #no_repeat_ngram_size=2,
                                     diversity_penalty=1.0,
                                     max_length = 20,
                                     num_beam_groups = cand_num,
                                     num_return_sequences = 1)
    cands_txt = generator_tokenizer.batch_decode(cand_id, skip_special_tokens=True)
    candidate_id = cand_id.view(-1, cand_id.size(-1))
    cand_input_mask = candidate_id != scorer_tokenizer.pad_token_id
    cand_out = scorer.encoder(candidate_id, attention_mask=cand_input_mask)['last_hidden_state'] 
    candidate_embs = torch.mean(cand_out,dim = 1)
    
    cand_similarity_score = torch.cosine_similarity(candidate_embs, doc_emb, dim=-1).item()
    ref_similarity_score = torch.cosine_similarity(ref_emb, doc_emb, dim=-1).item()
    
    cands_rouge_scores = rouge_scorer.score(cands_txt[0],ref_txt[0])
    rouge1_scores = cands_rouge_scores['rouge1'].fmeasure
    rouge2_scores = cands_rouge_scores['rouge2'].fmeasure
    rougeL_scores = cands_rouge_scores['rougeLsum'].fmeasure
    
    if show_results:
        print('doc'+'-'*50)
        print(doc_txt)
        print('ref'+'-'*50)
        print(ref_txt)
        print('cand'+'-'*49)
        print(cands_txt)
        print('scores:'+'-'*50)
        print(f'rouge1: {rouge1_scores}, rouge2: {rouge2_scores}, rougeL: {rougeL_scores}')
        print(f'cand similarity: {cand_similarity_score}, ref similarity: {ref_similarity_score}')
    regular_scores = {'rouge1': rouge1_scores, 
                    'rouge2': rouge2_scores,
                    'rougeL': rougeL_scores,
                    'similar': cand_similarity_score,
                    'ref_similar':ref_similarity_score}
    
    return regular_scores,cands_txt

def evaluate_SimCLS(generator, generator_tokenizer, scorer, scorer_tokenizer, 
                    test_data_txt, cand_num, show_piece_of_data):
    # generate batch data
    rouge_scorer = RougeScorer(['rouge1', 'rouge2', 'rougeLsum'], use_stemmer=True)
    # 1, encode doc
    doc_txt = test_data_txt['abstract']
    doc_ids = generator_tokenizer.batch_encode_plus(doc_txt, max_length = 512, pad_to_max_length=True)['input_ids']
    doc_ids = to_tensor(doc_ids)
    # 2, encode true sum
    ref_txt = test_data_txt['title']
    ref_ids = generator_tokenizer.batch_encode_plus(ref_txt, max_length = 512, pad_to_max_length=True)['input_ids']
    ref_ids = to_tensor(ref_ids)
    # 3, generate cands
    cands_ids = generator.generate(doc_ids,num_beams = cand_num,
                                     #no_repeat_ngram_size=2,
                                     diversity_penalty=1.0,
                                     max_length = 20,
                                     num_beam_groups = cand_num,
                                     num_return_sequences = cand_num)
    cands_txt = generator_tokenizer.batch_decode(cands_ids, skip_special_tokens=True)
    # 4, get sentence embeddings
        # doc emb
    doc_input_mask = doc_ids != scorer_tokenizer.pad_token_id
    doc_out = scorer.encoder(doc_ids, attention_mask=doc_input_mask)['last_hidden_state']
    doc_emb = torch.mean(doc_out,dim = 1) # average over all word embeddings
        # cands emb
    candidate_id = cands_ids.view(-1, cands_ids.size(-1))
    cand_input_mask = candidate_id != scorer_tokenizer.pad_token_id
    cand_out = scorer.encoder(candidate_id, attention_mask=cand_input_mask)['last_hidden_state'] 
    candidate_embs = torch.mean(cand_out,dim = 1)
        # ref emb
    ref_input_mask = ref_ids != scorer_tokenizer.pad_token_id
    ref_out = scorer.encoder(ref_ids, attention_mask=ref_input_mask)['last_hidden_state']
    ref_emb = torch.mean(ref_out,dim = 1) 
    
    similarity_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    for i in range(cand_num):
        score = torch.cosine_similarity(candidate_embs[i], doc_emb, dim=-1).item()
        similarity_scores.append(score)
        cands_rouge_scores = rouge_scorer.score(cands_txt[i],ref_txt[0])
        rouge1_scores.append(cands_rouge_scores['rouge1'].fmeasure)
        rouge2_scores.append(cands_rouge_scores['rouge2'].fmeasure)
        rougeL_scores.append(cands_rouge_scores['rougeLsum'].fmeasure)
        
    ref_similarity_score = torch.cosine_similarity(ref_emb, doc_emb, dim=-1).item()

    if show_piece_of_data:
        print('-'*50)
        show_a_piece_of_data(generator, generator_tokenizer, test_data_txt)
    max_index = similarity_scores.index(max(similarity_scores))
    top1_scores = {'rouge1': rouge1_scores[max_index], 
                    'rouge2': rouge2_scores[max_index],
                    'rougeL': rougeL_scores[max_index],
                    'similar': similarity_scores[max_index],
                    'ref_similar': ref_similarity_score}
    
    return top1_scores, cands_txt, max_index

In [108]:
# evaluate loop 
rouge1_noSimCLS = []
rouge2_noSimCLS = []
rougeL_noSimCLS = []

rouge1_SimCLS = []
rouge2_SimCLS = []
rougeL_SimCLS = []

references = []
regular_cand_pred = []
SimCLS_cand_pred = []

num = 112
for i in range(num,num+1):
    time_start = time.time()
    
    regular_scores,cand_txt = evaluate_without_SimCLS(
        generator, test_data_txt[i:i+1], 16, False)
    
    rouge1_noSimCLS.append(regular_scores['rouge1'])
    rouge2_noSimCLS.append(regular_scores['rouge2'])
    rougeL_noSimCLS.append(regular_scores['rougeL'])
    
    
    SimCLS_sores, cands_txt, top1_index = evaluate_SimCLS(generator, generator_tokenizer, scorer, 
                    scorer_tokenizer, test_data_txt[i:i+1], 16, False)
    
    rouge1_SimCLS.append(SimCLS_sores['rouge1'])
    rouge2_SimCLS.append(SimCLS_sores['rouge2'])
    rougeL_SimCLS.append(SimCLS_sores['rougeL'])
    
    references.append(test_data_txt[i]['title'])
    regular_cand_pred.append(cand_txt[0])
    SimCLS_cand_pred.append(cands_txt[top1_index])
    time_end = time.time()
    
    time_used = time_end - time_start
    print(f'current working sample: {i+1}, time used last sample: {round(time_used,4)}', end = '\r')
    
print('doc'+ '-'*50)
print(test_data_txt[num]['abstract'])
print('ref'+ '-'*50)
print(f'{references[0]}')
print('no SimCLS' + '-'*50)
print(f'{regular_cand_pred[0]} \n rouge:{rouge1_noSimCLS[0], rouge2_noSimCLS[0],rougeL_noSimCLS[0]}')
print('with SimCLS' + '-'*50)
print(f'{SimCLS_cand_pred[0]} \n rouge:{rouge1_SimCLS[0], rouge2_SimCLS[0],rougeL_SimCLS[0]}')

current working sample: 113, time used last sample: 6.1196doc--------------------------------------------------
health care providers have an ethical obligation to reduce suffering during a patient's end of life eol but few receive formal education on eol care principles the objective of this project was to determine the feasibility and potential benefits of an education initiative in which the principles of eol care were taught to senior level nursing students and practicing nurses to assess feasibility data regarding recruitment rates retention rates and implementation issues were collected workshop effectiveness was evaluated through use of the end of life nursing education consortium knowledge assessment test survey which evaluates knowledge levels regarding eol care principles a mixed effects linear model was used to test for changes from the preworkshop to postworkshop scores demographic information and satisfaction data were also collected nineteen students and nurses participa

In [39]:
print(f'Before SimCLS ROUGE1: {np.mean(rouge1_noSimCLS)}, ROUGE2: {np.mean(rouge2_noSimCLS)}, ROUGEL: {np.mean(rougeL_noSimCLS)}')
print(f'After SimCLS ROUGE1: {np.mean(rouge1_SimCLS)}, ROUGE2: {np.mean(rouge2_SimCLS)}, ROUGEL: {np.mean(rougeL_SimCLS)}')

Before SimCLS ROUGE1: 0.4528457018408491, ROUGE2: 0.240739539283891, ROUGEL: 0.3880134130034644
After SimCLS ROUGE1: 0.4063123656705362, ROUGE2: 0.19135385057119345, ROUGEL: 0.33850068587817395


In [51]:
table.rename({0:'Rouge1', 1:'Rouge2', 2:'RougeL'})

Unnamed: 0,Before SimCLS,After SimCLS
Rouge1,0.452846,0.406312
Rouge2,0.24074,0.191354
RougeL,0.388013,0.338501
