In [None]:
!pip install datasets
!pip install nltk


In [None]:
import torch
from datasets import load_dataset
from transformers import pipeline
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.tokenize import sent_tokenize
import nltk

In [None]:
nltk.download('punkt')


In [None]:
import getpass

hf_token = getpass.getpass('get your pass')

# Save the token as a secret named HF_TOKEN, use your hugging face token to run
%env HF_TOKEN=$hf_token

In [None]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pretrained_model = "t5-small"

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

In [None]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model_reference=TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model)

In [None]:
billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2)

In [None]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In this section, we'll have a close look to input_ids, labels,i.e, truncated docs and truncated summaries that we'll use to fine-tune the model

In [None]:
test_doc=tokenized_billsum['test'][0]['input_ids']
print(tokenizer.decode(test_doc))

summarize: The people of the State of California do enact as follows: SECTION 1. Section 836 of the Penal Code is amended to read: 836. (a) A peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5 (commencing with Section 830) of Title 3 of Part 2, without a warrant, may arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to be arrested has committed a public offense in the officer’s presence. (2) The person arrested has committed a felony, although not in the officer’s presence. (3) The officer has probable cause to believe that the person to be arrested has committed a felony, whether or not a felony, in fact, has been committed. (b) Any time a peace officer is called out on a domestic violence call, it shall be mandatory that the officer make a good faith effort to inform the victim of his or her right to make a citizen

In [None]:
sum_doc=tokenized_billsum['test'][0]['labels']
print(tokenizer.decode(sum_doc))

Existing law authorizes a peace officer to arrest a person without a warrant if the officer has probable cause to believe that the person has committed a public offense in the officer’s presence or if the officer has probable cause to believe that the person has committed a felony. This</s>


In this section, we'll try to generate a summary for the truncated text by from the pretrained model t5-small

In [None]:
def summarize_text(text, tokenizer, model):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=80, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def summarize_dataset(dataset, tokenizer, model):
    summaries = []
    for text in dataset:
        summary = summarize_text(text, tokenizer, model)
        summaries.append(summary)
    return summaries

**Attention**

Here, I attempt to compare the BLANC score between the truncated reference summaries and the summaries generated by a pre-trained model to determine if the chunking method truly works. However, due to the limited computational resources on my Colab, I am **unable to perform these tests**

I don't recommend you to launch these 2 cells because it will take a lot of time.

In [None]:
#We aim to retrieve the truncated training dataset in order to test if the chunking method truly works or not
#The result of score and plot will just base on a dataset of 20 truncated summaries from dataset and 20 summaries generated by pretrained_model
truncated_dataset_test_text = []
truncated_dataset_test_summary=[]
for example in tokenized_billsum['test']:
    input_ids = example['input_ids']
    labels = example['labels']
    decoded_text = tokenizer.decode(input_ids)
    decoded_summary = tokenizer.decode(labels)
    truncated_dataset_test_text.append(decoded_text)
    truncated_dataset_test_summary.append(decoded_summary)


In [None]:
pretrained_summaries_dataset_test=summarize_dataset(truncated_dataset_test_text[:20],tokenizer,model)

Summary generated by pretrained model

In [None]:
summary=summarize_text(tokenizer.decode(test_doc),tokenizer,model_reference)

In [None]:
summary

'a peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5. the officer has probable cause to believe that the person to be arrested has committed a felony, although not in the officer’s presence.'

We train the model from here

In [None]:
from transformers import create_optimizer, AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=0.0005, weight_decay_rate=0.01)

In [None]:
train_dataset = tokenized_billsum["train"].to_tf_dataset(
    batch_size=16,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_billsum["test"].to_tf_dataset(
    batch_size=16,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
model.compile(optimizer=optimizer)
model.fit(train_dataset, validation_data=test_dataset, epochs=2)

**Result for 6 epochs:**

1st epoch: loss= 2.71, val_loss=2.26

2nd epoch: loss= 2.29, val_loss=2.13

3rd epoch: loss= 2.078, val_loss=2.08

4th epoch: loss= 1.89, val_loss=2.03

5th epoch: loss= 1.75, val_loss=2.03

6th epoch: loss=1.62, val_loss=2.00

In [None]:
#Summary after 2 first epochs
summary1=summarize_text(tokenizer.decode(test_doc),tokenizer,model)

In [None]:
summary1

'Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant, or, pursuant to the authority granted to him or her by Chapter 4.5, without a warrant, to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to be arrested has committed a public offense'

In [None]:
#Summary after trained 6 epochs in total
summary2=summarize_text(tokenizer.decode(test_doc),tokenizer,model)
summary2

'Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her, without a warrant to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to'

**ADJUST BLANC SCORE CALCULATION**

In this part, we mainly focus on building the adjustment of Blanc score calculation in order to get a reasonable result.

In [None]:
from transformers import BertForMaskedLM, BertTokenizer
model_name = 'bert-base-uncased'
model_BLANC = BertForMaskedLM.from_pretrained(model_name)
tokenizer_BLANC = BertTokenizer.from_pretrained(model_name)

In [None]:
#the the filler that has '.' = length of summary
def initialize_filler(summary):
    filler = '.' * len(summary)
    return filler

In [None]:
#for the filler that has '.'=number of tokens in summary
def initialize_filler(summary):
    tokenized_summary = tokenizer_BLANC(summary)
    num_tokens = len(tokenized_summary['input_ids'])
    filler = '.' * num_tokens
    return filler

In [None]:
def mask_words(i_0, L_min, M, sentence):
    words = sentence.split()
    masked_sentence = []
    masked_tokens = []  # List to store original tokens that were masked
    for i, word in enumerate(words, 1):
        if (i - i_0) % M == 0 and len(word) >= L_min:
            masked_sentence.append('[MASK]')
            masked_tokens.append(word)
        else:
            masked_sentence.append(word)
    masked_sentence = ' '.join(masked_sentence)
    return masked_sentence, masked_tokens

In [None]:
# Example usage:
i_0 = 2
L_min = 4
M = 6
sentence = " A peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5 (commencing with Section 830) of Title 3 of Part 2, without a warrant, may arrest a person warrant whenever any of the following circumstances occur"
masked_sentence,masked_tokens = mask_words(i_0, L_min, M, sentence)
print(masked_sentence)
print(masked_tokens)
print(type(masked_tokens))


A [MASK] officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5 (commencing with Section 830) of [MASK] 3 of Part 2, without a warrant, may arrest a person [MASK] whenever any of the following [MASK] occur
['peace', 'Title', 'warrant', 'circumstances']
<class 'list'>


In [None]:
def no_copy_pair_guard(document, summary):
    doc_sentences = document.split('.')
    summary_sentences = summary.split('.')

    doc_sentences = [sentence.strip() for sentence in doc_sentences if sentence.strip()]
    summary_sentences = [sentence.strip() for sentence in summary_sentences if sentence.strip()]

    # Create a set of unique sentences in the summary
    unique_summary_sentences = set(summary_sentences)

    # Remove sentences from the document if they are identical to any sentence in the summary
    filtered_document = []
    for sentence in doc_sentences:
        if sentence not in unique_summary_sentences:
            filtered_document.append(sentence)

    filtered_document_text = '. '.join(filtered_document)

    return filtered_document_text

In [None]:
document = "This is a document. It contains sentences. Some sentences may repeat.Gotcha"
summary = "This is a summary. It contains sentences. Some sentences may repeat.Gotyou"

filtered_document = no_copy_pair_guard(document, summary)
print(filtered_document)

This is a document. Gotcha


In [None]:
def predict_masked_words(document, model, tokenizer):
    # Tokenize doc
    tokenized_text = tokenizer.tokenize(document)

    predicted_tokens_list = []

    for i, token in enumerate(tokenized_text):
        if token == '[MASK]':
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

            tokens_tensor = torch.tensor([indexed_tokens])

            # Predict masked token
            with torch.no_grad():
                outputs = model(tokens_tensor)
                predictions = outputs[0][0, i].topk(1)  # Take a best prediction

            predicted_tokens = [tokenizer.convert_ids_to_tokens(index.item()) for index in predictions.indices]

            predicted_tokens_list.append(predicted_tokens)

    predicted_tokens_list=[item for sublist in predicted_tokens_list for item in sublist]#flatten the list of list

    return predicted_tokens_list

In [None]:
predict_masked_words("A [MASK] officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5 (commencing with Section 830) of [MASK] 3 of Part 2, without a warrant, may arrest a person [MASK] whenever any of the following [MASK] occur",model_BLANC, tokenizer_BLANC)

['an', 'section', ',', 'shall']

This Part is dedicated to Cosine similarity Experiments

In [None]:
def cos_similarity(token1, token2, model, tokenizer):
    token_ids_1 = tokenizer.encode(token1, add_special_tokens=False, return_tensors='pt')
    token_ids_2 = tokenizer.encode(token2, add_special_tokens=False, return_tensors='pt')

    outputs_1 = model(token_ids_1)
    outputs_2 = model(token_ids_2)

    # Extract embeddings from the output
    embeddings_1 = outputs_1[0][:, 0, :]
    embeddings_2 = outputs_2[0][:, 0, :]

    # Compute dot product of embeddings_1 and embeddings_2
    dot_product = torch.sum(embeddings_1 * embeddings_2)

    # Compute L2 norms of embeddings_1 and embeddings_2
    norm_1 = torch.norm(embeddings_1)
    norm_2 = torch.norm(embeddings_2)

    # Compute cosine similarity
    cosine_similarity = dot_product / (norm_1 * norm_2)

    return cosine_similarity

In [None]:
#TEST
token1 = "man"
token2 = "boy"
similarity = cos_similarity(token1, token2, model_BLANC, tokenizer_BLANC)
print(f"Cosine similarity between '{token1}' and '{token2}': {similarity}")

Cosine similarity between 'man' and 'boy': 0.9007138609886169


In [None]:
#TEST
token1 = "man"
token2 = "man"
similarity = cos_similarity(token1, token2, model_BLANC, tokenizer_BLANC)
print(f"Cosine similarity between '{token1}' and '{token2}': {similarity}")

Cosine similarity between 'man' and 'man': 0.9999997019767761


In [None]:
#TEST
token1 = "cat"
token2 = "dog"
similarity = cos_similarity(token1, token2, model_BLANC, tokenizer_BLANC)
print(f"Cosine similarity between '{token1}' and '{token2}': {similarity}")

Cosine similarity between 'cat' and 'dog': 0.9107443690299988


Build BLANC-help and test it with the summaries generated by the model.

In [None]:
def BLANC_help(summary,text,model,tokenizer,M,L_min):
  filler=initialize_filler(summary)
  S01, S10, S_total = 0, 0, 0

  filtered_document=no_copy_pair_guard(text, summary)#no_copy_pair_guard
  # Original text into Sentences
  sentences = sent_tokenize(filtered_document)
  for sentence in sentences:
    for i0 in range(1, M+1):
      masked_sentence,masked_tokens=mask_words(i0, L_min, M, sentence)
      input_base= filler + "." + masked_sentence
      input_help= summary +"."+ masked_sentence
      prediction_base=predict_masked_words(input_base, model, tokenizer)
      prediction_help=predict_masked_words(input_help, model, tokenizer)
      #convert to set for faster lookup
      prediction_base = set(prediction_base)
      prediction_help = set(prediction_help)
      #find the similar tokens
      common_elements_base = prediction_base.intersection(masked_tokens)
      common_elements_help = prediction_help.intersection(masked_tokens)

      S01+=len(common_elements_help)
      S10+=len(common_elements_base)

      S_total+=len(masked_tokens)
  return (S01-S10)/S_total



Test the BLANC score with various summaries given by pretrain-model/tuned-model

In [None]:
text="summarize: The people of the State of California do enact as follows: SECTION 1. Section 836 of the Penal Code is amended to read: 836. (a) A peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5 (commencing with Section 830) of Title 3 of Part 2, without a warrant, may arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to be arrested has committed a public offense in the officer’s presence. (2) The person arrested has committed a felony, although not in the officer’s presence. (3) The officer has probable cause to believe that the person to be arrested has committed a felony, whether or not a felony, in fact, has been committed. (b) Any time a peace officer is called out on a domestic violence call, it shall be mandatory that the officer make a good faith effort to inform the victim of his or her right to make a citizen’s arrest, unless the peace officer makes an arrest for a violation of paragraph (1) of subdivision (e) of Section 243 or 273.5. This information shall include advising the victim how to safely execute the arrest. (c) (1) When a peace officer is responding to a call alleging a violation of a domestic violence protective or restraining order issued under Section 527.6 of the Code of Civil Procedure, the Family Code, Section 136.2, 646.91, or paragraph (2) of subdivision (a) of Section 1203.097 of this code, Section 213.5 or 15657.03 of the Welfare and Institutions Code, or of a domestic violence protective or restraining order issued by the court of another state, tribe, or territory and the peace officer has probable cause to believe that the person against whom the order is issued has notice of the order and has committed an act in violation of the order, the officer shall, consistent with subdivision (b) of Section 13701, make a lawful arrest of the person without a warrant and take that person into custody whether or not the violation occurred in the presence of the arresting officer. The officer shall, as soon as possible after the arrest, confirm with the appropriate authorities or the Domestic Violence Protection Order Registry maintained pursuant</s>"

Test with filler has number of '.' = number of tokens in summaries, ie= 64





In [None]:
truncated_reference_summary="Existing law authorizes a peace officer to arrest a person without a warrant if the officer has probable cause to believe that the person has committed a public offense in the officer’s presence or if the officer has probable cause to believe that the person has committed a felony. This</s>"
truncated_reference_summary_score=BLANC_help(truncated_reference_summary,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('TRUNCATED_REFERENCE_SUMMARY score = ',truncated_reference_summary_score)

TRUNCATED_REFERENCE_SUMMARY score =  0.1388888888888889


In [None]:
pretrained_summary="a peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5. the officer has probable cause to believe that the person to be arrested has committed a felony, although not in the officer’s presence."
pretrained_summary_score=BLANC_help(pretrained_summary,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('Pretrained_SUMMARY score = ',pretrained_summary_score)

Pretrained_SUMMARY score =  0.16666666666666666


In [None]:
#Test with 2epochs tuned model
trained_summary1="Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant, or, pursuant to the authority granted to him or her by Chapter 4.5, without a warrant, to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to be arrested has committed a public offense"
trained_summary_score1=BLANC_help(trained_summary1,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('2EPOCHS_TRAINED_SUMMARY score = ',trained_summary_score1)

2EPOCHS_TRAINED_SUMMARY score =  0.2175925925925926


In [None]:
trained_summary2="Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her, without a warrant to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to"
trained_summary_score2=BLANC_help(trained_summary2,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('6EPOCHS_TRAINED_SUMMARY score = ',trained_summary_score2)

6EPOCHS_TRAINED_SUMMARY score =  0.1388888888888889


Test the BLANC score with filler that number number of '.' = length of summaries

In [None]:
truncated_reference_summary="Existing law authorizes a peace officer to arrest a person without a warrant if the officer has probable cause to believe that the person has committed a public offense in the officer’s presence or if the officer has probable cause to believe that the person has committed a felony. This</s>"
truncated_reference_summary_score=BLANC_help(truncated_reference_summary,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('TRUNCATED_REFERENCE_SUMMARY score = ',truncated_reference_summary_score)

TRUNCATED_REFERENCE_SUMMARY score =  0.16203703703703703


In [None]:
pretrained_summary="a peace officer may arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her by Chapter 4.5. the officer has probable cause to believe that the person to be arrested has committed a felony, although not in the officer’s presence."
pretrained_summary_score=BLANC_help(pretrained_summary,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('Pretrained_SUMMARY score = ',pretrained_summary_score)

Pretrained_SUMMARY score =  0.14814814814814814


In [None]:
#Test with 2epochs tuned model
trained_summary1="Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant, or, pursuant to the authority granted to him or her by Chapter 4.5, without a warrant, to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to be arrested has committed a public offense"
trained_summary_score1=BLANC_help(trained_summary1,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('2EPOCHS_TRAINED_SUMMARY score = ',trained_summary_score1)

2EPOCHS_TRAINED_SUMMARY score =  0.2175925925925926


In [None]:
trained_summary2="Existing law authorizes a peace officer to arrest a person in obedience to a warrant, warrant or, pursuant to the authority granted to him or her, without a warrant to arrest a person warrant whenever any of the following circumstances occur: (1) The officer has probable cause to believe that the person to"
trained_summary_score2=BLANC_help(trained_summary2,text,model_BLANC, tokenizer_BLANC,M=6,L_min=4)
print('6EPOCHS_TRAINED_SUMMARY score = ',trained_summary_score2)

6EPOCHS_TRAINED_SUMMARY score =  0.18981481481481483
