<a href="https://colab.research.google.com/github/Kthom1/ComparingDecodingMethodsWithPEGASUS/blob/main/ComparingDecodingMethodsWithPEGASUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
print(torch.__version__)

In [None]:

# Always connect the drive first, so you can save relevant files for later.
# Mounts Google Drive at /content/gdrive (only works inside Google Colab).
# NOTE(review): the summary files written further down use relative paths, so they
# land in the current working directory, not in the mounted drive, unless moved.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Check your GPU
!nvidia-smi -L

In [None]:
# Set up PyTorch and choose the compute device (a GPU runtime is expected)
import os
import torch

# Synchronous kernel launches make CUDA errors surface with a usable stack trace
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
print(torch.__version__)

# Falls back to the CPU when CUDA is unavailable. If 'cpu' is printed, go to
# Runtime > Change runtime type and select a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Huggingface
# Transformers installation
! pip install --upgrade transformers
! pip install datasets

In [None]:
import transformers
print(transformers.__version__)

In [None]:
from rouge import FilesRouge, Rouge

# Shared ROUGE scorer instance; loop_directory_and_write_scores calls rouge.get_scores on it
rouge = Rouge()

In [None]:
import datasets
import pandas as pd

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# To measure lexical diversity
!pip install lexical-diversity
from lexical_diversity import lex_div as ld

In [None]:
# Load the first 1% of the test split of CNN/DailyMail (config "3.0.0") via `datasets`.
cnn_test_from_huggingface = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")
# Fields of each record that this notebook reads:
# {
#   "article": "string",
#   "highlights": "string"
# }

In [None]:
# Convert to a pandas DataFrame; later cells walk it with head(n).itertuples()
cnn_df = pd.DataFrame(cnn_test_from_huggingface)
# Preview the first rows as a sanity check
cnn_df.head()

In [None]:
# Needed for pegasus
!pip install sentencepiece

In [None]:
# Pegasus Large: load the "google/pegasus-large" checkpoint and move the model to `device`
pegasus_model_large = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large").to(device)
# Matching tokenizer for the same checkpoint
pegasus_tokenizer_large = AutoTokenizer.from_pretrained("google/pegasus-large")

In [None]:
# Pegasus fine-tuned on CNN/DailyMail: load the "google/pegasus-cnn_dailymail" checkpoint
# and move the model to `device`
pegasus_model_cnn_dm = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail").to(device)
# Matching tokenizer for the same checkpoint
pegasus_tokenizer_cnn_dm = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")

In [None]:
def pegasus_text_to_tensor_batch_with_tokenizer(text, relevant_tokenizer):
  """Tokenize `text` into a PyTorch batch placed on the notebook's `device`.

  Args:
    text: a string (or list of strings) to encode.
    relevant_tokenizer: the Hugging Face tokenizer matching the model in use.

  Returns:
    The tokenizer's batch output (callers read its 'input_ids'), with truncation
    enabled, padded to the longest item, and moved to the global `device`.
  """
  # Call the tokenizer directly: `prepare_seq2seq_batch` is deprecated in
  # transformers 4.x, and this notebook installs the latest release.
  batch = relevant_tokenizer(text, truncation=True, padding='longest', return_tensors='pt')
  return batch.to(device)

In [None]:
# Upper bounds on generated summary length, in tokens.
# The SMALL value is what generate_all_summaries forwards as `max_length`.
MAX_NUM_OUTPUT_TOKENS_SMALL = 128
# Not referenced by any other visible cell.
MAX_NUM_OUTPUT_TOKENS_LARGE = 256

In [None]:
def pegasus_text_to_text_with_model_tokenizer(text, relevant_model, relevant_tokenizer, max_num_output_tokens, num_beams=1, num_beam_groups=1, temperature=1.0, top_k=0, top_p=1.0, repetition_penalty=1.0, diversity_penalty=0.0, no_repeat_ngram_size=0, early_stopping=False, do_sample=False):
  """Generate a summary of `text` with the given model/tokenizer and decoding settings.

  Args:
    text: the source document to summarize.
    relevant_model: seq2seq model exposing `generate`.
    relevant_tokenizer: tokenizer used both to encode `text` and decode the output.
    max_num_output_tokens: forwarded to `generate` as `max_length`.
    num_beams, num_beam_groups, temperature, top_k, top_p, repetition_penalty,
    diversity_penalty, no_repeat_ngram_size, early_stopping, do_sample:
      forwarded unchanged to `generate`.

  Returns:
    The first generated sequence decoded to a string, special tokens removed.
  """
  context = pegasus_text_to_tensor_batch_with_tokenizer(text, relevant_tokenizer)

  generated_ids = relevant_model.generate(
      context['input_ids'],
      # Pass the mask explicitly so padded positions are never attended to;
      # `.get` keeps this working if the tokenizer output carries no mask.
      attention_mask=context.get('attention_mask'),
      max_length=max_num_output_tokens,
      num_beams=num_beams,
      num_beam_groups=num_beam_groups,
      temperature=temperature,
      top_k=top_k,
      top_p=top_p,
      repetition_penalty=repetition_penalty,
      diversity_penalty=diversity_penalty,
      no_repeat_ngram_size=no_repeat_ngram_size,
      early_stopping=early_stopping,
      do_sample=do_sample,
  )

  # Only the first sequence is returned; the input parameter is no longer shadowed.
  return relevant_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Label for this dataset; it is embedded in the names of the output files
CNN_DM_DATASET_NAME = 'CNN-DM'
# DataFrame column holding the source text to summarize
CNN_DM_TEXT_IDENTIFIER = 'article'
# DataFrame column holding the reference summary
CNN_DM_SUMMARY_IDENTIFIER = 'highlights'

In [None]:
# Model labels; generate_all_summaries embeds them in the output file names
PEGASUS_LARGE_MODEL_NAME = 'PEGASUS-LARGE'
PEGASUS_CNN_DM_MODEL_NAME = 'PEGASUS-CNN-DM'

In [None]:
def write_actual_summaries_to_file(dataframe, summaryIdentifier, dataset_name, num_articles=1):
  """Write the reference summaries of the first `num_articles` rows to a text file.

  The file is created in the current working directory and is named
  '<dataset_name>_dataset-name_<num_articles>_num-articles_actual_summaries.txt'.
  Each summary is followed by a blank line.

  Args:
    dataframe: pandas DataFrame containing the reference summaries.
    summaryIdentifier: name of the column holding the reference summary.
    dataset_name: label embedded in the output file name.
    num_articles: number of leading rows to export.
  """
  output_path = '{}_dataset-name_{}_num-articles_actual_summaries.txt'.format(dataset_name, num_articles)

  # The context manager guarantees the file is flushed and closed, even on error
  with open(output_path, 'w') as actual_summaries:
    for row in dataframe.head(num_articles).itertuples():
      actual_summaries.write(getattr(row, summaryIdentifier) + '\n\n')

In [None]:
# Export the reference summaries of the first 100 CNN/DailyMail articles
write_actual_summaries_to_file(
    dataframe=cnn_df,
    summaryIdentifier=CNN_DM_SUMMARY_IDENTIFIER,
    dataset_name=CNN_DM_DATASET_NAME,
    num_articles=100,
)

In [None]:
# Beam widths for the two beam-search runs (forwarded to generate() as `num_beams`)
NUM_BEAMS = 8
MORE_BEAMS = 16

In [None]:
# `top_k` values for the two top-k sampling runs
TOP_K = 40
MORE_K = 640
# `temperature` used by the top-k + temperature sampling runs
TEMPERATURE = 0.7

In [None]:
# NUCLEUS SAMPLE: forwarded to generate() as `top_p` by the top-p sampling run
NUCLEUS_SAMPLE_VALUE = 0.95

In [None]:
def generate_all_summaries(dataframe, model, model_name, tokenizer, textIdentifier, dataset_name, num_articles=100):
  """Summarize the first `num_articles` rows with seven decoding setups, one file per setup.

  For every article the setups run in this order: beam search (NUM_BEAMS, MORE_BEAMS),
  top-k sampling (TOP_K, MORE_K), top-k sampling with TEMPERATURE (TOP_K, MORE_K),
  and top-p sampling (NUCLEUS_SAMPLE_VALUE). Every run uses
  MAX_NUM_OUTPUT_TOKENS_SMALL as the maximum output length. Each summary is
  written to its setup's file in the current working directory, followed by a
  blank line.

  Args:
    dataframe: pandas DataFrame holding the articles.
    model: seq2seq model used for generation.
    model_name: label embedded in the output file names.
    tokenizer: tokenizer matching `model`.
    textIdentifier: name of the column holding the article text.
    dataset_name: label embedded in the output file names.
    num_articles: number of leading rows to summarize.
  """
  from contextlib import ExitStack  # local import keeps this cell self-contained

  approach_name = 'abstractive'
  name_fields = (dataset_name, model_name, approach_name, num_articles)

  beam_name = '{}_dataset-name_BEAM-SEARCH_{}_model-name_{}_approach-name_{}_num-articles_{}_num-beams_generated_summaries.txt'
  top_k_name = '{}_dataset-name_TOP-K_{}_model-name_{}_approach-name_{}_num-articles_{}_top-k_{}_do-sample_generated_summaries.txt'
  # The '{}_num-articles_' field was missing from this template, which shifted every
  # later value under the wrong label (e.g. '100_top-k_40_temperature') and dropped
  # the do-sample flag.
  top_k_temperature_name = '{}_dataset-name_TOP-K-TEMPERATURE_{}_model-name_{}_approach-name_{}_num-articles_{}_top-k_{}_temperature_{}_do-sample_generated_summaries.txt'
  top_p_name = '{}_dataset-name_TOP-P_{}_model-name_{}_approach-name_{}_num-articles_{}_top-p_{}_do-sample_generated_summaries.txt'

  # (output file name, generation keyword arguments), in the order run per article
  decoding_runs = [
    (beam_name.format(*name_fields, NUM_BEAMS), {'num_beams': NUM_BEAMS}),
    (beam_name.format(*name_fields, MORE_BEAMS), {'num_beams': MORE_BEAMS}),
    (top_k_name.format(*name_fields, TOP_K, True), {'top_k': TOP_K, 'do_sample': True}),
    (top_k_name.format(*name_fields, MORE_K, True), {'top_k': MORE_K, 'do_sample': True}),
    (top_k_temperature_name.format(*name_fields, TOP_K, TEMPERATURE, True), {'top_k': TOP_K, 'temperature': TEMPERATURE, 'do_sample': True}),
    (top_k_temperature_name.format(*name_fields, MORE_K, TEMPERATURE, True), {'top_k': MORE_K, 'temperature': TEMPERATURE, 'do_sample': True}),
    (top_p_name.format(*name_fields, NUCLEUS_SAMPLE_VALUE, True), {'top_p': NUCLEUS_SAMPLE_VALUE, 'do_sample': True}),
  ]

  # ExitStack closes (and therefore flushes) every output file, even if generation
  # fails part-way through the articles.
  with ExitStack() as open_files:
    writers = [(open_files.enter_context(open(file_name, 'w')), generate_kwargs) for file_name, generate_kwargs in decoding_runs]

    for row in dataframe.head(num_articles).itertuples():
      input_text = getattr(row, textIdentifier)

      for summaries_file, generate_kwargs in writers:
        summary = pegasus_text_to_text_with_model_tokenizer(text=input_text, relevant_model=model, relevant_tokenizer=tokenizer, max_num_output_tokens=MAX_NUM_OUTPUT_TOKENS_SMALL, **generate_kwargs)
        summaries_file.write(summary + '\n\n')

In [None]:
# CNN/DailyMail, every decoding setup, PEGASUS-Large checkpoint
generate_all_summaries(
    cnn_df,
    model=pegasus_model_large,
    model_name=PEGASUS_LARGE_MODEL_NAME,
    tokenizer=pegasus_tokenizer_large,
    textIdentifier=CNN_DM_TEXT_IDENTIFIER,
    dataset_name=CNN_DM_DATASET_NAME,
)

In [None]:
# CNN/DailyMail, every decoding setup, PEGASUS checkpoint fine-tuned on CNN/DailyMail
generate_all_summaries(
    cnn_df,
    model=pegasus_model_cnn_dm,
    model_name=PEGASUS_CNN_DM_MODEL_NAME,
    tokenizer=pegasus_tokenizer_cnn_dm,
    textIdentifier=CNN_DM_TEXT_IDENTIFIER,
    dataset_name=CNN_DM_DATASET_NAME,
)

In [None]:
import os

# Lexical diversity (MTLD) of the reference summaries.
# This is the file write_actual_summaries_to_file produces for CNN-DM with
# num_articles=100; the path is defined here so the cell runs on a fresh kernel.
cnn_actual_summaries_file_path = './CNN-DM_dataset-name_100_num-articles_actual_summaries.txt'

with open(cnn_actual_summaries_file_path, 'r') as cnn_actual_summaries_file:
  cnn_actual_summaries_data = cnn_actual_summaries_file.read()

# The whole file is lemmatized and scored as a single text
flt = ld.flemmatize(cnn_actual_summaries_data)
mtld = ld.mtld(flt)

print("LD - MTLD")

print(mtld)

In [None]:
import os

def loop_directory_and_write_scores(directory, actual_summaries_file_path, dataset_name):
  """Print ROUGE and lexical-diversity scores for every '.txt' file in `directory`.

  Each generated-summaries file is read in full and compared, as one text, with
  the full contents of the reference file. For each file the path, the averaged
  ROUGE scores, MTLD and HDD are printed. Nothing is written to disk.

  Args:
    directory: directory containing the generated-summaries '.txt' files.
    actual_summaries_file_path: path of the reference summaries file.
    dataset_name: label printed once at the top of the report.
  """
  print(dataset_name)

  with open(actual_summaries_file_path, 'r') as actual_summaries_file:
    actual_summaries_data = actual_summaries_file.read()

  # os.scandir yields entries in arbitrary order and holds an OS handle: close it
  # promptly and sort the paths so the report is stable between runs. Only regular
  # files are kept, so a sub-directory named '*.txt' cannot crash open().
  with os.scandir(directory) as entries:
    generated_summaries_file_paths = sorted(
        entry.path for entry in entries
        if entry.is_file() and entry.path.endswith('.txt')
    )

  for generated_summaries_file_path in generated_summaries_file_paths:
    with open(generated_summaries_file_path, 'r') as generated_summaries_file:
      generated_summaries_data = generated_summaries_file.read()

    rouge_scores_avg = rouge.get_scores(generated_summaries_data, actual_summaries_data, avg=True)

    # Lexical diversity is computed on the lemmatized generated text
    flt = ld.flemmatize(generated_summaries_data)
    mtld = ld.mtld(flt)
    hdd = ld.hdd(flt)

    print("PATH")

    print(generated_summaries_file_path)

    print("ROUGE SCORE AVG")

    print(rouge_scores_avg)

    print("LD - MTLD")

    print(mtld)

    print("LD - HDD")

    print(hdd, end='\n\n')

In [None]:
# Score every generated-summaries file against the CNN/DailyMail references.
# The directory below is a placeholder: point it at the folder that holds the
# generated summary files before running.
loop_directory_and_write_scores(
    directory='./DIRECTORY_WITH_ALL_YOUR_GENERATED SUMMARIES',
    actual_summaries_file_path='./CNN-DM_dataset-name_100_num-articles_actual_summaries.txt',
    dataset_name=CNN_DM_DATASET_NAME,
)