In [None]:
!pip install torch
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install rouge_score
!pip install --upgrade bert-extractive-summarizer

In [None]:
import torch, json
from summarizer import TransformerSummarizer, Summarizer
from datasets import load_dataset, load_metric
from transformers import *
# from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline

In [None]:
# Attach Google Drive to the Colab filesystem; the fine-tuned checkpoint is read
# from, and the final scores are written to, paths under this mount point.
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

In [None]:
import pandas as pd
# Load the paragraph / reference-simplification pairs used for evaluation.
df=pd.read_csv("/content/data/summarizationdataset.csv")
# Keep only the two columns that are used and drop incomplete rows.
# reset_index(drop=True) renumbers the surviving rows 0..len(df)-1; without it,
# dropna() leaves gaps in the index and positional-looking lookups such as
# df['Paragraph'][i] for i in range(len(df)) hit missing labels.
df=df[['Paragraph', 'Simplification']].dropna().reset_index(drop=True)
print("Data size:", len(df))

In [None]:
# ROUGE scorer; its compute() is called once per model with the collected
# predictions and the reference simplifications.
# NOTE(review): load_metric appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm the installed
# version still provides it.
metric = load_metric("rouge")

In [None]:
def get_t5_summary(model, tokenizer, paragraph):
  """Summarize ``paragraph`` with a T5 model and return the summary text.

  Args:
    model: object exposing ``generate(input_ids, ...)``.
    tokenizer: object exposing ``encode(...)`` and ``decode(...)``.
    paragraph: the text to summarize; a string, or an iterable of string
      pieces that are concatenated without a separator.

  Returns:
    The decoded summary as a plain string, without special tokens.
  """
  text = "".join(paragraph)
  # T5 selects its task from a textual prefix on the input.
  prompt = "summarize: " + text
  # Inputs longer than 512 tokens are truncated rather than rejected.
  tokens_input = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
  # NOTE(review): length_penalty is a beam-search parameter; with the default
  # decoding settings it may have no effect — confirm.
  summary_ids = model.generate(tokens_input, min_length=60, max_length=180, length_penalty=4.0)
  # Strip markers such as <pad> and </s>: they are not part of the summary and
  # would otherwise be scored by ROUGE as ordinary tokens.
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
def get_pegasus_summary(model, tokenizer, src_text):
  """Summarize ``src_text`` with a Pegasus model and return the summary text.

  Args:
    model: object exposing ``device`` and ``generate(**inputs, ...)``.
    tokenizer: callable that encodes text into a batch, and exposes
      ``batch_decode(...)``.
    src_text: the text to summarize.

  Returns:
    The first decoded summary, with special tokens removed.
  """
  # Send the inputs to wherever the model lives instead of relying on a
  # notebook-level `device` variable being defined before this is called.
  batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(model.device)
  generated = model.generate(**batch, min_length=60, max_length=180, length_penalty=4.0)
  summaries = tokenizer.batch_decode(generated, skip_special_tokens=True)
  # Only the first decoded sequence is returned.
  return summaries[0]

In [None]:
# Baseline summarizer built with the library defaults (no custom model or
# tokenizer is passed).
BERT_model = Summarizer()

In [None]:
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint for
# both the model and its tokenizer.
BART_model = pipeline(
    task='summarization',
    model='facebook/bart-large-cnn',
    tokenizer='facebook/bart-large-cnn',
)

In [None]:
# T5 needs an explicit tokenizer/model pair; both come from the same t5-base
# checkpoint and are passed together to get_t5_summary.
T5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
T5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [None]:
# Build a Summarizer around the stock roberta-base encoder: fetch its tokenizer
# and config via Transformers, then load the model with that config.
custom_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
custom_config = AutoConfig.from_pretrained('roberta-base')
# Ask the encoder to return its hidden states
# (presumably what the summarizer embeds sentences from — confirm).
custom_config.output_hidden_states = True
custom_model = AutoModel.from_pretrained('roberta-base', config=custom_config)
RoBERTa_model = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
)

In [None]:
# Run Pegasus on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
PEGASUS_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
PEGASUS_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
PEGASUS_model = PEGASUS_model.to(device)

In [None]:
# Locations on the mounted Drive of the custom pretrained BERT: model weights
# and config are read from one training checkpoint directory, the tokenizer
# from its parent directory.
bert5g_model_path = '/content/gdrive/MyDrive/Security/model-training/pretrained-bert/checkpoint-37000/'
bert5g_tokenizer_path = '/content/gdrive/MyDrive/Security/model-training/pretrained-bert/'

In [None]:
# Build a Summarizer around the custom pretrained BERT checkpoint; the tokenizer
# and the model/config are loaded from their separate paths.
custom_tokenizer = AutoTokenizer.from_pretrained(bert5g_tokenizer_path)
custom_config = AutoConfig.from_pretrained(bert5g_model_path)
# Ask the encoder to return its hidden states
# (presumably what the summarizer embeds sentences from — confirm).
custom_config.output_hidden_states = True
custom_model = AutoModel.from_pretrained(bert5g_model_path, config=custom_config)
BERT5G_model = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
)

In [None]:
# Summarizers built on other transformer families: one XLNet model and four
# GPT-2 checkpoints (gpt2, gpt2-medium, gpt2-large, distilgpt2).
XLNET_model = TransformerSummarizer(
    transformer_type="XLNet",
    transformer_model_key="xlnet-base-cased",
)
GPT2_model = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="gpt2",
)
GPT2_model_medium = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="gpt2-medium",
)
GPT2_model_large = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="gpt2-large",
)
GPT2_model_distilgpt2 = TransformerSummarizer(
    transformer_type="GPT2",
    transformer_model_key="distilgpt2",
)

In [None]:
# One independent, initially empty list per model for its generated summaries,
# plus one for the reference texts. The generator yields a fresh list for each
# of the twelve names, so no two names share the same list object.
(
    BERT_model_preds,
    BERT5G_model_preds,
    PEGASUS_model_preds,
    RoBERTa_model_preds,
    BART_model_preds,
    XLNET_model_preds,
    GPT2_model_preds,
    GPT2_model_medium_preds,
    GPT2_model_large_preds,
    GPT2_model_distilgpt2_preds,
    T5_model_preds,
    true_labels,
) = ([] for _ in range(12))

In [None]:
# Run every summarizer on every paragraph and collect the outputs alongside the
# reference simplification.
#
# The rows are iterated by value rather than looked up as df['Paragraph'][i]:
# that lookup is label-based, and after dropna() the index labels no longer run
# 0..len(df)-1, so it would raise for dropped labels and never reach rows whose
# label is >= len(df).
for i, (paragraph, reference) in enumerate(zip(df['Paragraph'], df['Simplification'])):
  try:
    # BART's length bounds are a fraction of the paragraph's word count.
    n_words = len(paragraph.split(' '))
    BERT_model_pred = ''.join(BERT_model(paragraph, min_length=60))
    BERT5G_model_pred = ''.join(BERT5G_model(paragraph))
    RoBERTa_model_pred = ''.join(RoBERTa_model(paragraph))
    PEGASUS_model_pred = ''.join(get_pegasus_summary(PEGASUS_model, PEGASUS_tokenizer, paragraph))
    BART_model_pred = ''.join(BART_model(paragraph, min_length = round(0.1 * n_words), max_length = round(0.2 * n_words), do_sample=False)[0]['summary_text'])
    XLNET_model_pred = ''.join(XLNET_model(paragraph, min_length=60))
    GPT2_model_pred = ''.join(GPT2_model(paragraph, min_length=60))
    GPT2_model_medium_pred = ''.join(GPT2_model_medium(paragraph, min_length=60))
    GPT2_model_distilgpt2_pred = ''.join(GPT2_model_distilgpt2(paragraph, min_length=60))
    GPT2_model_large_pred = ''.join(GPT2_model_large(paragraph, min_length=60))
    T5_model_pred = get_t5_summary(T5_model, T5_tokenizer, paragraph)
  except Exception as e:
    # Best effort: if any model fails on a row, the whole row is skipped so all
    # prediction lists stay aligned with true_labels. Report which row it was.
    print(f"Skipping row {i}: {e}")
    continue
  # Appends happen only after every model succeeded, keeping the lists the
  # same length.
  true_labels.append(reference)
  BERT_model_preds.append(BERT_model_pred)
  BERT5G_model_preds.append(BERT5G_model_pred)
  PEGASUS_model_preds.append(PEGASUS_model_pred)
  RoBERTa_model_preds.append(RoBERTa_model_pred)
  BART_model_preds.append(BART_model_pred)
  XLNET_model_preds.append(XLNET_model_pred)
  GPT2_model_preds.append(GPT2_model_pred)
  GPT2_model_medium_preds.append(GPT2_model_medium_pred)
  GPT2_model_distilgpt2_preds.append(GPT2_model_distilgpt2_pred)
  GPT2_model_large_preds.append(GPT2_model_large_pred)
  T5_model_preds.append(T5_model_pred)

In [None]:
# Score each model's collected summaries against the shared references.
# Keys are the model variable names; each value is whatever metric.compute
# returns for that model.
perf_dic = {
    model_name: metric.compute(predictions=model_preds, references=true_labels)
    for model_name, model_preds in (
        ('BERT_model', BERT_model_preds),
        ('BERT5G_model', BERT5G_model_preds),
        ('PEGASUS_model', PEGASUS_model_preds),
        ('RoBERTa_model', RoBERTa_model_preds),
        ('BART_model', BART_model_preds),
        ('XLNET_model', XLNET_model_preds),
        ('GPT2_model', GPT2_model_preds),
        ('GPT2_model_medium', GPT2_model_medium_preds),
        ('GPT2_model_large', GPT2_model_large_preds),
        ('GPT2_model_distilgpt2', GPT2_model_distilgpt2_preds),
        ('T5_model', T5_model_preds),
    )
}

In [None]:
# Persist the per-model scores as a single JSON document on the mounted Drive.
with open('/content/gdrive/MyDrive/Security/data/PerformanceCompare.txt', 'w') as convert_file:
     json.dump(perf_dic, convert_file)