In [None]:
# Standard library
import re                                                                       # clean text data

# Transformers
from transformers import BartTokenizer, BartForConditionalGeneration            # BART tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments               # fine-tune model
from transformers import EarlyStoppingCallback                                  # stop training once the monitored metric stops improving
from transformers import pipeline                                               # pipeline
from transformers import DataCollatorForSeq2Seq                                 # DataCollator to batch the data
import torch                                                                    # PyTorch
import evaluate                                                                 # Hugging Face's library for model evaluation
from datasets import Dataset

import numpy as np
import pandas as pd
import nltk
# from textblob import TextBlob                                                 # fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer                     # identify the most common terms in the corpus
# Sentence-splitting data used by nltk.sent_tokenize.
# 'punkt' serves older NLTK releases; newer releases look for 'punkt_tab' instead,
# so both are fetched to keep the ROUGE preprocessing working on either.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("GPU is available - Using GPU" if device.type == 'cuda' else "GPU is not available - Using CPU")

In [6]:
# Show up to 1000 characters per cell so dialogues and their summaries can be read in full
pd.options.display.max_colwidth = 1000

In [7]:
def display_feature_list(features, feature_type):
    '''
    Print a heading for one kind of feature, followed by the feature names.

    features: list of feature-name strings; an empty (or otherwise falsy)
        value is reported as 'None'.
    feature_type: label placed in front of "Features:" in the heading.
    '''
    print(f"\n{feature_type} Features: ")

    # Nothing to list: report that explicitly instead of printing a blank line
    if not features:
        print('None')
        return

    print(', '.join(features))

def describe_df(df):
    '''
    Print an overview of a dataframe: its shape, row count, missing values
    per column, number of duplicated rows, the object-typed columns, and the
    first and last five rows.

    Side effect: rebinds the module-level `categorical_features` list to the
    names of the columns whose dtype is 'object'. Returns None.
    '''
    global categorical_features

    # Collect the object-typed columns first, then publish them through the global
    object_columns = []
    for column in df.columns:
        if df[column].dtype == 'object':
            object_columns.append(column)
    categorical_features = object_columns

    frame_kind = type(df).__name__
    row_count = df.shape[0]

    print(f"\n{frame_kind} shape: {df.shape}")
    print(f"\n{row_count:,.0f} samples")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')

    display_feature_list(categorical_features, 'Categorical')

    # Rich-display both ends of the frame, five rows each
    for label, take_rows in (('Head', df.head), ('Tail', df.tail)):
        print(f'\n{frame_kind} {label}: \n')
        display(take_rows(5))

#### Exploring the Dataset

In [8]:
# Loading the three SAMSum splits (train, test, validation) from CSV, in that order
train_df, test_df, val_df = map(pd.read_csv, (
    'Datasets/samsum/samsum-train.csv',
    'Datasets/samsum/samsum-test.csv',
    'Datasets/samsum/samsum-validation.csv',
))

Training Dataset

In [None]:
# Summarize the training dataframe: shape, missing values, duplicates, object-typed columns, head and tail.
# Note: describe_df also rebinds the global `categorical_features` list as a side effect.
describe_df(train_df)

In [None]:
# One of the dialogues is empty: show the training rows whose 'dialogue' is null.
# NullRaws is a boolean mask that is True where 'dialogue' is missing.
NullRaws = train_df['dialogue'].isnull()
filtered_train = train_df[NullRaws] 
filtered_train 

In [11]:
# Drop every training row that contains a null value in any column.
# The index is not reset here, so the surviving rows keep their original index labels.
train_df = train_df.dropna() 

In [12]:
# Removing 'id' from the categorical features list (it is an identifier, not a text feature).
# Guarded so that re-running this cell does not raise ValueError once 'id' is already gone.
if 'id' in categorical_features:
    categorical_features.remove('id')

Test Dataset

In [None]:
# Summarize the test dataframe: shape, missing values, duplicates, object-typed columns, head and tail.
# Note: describe_df also rebinds the global `categorical_features` list as a side effect.
describe_df(test_df)

In [14]:
# Removing 'id' from the categorical features list (it is an identifier, not a text feature).
# Guarded so that re-running this cell does not raise ValueError once 'id' is already gone.
if 'id' in categorical_features:
    categorical_features.remove('id')

Validation Dataset

In [None]:
# Summarize the validation dataframe: shape, missing values, duplicates, object-typed columns, head and tail.
# Note: describe_df also rebinds the global `categorical_features` list as a side effect.
describe_df(val_df)

In [16]:
# Removing 'id' from the categorical features list (it is an identifier, not a text feature).
# Guarded so that re-running this cell does not raise ValueError once 'id' is already gone.
if 'id' in categorical_features:
    categorical_features.remove('id')

#### Preprocessing Data

In [None]:
# Example of a raw dialogue that contains tags: the first dialogue of the test set
print(test_df['dialogue'].iloc[0])

In [None]:
# Example of a raw dialogue that contains tags: the training dialogue at positional row 14727
# (.iloc is positional, so this counts rows as they stand after the null rows were dropped)
print(train_df['dialogue'].iloc[14727])

In [19]:
# Patterns used by clean_tags, compiled once instead of on every call
_TAG_PATTERN = re.compile('<.*?>')                 # shortest span between '<' and '>' on one line
_EMPTY_TURN_PATTERN = re.compile(r'^[^:]*:\s*$')   # "Speaker:" followed by nothing but whitespace


# function to clean tags and the dialogue turns left empty by their removal
def clean_tags(text):
    '''
    Strip angle-bracket tags from a text and drop the dialogue turns that are
    left with no content.

    text: the dialogue or summary as a single string, turns separated by '\n'.

    Returns the cleaned string. A line is dropped only when it consists of a
    speaker label and a colon with nothing but whitespace after it; a turn
    whose message merely ends with a colon (e.g. "John: here is the list:")
    is kept.
    '''
    without_tags = _TAG_PATTERN.sub('', text)  # Replacing tags by an empty string

    # Keep every line that still carries a message after its first colon
    kept_lines = [
        line for line in without_tags.split('\n')
        if not _EMPTY_TURN_PATTERN.match(line)
    ]
    return '\n'.join(kept_lines)

In [None]:
# Example of tag cleaning on a single dialogue
test1 = clean_tags(train_df['dialogue'].iloc[14727]) # Applying clean_tags to the training dialogue at positional row 14727

print(test1)

In [21]:
# Apply the clean_tags function to whole dataframe columns
def clean_df(df, cols):
    '''
    Return a copy of `df` in which every column listed in `cols` has been
    passed through clean_tags.

    df: dataframe holding the text columns.
    cols: iterable of column names to clean; nulls in these columns are
        replaced by '' before cleaning.

    The input dataframe is left untouched, so the call can be repeated on the
    same frame and never writes into a slice of another dataframe.
    '''
    cleaned = df.copy()
    for col in cols:
        cleaned[col] = cleaned[col].fillna('').apply(clean_tags)
    return cleaned

# Cleaning the 'dialogue' and 'summary' texts of every split (train, test, validation, in that order)
train_df, test_df, val_df = (
    clean_df(frame, ['dialogue', 'summary'])
    for frame in (train_df, test_df, val_df)
)

In [None]:
# Last three rows of the cleaned training dataframe
train_df.tail(3)

In [None]:
# Transforming the dataframes into datasets (train, test, validation, in that order)
train_ds, test_ds, val_ds = map(Dataset.from_pandas, (train_df, test_df, val_df))

# Visualizing results: the three dataset summaries, spaced apart by blank lines
print(train_ds, test_ds, val_ds, sep='\n' * 4)

In [None]:
# First record of the training dataset
train_ds[0]

#### Modeling

In [None]:
# Loading the summarization pipeline with the 'facebook/bart-large-xsum' checkpoint
summarizer = pipeline('summarization', model = 'facebook/bart-large-xsum')

In [None]:
# Sample 1: a short descriptive paragraph
text = """Hugging Face's transformers library provides many tools for modern NLP tasks.
          With pipelines, you can easily load models and perform complex tasks like
          text generation, summarization, or translation with minimal code."""

# Sample 2: a news article
news = '''Bobi, the world’s oldest dog ever, has died after reaching the almost inconceivable age of 31 years and 165 days, said Guinness World Records (GWR) on Monday.
His death at an animal hospital on Friday was initially announced by veterinarian Dr. Karen Becker.
She wrote on Facebook that “despite outliving every dog in history, his 11,478 days on earth would never be enough, for those who loved him.”
There were many secrets to Bobi’s extraordinary old age, his owner Leonel Costa told GWR in February. He always roamed freely, without a leash or chain, lived in a “calm, peaceful” environment and ate human food soaked in water to remove seasonings, Costa said.
He spent his whole life in Conqueiros, a small Portuguese village about 150 kilometers (93 miles) north of the capital Lisbon, often wandering around with cats.
Bobi was a purebred Rafeiro do Alentejo – a breed of livestock guardian dog – according to his owner. Rafeiro do Alentejos have a life expectancy of about 12-14 years, according to the American Kennel Club.
But Bobi lived more than twice as long as that life expectancy, surpassing an almost century-old record to become the oldest living dog and the oldest dog ever – a title which had previously been held by Australian cattle-dog Bluey, who was born in 1910 and lived to be 29 years and five months old.
However, Bobi’s story almost had a different ending.
When he and his three siblings were born in the family’s woodshed, Costa’s father decided they already had too many animals at home.
Costa and his brothers thought their parents had taken all the puppies away to be destroyed. However, a few sad days later, they found Bobi alive, safely hidden in a pile of logs.
The children hid the puppy from their parents and, by the time Bobi’s existence became known, he was too old to be put down and went on to live his record-breaking life.
His 31st birthday party in May was attended by more than 100 people and a performing dance troupe, GWR said.
His eyesight deteriorated and walking became harder as Bobi grew older but he still spent time in the backyard with the cats, rested more and napped by the fire.
“Bobi is special because looking at him is like remembering the people who were part of our family and unfortunately are no longer here, like my father, my brother, or my grandparents who have already left this world,” Costa told GWR in May. “Bobi represents those generations.”
'''

# Sample 3: a messenger-style chat between two speakers, the kind of text the model is later fine-tuned on
chat = '''
Theresa: Hey Louise, how are u?
Theresa: This is my workplace, they always give us so much food here 😊
Theresa: Luckily they also offer us yoga classes, so all the food isn't much of a problem 😂
Louise: Hey!! 🙂 
Louise: Wow, that's awesome, seems great 😎 Haha
Louise: I'm good! Are you coming to visit Stockholm this summer? 🙂
Theresa: I don't think so :/ I need to prepare for Uni.. I will probably attend a few lessons this winter
Louise: Nice! Do you already know which classes you will attend?
Theresa: Yes, it will be psychology :) I want to complete a few modules that I missed :)
Louise: Very good! Is it at the Uni in Prague?
Theresa: No, it will be in my home town :)
Louise: I have so much work right now, but I will continue to work until the end of summer, then I'm also back to Uni, on the 26th September!
Theresa: You must send me some pictures, so I can see where you live :) 
Louise: I will, and of my cat and dog too 🤗
Theresa: Yeeeesss pls :)))
Louise: 👌👌
Theresa: 🐱💕'''


# Summarize the three samples with the pretrained pipeline (paragraph, news article, chat)
summary1, summary2, summary3 = (summarizer(sample) for sample in (text, news, chat))

# Show the three results, spaced apart by blank lines
print(summary1, summary2, summary3, sep="\n" * 4)

As the results above show, the pretrained model is able to produce a much shorter text accurately. However, it is not capable of summarising dialogues accurately.

In [28]:
model_name = 'facebook/bart-large-xsum'                  # checkpoint used for both the tokenizer and the model
tokenizer = BartTokenizer.from_pretrained(model_name)    # Loading the tokenizer

# Loading the pretrained sequence-to-sequence model that will be fine-tuned
model = BartForConditionalGeneration.from_pretrained(model_name)

In [25]:
# print(model) # Visualizing model's architecture

In [27]:
# function to preprocess the datasets
def preprocess_function(examples):
    '''
    Tokenize one batch of examples for sequence-to-sequence training.

    examples: batch with a "dialogue" and a "summary" field, each a sequence
        of strings.

    Returns the tokenized dialogues (truncated to 1024 tokens) with an extra
    "labels" entry holding the token ids of the summaries (truncated to 128
    tokens).
    '''
    dialogues = list(examples["dialogue"])
    encoded = tokenizer(dialogues, max_length=1024, truncation=True)

    # Summaries are tokenized with the tokenizer switched to its target mode
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(examples["summary"], max_length=128, truncation=True)

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded

In [None]:
# Applying preprocess_function to the datasets.
# Every raw column is dropped after tokenization. The list of columns is read from each
# dataset instead of being hard-coded, so the call keeps working whether or not a split
# carries extra columns such as '__index_level_0__'.
tokenized_train = train_ds.map(preprocess_function, batched=True,
                               remove_columns=train_ds.column_names)

tokenized_test = test_ds.map(preprocess_function, batched=True,
                             remove_columns=test_ds.column_names)

tokenized_val = val_ds.map(preprocess_function, batched=True,
                           remove_columns=val_ds.column_names)

# Printing results
print('\n' * 3)
print('Preprocessed Training Dataset:\n')
print(tokenized_train)
print('\n' * 2)
print('Preprocessed Test Dataset:\n')
print(tokenized_test)
print('\n' * 2)
print('Preprocessed Validation Dataset:\n')
print(tokenized_val)

In [None]:
# Selecting a sample from the tokenized training dataset
sample = tokenized_train[0]

# Printing its features
print("input_ids:")                    # token IDs mapped to the dialogue
print(sample['input_ids'])
print("\n")
print("attention_mask:")               # indicates which tokens the model should pay attention to and which should be ignored
print(sample['attention_mask'])
print("\n")
print("labels:")                       # token IDs obtained from the words and subwords in the summary
print(sample['labels'])
print("\n")

In [31]:
# Instantiating the data collator that batches the tokenized examples;
# it is given both the tokenizer and the model being fine-tuned
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [32]:
# Loading the ROUGE metric; compute_metrics uses it to score generated summaries against the references
metric = evaluate.load('rouge') 

In [33]:
def compute_metrics(eval_pred):
    '''
    Compute ROUGE scores (scaled to 0-100) and the mean generated length for
    one evaluation pass.

    eval_pred: pair (predictions, labels) of token-id arrays. `predictions`
        may also arrive as a tuple whose first element holds the generated
        token ids.

    Returns a dict mapping each ROUGE key, plus "gen_len", to a float rounded
    to 4 decimals.
    '''
    predictions, labels = eval_pred  # Obtaining predictions and true labels
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks masked / padded positions and cannot be decoded, so it is
    # replaced by the pad token in both the predictions and the labels
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decoding predictions and true labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Computing rouge score
    scores = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # The `evaluate` ROUGE metric returns plain floats; aggregate score objects
    # that expose `.mid.fmeasure` are still accepted
    result = {
        key: (value.mid.fmeasure if hasattr(value, 'mid') else value) * 100
        for key, value in scores.items()
    }

    # Add mean-generated length (number of non-pad tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(float(v), 4) for k, v in result.items()}

In [None]:
# Defining parameters for training
training_args = Seq2SeqTrainingArguments(
    output_dir = 'Bart_Samsum',
    # Evaluate and checkpoint on the same 500-step schedule so the best checkpoint can be reloaded
    evaluation_strategy = "steps",
    eval_steps = 500,
    save_strategy = 'steps',
    save_steps=500,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    seed = 42,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,            # 4 samples x 2 accumulation steps per device for each optimizer update
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=4,
    predict_with_generate=True,               # generate summaries during evaluation so ROUGE can be computed
    fp16=torch.cuda.is_available(),           # mixed precision only when a GPU is present; the notebook also allows a CPU fallback
    report_to="none"
)

In [None]:
# Defining Trainer
# Training-time evaluation (best-checkpoint selection and early stopping) runs on the
# validation split; tokenized_test is not used here so it stays an unbiased hold-out set.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Training (fine-tuning) the model with the trainer configured above
trainer.train() 

#### Evaluating and Saving Model

In [None]:
# Evaluating model performance on the tokenized validation dataset;
# the returned metrics are kept in `validation` and printed
validation = trainer.evaluate(eval_dataset = tokenized_val)

print(validation) 

In [None]:
# Saving the fine-tuned model into the "BART_Finetuned" directory
directory = "BART_Finetuned"
trainer.save_model(directory)

# Saving the tokenizer into the same directory as the model
tokenizer.save_pretrained(directory)

In [None]:
# Loading summarization pipeline and model

# summarizer  = pipeline("text2text-generation", model="MaleeshaAlu/Bart_Samsum")

examples from the validation dataset

In [None]:
# random example from the validation dataset
# val_ds[35]

In [None]:
# dialogue = "John: doing anything special?\r\nAlex: watching 'Millionaires' on tvn\r\nSam: me too! He has a chance to win a million!\r\nJohn: ok, fingers crossed then! :)"
# summary = 'Alex and Sam are watching Millionaires.'

# generated_summary = summarizer(dialogue)

In [None]:
# print('Original Dialogue:\n')
# print(dialogue)
# print('\n' * 2)
# print('Reference Summary:\n')
# print(summary)
# print('\n' * 2)
# print('Model-generated Summary:\n')
# print(generated_summary)