# Project: Text Summarisation
## Installing the required libraries 
First we will install all the required libraries that we are going to use in building our __Neural Network__.


In [None]:
!pip install -U transformers
!pip install -U accelerate 
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install -U sentencepiece
!pip install -U urllib3
!pip install py7zr

## Loading the dataset
Here we load the article data set that we want to summarise. The data set used here is the __CNN / Daily Mail__ data set, which contains about 312k rows of news articles published on a daily basis.

In [None]:
from datasets import load_dataset

# Load the "3.0.0" configuration of the cnn_dailymail dataset.
dataset = load_dataset(path = "cnn_dailymail", name = "3.0.0")

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Preview the first 350 characters of the article at index 1 of the train split.
dataset['train'][1]['article'][:350]

In [None]:
# Show the highlights stored for the training example at index 1.
dataset['train'][1]['highlights']

## Selecting Transformer
In this step, we compare the following four pretrained models:
1. gpt2-medium
2. t5-base
3. facebook/bart-large-cnn
4. google/pegasus-cnn_dailymail

The goal is to see which of them generates the best summaries, so that it can serve as the base of our Text Summarisation model.

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the gpt2-medium checkpoint.
pipe = pipeline(task = "text-generation", model = "gpt2-medium")

In [None]:
# Use the first 2000 characters of the training article at index 1 as the
# text every model is asked to summarise.
input_text = dataset['train'][1]['article'][:2000]

# The text-generation pipeline has no summarisation mode, so the article is
# followed by a "TL; DR:" cue and the generated continuation is used as the
# summary.
query = input_text + "\nTL; DR:\n"
pipe_out = pipe(query, max_length = 512, clean_up_tokenization_spaces = True)

In [None]:
# Skip the first len(query) characters (the prompt) and show only the generated continuation.
pipe_out[0]['generated_text'][len(query):]

In [None]:
# Collect each model's summary under a descriptive key; the first entry is the
# gpt2-medium continuation with the prompt removed.
summaries = {'gpt2-medium-380M': pipe_out[0]['generated_text'][len(query):]}

In [None]:
# Summarise the same excerpt with the t5-base checkpoint and record the result.
pipe = pipeline(task = 'summarization', model = 't5-base')
pipe_out = pipe(input_text)
summaries.update({'t5-base-223M': pipe_out[0]['summary_text']})


In [None]:
# Summarise the same excerpt with the facebook/bart-large-cnn checkpoint and record the result.
pipe = pipeline(task = 'summarization', model = 'facebook/bart-large-cnn')
pipe_out = pipe(input_text)
summaries.update({'bart-large-cnn-400M': pipe_out[0]['summary_text']})

In [None]:
# Summarise the same excerpt with the google/pegasus-cnn_dailymail checkpoint and record the result.
pipe = pipeline(task = 'summarization', model = 'google/pegasus-cnn_dailymail')
pipe_out = pipe(input_text)
summaries.update({'pegasus-cnn-568M': pipe_out[0]['summary_text']})


In [None]:
# Print every collected summary under its upper-cased key, separated by a
# blank line.
# The loop variable is deliberately not named `model`: a notebook-level loop
# leaves its variable in the global namespace, and `model` is the name used
# for the network object elsewhere in this notebook. Re-running this cell
# must not replace that object with a string.
for model_name, summary in summaries.items():
    print(model_name.upper())
    print(summary)
    print("")

Since the BART model gives much clearer and more accurate results, we will move ahead with this model for our text summarization problem on a conversational data set.

## Importing basic libraries
Importing the basic libraries for building the Text Summarisation model.

In [None]:
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

## Setting the device
Here we set up the configuration for training: the device (GPU) used for faster batch processing, the base model checkpoint, and the tokenizer and model loaded from that checkpoint.

In [None]:
# 'gpu' is not a device string PyTorch accepts; pick CUDA when it is available
# and fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Checkpoint used for both the tokenizer and the sequence-to-sequence model.
model_ckpt = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Place the model on the selected device so `device` actually takes effect.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

## Loading the data
Here we will use the __Samsum__ data set from __Hugging Face__ which consists of __Dialogues__ and __Summary__ of the respective dialogues held in a conversation over the text chat.

In [None]:
# Load the 'samsum' dataset and display the resulting dataset object.
samsun = load_dataset(path = 'samsum')
samsun

In [None]:
# Display the first example of the train split.
samsun['train'][0]

## Visual Representation
Plotting the histogram to check the maximum length of the __Dialogues__ and the __Summaries__

In [None]:
# Count the whitespace-separated words of every dialogue and every summary in
# the train split; both lists follow the order of the split.
dialogue_len = []
summary_len = []
for example in samsun['train']:
    dialogue_len.append(len(example['dialogue'].split()))
    summary_len.append(len(example['summary'].split()))

In [None]:
import pandas as pd

# One row per training example, one column per word-count list.
data = pd.DataFrame({
    'Dialogue Length': dialogue_len,
    'Summary Length': summary_len,
})

data

# One histogram per column.
data.hist(figsize = (15,5))

Here we see that the maximum length of a __Dialogue__ is not more than __500__ words and the maximum length of a __Summary__ is less than __70__ words.

In [None]:
# Tokenization step applied to each batch of the dataset
def get_feature(batch):
    """Tokenize a batch of dialogues together with their target summaries.

    Args:
        batch: mapping with a 'dialogue' entry (model inputs) and a 'summary'
            entry (targets), as passed in by a batched `map` call.

    Returns:
        dict holding only the 'input_ids', 'attention_mask' and 'labels'
        entries produced by the tokenizer.
    """
    # Both calls share the same settings: truncate at max_length = 1024.
    tokenized = tokenizer(
        batch['dialogue'],
        text_target = batch['summary'],
        max_length = 1024,
        truncation = True,
    )
    wanted_keys = ('input_ids', 'attention_mask', 'labels')
    return {key: tokenized[key] for key in wanted_keys}

In [None]:
# Run get_feature over the dataset in batches to add the tokenized columns.
samsun_pt = samsun.map(function = get_feature, batched = True)

In [None]:
# Display the mapped dataset object.
samsun_pt

In [None]:
# Columns to be formatted as torch tensors.
columns = [
    'input_ids',
    'labels',
    'attention_mask',
]
samsun_pt.set_format('torch', columns = columns)

In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Collator built from the tokenizer and the model; it is handed to the Trainer
# to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

training_args = TrainingArguments(
    output_dir = 'bart_samsum',
    num_train_epochs = 1,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    weight_decay = 0.01,
    logging_steps = 10,
    eval_strategy = 'steps',
    eval_steps = 500,
    # A whole number of steps (previously the float 1e6); presumably chosen
    # large enough that no intermediate checkpoint is written -- confirm.
    save_steps = 1_000_000,
    # 16 accumulated batches of 4 examples = 64 examples per optimizer step
    # and device.
    gradient_accumulation_steps = 16
)

# NOTE(review): with 64 examples per optimizer step, confirm that
# eval_steps = 500 is actually reached within the single training epoch.
trainer = Trainer(
    model = model,
    args = training_args,
    # Fixed: this argument was written `processing_class= = tokenizer`,
    # which is a SyntaxError.
    processing_class = tokenizer,
    data_collator = data_collator,
    train_dataset = samsun_pt['train'],
    eval_dataset = samsun_pt['validation'],
)

In [None]:
# Run training with the Trainer configured above.
trainer.train() 

## Saving the model
Now that the model has been trained on the provided data, we save it under a custom name. This way we can reuse it for Text Summarisation later without having to run the whole training code again.

In [None]:
# Save the trained model under the custom name "Text_summarization_2ndProject".
trainer.save_model("Text_summarization_2ndProject")

## Testing the model on the new data
Now that we have a machine learning model that summarizes text, we run it on new data to see how flexible it is, and to check whether it works as well there as it did on the previous data.

In [None]:
#custom Dialogue Prediction

# Summarisation pipeline loaded from the saved 'Text_summarization_2ndProject' model.
pipe = pipeline(task = 'summarization', model = 'Text_summarization_2ndProject')

# Generation settings forwarded to the pipeline call.
gen_kwargs = dict(length_penalty = 0.8, num_beams = 8, max_length = 128)

# Text that is not taken from either dataset loaded in this notebook.
custom_dialogue = """“Look what Eddie gave me,” said Cindy, all friendly. She pulled a pink teddy bear out of her purse and squeezed its belly. It sang “You Are My Sunshine” in a vibrating robot voice. “That’s nice,” said Jasmine, her voice so high that she sounded almost like the teddy bear. Cindy smiled and walked off with Eddie, swinging her hips back and forth.
"""

print(pipe(custom_dialogue, **gen_kwargs))

Finally, we can see the summary generated for our custom dialogue, so we can conclude that our Text Summarization model works well on new data too.