### Mounting the google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


pip installing rouge score for model evaluation

In [2]:
pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=e606e45c7dd60602a18780623715f68f0d13c430445157b658283bf61638d769
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


### Importing required libraries

In [25]:
import pandas as pd
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import tensorflow as tf
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

In [55]:
# Loading datasets
train_df = pd.read_csv('drive/MyDrive/nlp_data/test.csv')
test_df = pd.read_csv('drive/MyDrive/nlp_data/validation.csv')

In [57]:
train_df.head(2)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...


In [58]:
test_df.head(2)

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...


In [59]:
# shape of train and test
train_df.shape, test_df.shape

((11490, 3), (13368, 3))

In [60]:
# checking if we have GPU available for model training
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [61]:
# Preparing text and summaries for model training
# highlights are the summaries/target
# articles are the original articles
train_texts = ['summarize: ' + text.lower() for text in train_df['article']]
train_summaries = [summary.lower() for summary in train_df['highlights']]

test_texts = ['summarize: ' + text.lower() for text in test_df['article']]
test_summaries = [summary.lower() for summary in test_df['highlights']]

In [62]:
train_texts[0]

"summarize: ever noticed how plane seats appear to be getting smaller and smaller? with increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. they say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. more than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? this week, a u.s consumer advisory group set up by the department of transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'in a world where animals have more rights to space and food than humans,' said charlie leocha, consumer representative on the committee.\xa0'it is time that the dot and faa take a stand for humane treatment of passengers.' but could crowding on planes lead to more serious issues than figh

In [63]:
train_summaries[0]

'experts question if  packed out planes are putting passengers at risk .\nu.s consumer advisory group says minimum space must be stipulated .\nsafety tests conducted on planes with more leg room than airlines offer .'

In [64]:
# Initializing the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [65]:
# Encodingthe train and test set with truncation and padding
train_encodings = tokenizer(train_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
train_labels = tokenizer(train_summaries, return_tensors='tf', padding=True, truncation=True, max_length=150)

test_encodings = tokenizer(test_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
test_labels = tokenizer(test_summaries, return_tensors='tf', padding=True, truncation=True, max_length=150)

In [66]:
train_encodings['input_ids'][0]

<tf.Tensor: shape=(512,), dtype=int32, numpy=
array([21603,    10,   664,  4944,   149,  6112,  6116,  2385,    12,
          36,   652,  2755,    11,  2755,    58,    28,  3094,  2302,
          13,   151,   838,    12,     8, 22902,     6,   128,  2273,
          33,   822,    53,     3,    99,   578,   224,  7614,    91,
        6112,     7,    19,     3,  3131,  9234,    44,  1020,     5,
          79,   497,    24,     8, 18508,    53,   628,    30, 15726,
        3767,    15,     7,    19,    59,   163, 14209,     3,    18,
          34,    31,     7,     3,  3131,    69,   533,    11,  1455,
          16,  5129,     5,    72,   145,     3,     7,  4960,   115,
        7428,   147,     8,  2939,   880,     6, 18508,    53,   628,
          30,  6112,     7,     3,  3131,    69,   533,    11,  1455,
          16,  5129,    58,    48,   471,     6,     3,     9,     3,
          76,     5,     7,  3733, 18599,   563,   356,    95,    57,
           8,  3066,    13,  5127,   243,   

In [67]:
train_labels['input_ids'][0]

<tf.Tensor: shape=(150,), dtype=int32, numpy=
array([ 2273,   822,     3,    99,  7614,    91,  6112,     7,    33,
           3,  3131,  9234,    44,  1020,     3,     5,     3,    76,
           5,     7,  3733, 18599,   563,   845,  2559,   628,   398,
          36, 28713,    26,     3,     5,  1455,  3830,  4468,    30,
        6112,     7,    28,    72,  4553,   562,   145, 19184,   462,
           3,     5,     1,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [68]:
train_labels['input_ids'][1]

<tf.Tensor: shape=(150,), dtype=int32, numpy=
array([18364, 21615,  4940,     3, 23626,   139,     3,  7325, 22104,
          44,     3,   172,    32,    32,    16,  4653, 18222,     3,
           5,     3,    52,     9,   107,    83,     3,  2729,  1635,
           6, 12864,  4037,  1587,  3127, 14314,    53,     3,    31,
         235,  1135,     3,    23,  5781,     3,     9,     3,  7325,
          55,    31,     3, 29178,     3,    88,  4728,   139,     3,
           9,  2288,   144,   274,  7232,     3,  7325,     7,    11,
          47,     3, 28540,     3,     5,     1,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [69]:
# Adding decoder_input_ids by shifting labels and adding start token to the summaries/Target
def add_decoder_inputs(labels):
    labels_shifted = tf.concat([tf.fill([labels.shape[0], 1], tokenizer.pad_token_id), labels[:, :-1]], axis=-1)
    return labels_shifted

train_decoder_input_ids = add_decoder_inputs(train_labels['input_ids'])
test_decoder_input_ids = add_decoder_inputs(test_labels['input_ids'])

In [70]:
train_decoder_input_ids[0]

<tf.Tensor: shape=(150,), dtype=int32, numpy=
array([    0,  2273,   822,     3,    99,  7614,    91,  6112,     7,
          33,     3,  3131,  9234,    44,  1020,     3,     5,     3,
          76,     5,     7,  3733, 18599,   563,   845,  2559,   628,
         398,    36, 28713,    26,     3,     5,  1455,  3830,  4468,
          30,  6112,     7,    28,    72,  4553,   562,   145, 19184,
         462,     3,     5,     1,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [71]:
train_decoder_input_ids[1]

<tf.Tensor: shape=(150,), dtype=int32, numpy=
array([    0, 18364, 21615,  4940,     3, 23626,   139,     3,  7325,
       22104,    44,     3,   172,    32,    32,    16,  4653, 18222,
           3,     5,     3,    52,     9,   107,    83,     3,  2729,
        1635,     6, 12864,  4037,  1587,  3127, 14314,    53,     3,
          31,   235,  1135,     3,    23,  5781,     3,     9,     3,
        7325,    55,    31,     3, 29178,     3,    88,  4728,   139,
           3,     9,  2288,   144,   274,  7232,     3,  7325,     7,
          11,    47,     3, 28540,     3,     5,     1,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [72]:
# Creating TensorFlow datasets
def create_tf_dataset(encodings, labels, decoder_input_ids, batch_size=8):
    dataset = tf.data.Dataset.from_tensor_slices((
        {**encodings, 'decoder_input_ids': decoder_input_ids},
        labels
    ))
    
    return dataset.shuffle(len(encodings['input_ids'])).batch(batch_size)

train_dataset = create_tf_dataset(train_encodings, train_labels['input_ids'], train_decoder_input_ids, batch_size=8)
test_dataset = create_tf_dataset(test_encodings, test_labels['input_ids'], test_decoder_input_ids, batch_size=8)

In [73]:
train_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'decoder_input_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 150), dtype=tf.int32, name=None))>

In [74]:
test_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'decoder_input_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 150), dtype=tf.int32, name=None))>

In [75]:
# Setting up the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Defining custom loss function
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    # Defining mask to ignore padding tokens
    mask = tf.math.logical_not(tf.math.equal(y_true, tokenizer.pad_token_id))
    # Calculating loss
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    # Applying mask
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    # Taking average over non-padded tokens
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

# Compiling the model with custom loss function
model.compile(optimizer=optimizer, loss=masked_sparse_categorical_crossentropy, metrics=['accuracy'])

# Fine Tuning the model
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=test_dataset
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [76]:
# Evaluating the model
results = model.evaluate(test_dataset)
print(f"Test loss: {results}")

Test loss: [1.6137797832489014, 0.397405743598938]


In [None]:
# Summarization function
# It converts the article/input text into the required format for prediction/summarization.
def summarize(text):
    input_text = 'summarize: ' + text.lower()
    input_ids = tokenizer.encode(input_text, return_tensors='tf', max_length=512, truncation=True)
    summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [77]:
# Making prediction on an article
text_to_summarize = """Ronda Rousey recorded the fastest-ever finish in a UFC title fight as she submitted Cat Zingano after just 14 seconds in Los Angeles. Rousey was expected to face the toughest examination of her reign as bantamweight champion against the unbeaten Zingano. But having avoided a flying knee in the opening seconds, Rousey took her opponent down and set to work trying to execute her trademark armbar. Scroll down to watch Rousey beat Zingano in 14 seconds .
Ronda Rousey manoeuvres herself into position to submit Cat Zingano after 14 seconds of their fight .
Rousey attempts to lock in her trademark arm bar finish as she defended her bantamweight title .
Rousey consoles Zingano after her stunning victory inside 14 seconds at the Staples Center in Los Angeles .
Rousey grapples with Zingano before celebrating in the Octagon after her record-breaking victory .
Ronda Rousey bt Cat Zingano via sub . Holly Holm bt Raquel Pennington via SD .
Jake Ellenberger bt Josh Koscheck via sub . Alan Jouban bt Richard Walsh via KO .
Tony Ferguson bt Gleison Tibau via sub . Roan Carneiro bt Mark Munoz via sub .
Roman Salazar bt Norifumi Yamamoto N/C . Tim Means bt Dhiego Lima via TKO .
Derrick Lewis bt Ruan Potts via TKO . Valmir Lazaro bt James Krause via SD .
Masio Fullen bt Alexander Torres via SD .
Rousey had landed on her head but the champion gracefully flipped Zingano on to her back,
got up and manoeuvred swiftly into position to wrench Zingano's arm grotesquely.
Rousey forced the challenger to tap out.
'We were expecting that she might come out and do something flying at me right away,
' Rousey said. 'That's not usually how you land an armbar at that angle, but it works.
It was a lot like judo transitions, where you scramble the second you hit the ground.
'I made that up on the fly, to be honest. But it was kind of funny: We were going toward the ground,
and I kind of reverted back to judo mode and was thinking, 'Don't touch your back. It's a point.'
'That's where the acrobatic thing came from, was thinking about not touching your back in judo.'
 It was hard to work out who was more stunned, Zingano or the sell-out 17,000-crowd at the Staples Center.
 'She's really good ... but that wouldn't happen again,' the beaten challenger said.
 'It was a knee and then a throw and then a scramble, and then she was wrapped around my arm.
 I got caught. I was ready to do a million different things. I planned on getting in a fist fight tonight.
 ' Zingano looks in pain as Rousey moves herself into position to execute the armbar finish .
 Rousey has won all 11 of her mixed martial arts fights and all but one inside the first round .
 Rousey celebrates as Zingano is attended to by the referee following her early defeat in Los Angeles .
 Dublin featherweight Conor McGregor (left) and light-heavyweight champion Jon Jones were in attendance .
 Former UFC heavyweight champion Brock Lesnar was Octagon side as Rousey eased to victory .
 For the first time in the promotion's history, two women's fights headlined a pay-per-view event as boxer Holly Holm made her debut with a split-decision victory over Raquel Pennington in the co-main event.
  Middleweight champion Chris Weidman originally was scheduled to fight Vitor Belfort but had to withdraw injured.
 Zingano had earned her title shot two years ago with an upset victory over Miesha Tate, but was forced to wait after suffering a serious knee injury before she was hit by her estranged husband's suicide last year.
 Holm, meanwhile, ended an 11-year pro boxing career to concentrate on MMA two years ago but looked far from the complete package against Pennington. Both fighters landed big shots during a stand-up fight, and while Holm finished with a bloody nose, she left Pennington with a swollen left eye. UFC Octagon girls Vanessa Hanson, Brittney Palmer, Arianny Celeste and Chrissy Blair pose for pictures . Vanessa and Brittney introduce the first round of the respective fights during UFC 184 in Los Angeles . Holly Holm (right) made a winning debut with a split-decision victory over Raquel Pennington . Holm moved from a boxing career to mixed martial arts and remains unbeaten . Pennington lands a left hand on Holm as she battled hard only to lose by split decision . Actresses Mandy Moore (left) and Minka Kelly pose for a photograph during the UFC 184 event . Vin Diesel was also at the Staples Center (left) as UFC president Dana White poses with Mark Wahlberg ."""
print(summarize(text_to_summarize))

ronda rousey beat cat zingano in 14 seconds at staples center in los angeles. the bantamweight champion was expected to face the toughest examination of her reign as bantamweight champion. rousey landed on her head but the champion gracefully flipped zingano on to her back.


In [78]:
# Defining the paths
model_save_path = 'drive/MyDrive/nlp_data/t5_model'
tokenizer_save_path = 'drive/MyDrive/nlp_data/t5_tokenizer'

# Saving the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

('drive/MyDrive/nlp_data/t5_tokenizer/tokenizer_config.json',
 'drive/MyDrive/nlp_data/t5_tokenizer/special_tokens_map.json',
 'drive/MyDrive/nlp_data/t5_tokenizer/spiece.model',
 'drive/MyDrive/nlp_data/t5_tokenizer/added_tokens.json')

In [89]:
# Testing on the first 500 texts
test_df = test_df[:500]
test_texts = ['summarize: ' + text.lower() for text in test_df['article']]
test_summaries = [summary.lower() for summary in test_df['highlights']]
test_encodings = tokenizer(test_texts, return_tensors='tf', padding=True, truncation=True, max_length=512)
test_labels = tokenizer(test_summaries, return_tensors='tf', padding=True, truncation=True, max_length=150)
test_decoder_input_ids = add_decoder_inputs(test_labels['input_ids'])
test_dataset = create_tf_dataset(test_encodings, test_labels['input_ids'], test_decoder_input_ids, batch_size=8)

In [90]:
# Defining evaluation function
def evaluate_model(model, test_dataset, tokenizer):
    references = []
    predictions = []

    for batch in test_dataset:
        input_ids = batch[0]['input_ids'].numpy()
        labels = batch[1].numpy()

        # Generating summaries from the model
        output = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
        output_decoded = tokenizer.batch_decode(output, skip_special_tokens=True)

        # Converting labels to text
        labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Collecting references and predictions for calculating metrics
        references.extend([[ref] for ref in labels_decoded])
        predictions.extend(output_decoded)

    # Calculating BLEU score
    bleu_score = corpus_bleu(references, predictions)

    # Calculating ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref[0], pred)
        for key in scores:
            rouge_scores[key].append(scores[key])

    # Calculating average ROUGE scores of every prediction
    avg_rouge_scores = {key: {
        'precision': sum(score.precision for score in rouge_scores[key]) / len(rouge_scores[key]),
        'recall': sum(score.recall for score in rouge_scores[key]) / len(rouge_scores[key]),
        'fmeasure': sum(score.fmeasure for score in rouge_scores[key]) / len(rouge_scores[key])
    } for key in rouge_scores}

    return bleu_score, avg_rouge_scores

# Evaluating the model
bleu_score, avg_rouge_scores = evaluate_model(model, test_dataset, tokenizer)
print(f"BLEU score: {bleu_score}")
print(f"ROUGE scores:")
for rouge_type, score in avg_rouge_scores.items():
    print(f"{rouge_type}: {score}")

BLEU score: 0.49675795301243225
ROUGE scores:
rouge1: {'precision': 0.3807859837681482, 'recall': 0.4054005709934653, 'fmeasure': 0.38417044890931135}
rouge2: {'precision': 0.1707466754262482, 'recall': 0.18574002956992702, 'fmeasure': 0.1743312485524215}
rougeL: {'precision': 0.27304013767878127, 'recall': 0.2956005579920026, 'fmeasure': 0.27791646851631746}


In [9]:
# new calculation using a different library
from rouge import Rouge

reference_summary = """Ronda Rousey submitted Cat Zingano via armbar inside 14 seconds .
Rousey made a successful fifth defence of her bantamweight title .
The finish is fastest in a UFC title fight and joint-fastest of any UFC fight .
Holly Holm beat Raquel Pennington by split decision on her debut ."""

generated_summary = """ronda rousey beat cat zingano in 14 seconds at staples center in los angeles. 
the bantamweight champion was expected to face the toughest examination of her reign as bantamweight champion. 
rousey landed on her head but the champion gracefully flipped zingano on to her back."""

# Initializing the ROUGE object
rouge = Rouge()
# Calculating ROUGE score for the generated summary  against reference summary
scores = rouge.get_scores(generated_summary, reference_summary)
# Printing the results
print(scores)

[{'rouge-1': {'r': 0.20512820512820512, 'p': 0.24242424242424243, 'f': 0.22222221725694455}, 'rouge-2': {'r': 0.06666666666666667, 'p': 0.06976744186046512, 'f': 0.0681818131844012}, 'rouge-l': {'r': 0.20512820512820512, 'p': 0.24242424242424243, 'f': 0.22222221725694455}}]
