In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc

import torch
import nltk
from datasets import Dataset
from sklearn.model_selection import train_test_split

from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from rouge_score import rouge_scorer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [4]:
train_ds = pd.read_csv("../Dataset/train.csv")
train_ds.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [5]:
len(train_ds)

287113

In [6]:
sampled_df = train_ds.sample(n=500, random_state=42)

In [7]:
train_df, val_df = train_test_split(sampled_df, test_size=0.2, random_state=42)

len(train_df), len(val_df)

(400, 100)

In [8]:
train_dataset = Dataset.from_pandas(train_df[['article', 'highlights']]) 
val_dataset = Dataset.from_pandas(val_df[['article', 'highlights']])

In [9]:
def preprocessing(dataset):
    inputs = tokenizer(dataset['article'],
                      max_length=1024,
                      truncation=True,
                      padding="max_length",
                      return_tensors="pt")

    with tokenizer.as_target_tokenizer():
        outputs = tokenizer(
            dataset['highlights'],
            max_length=128,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
    model_inputs = {
        'input_ids' : inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': outputs['input_ids']
    }

    labels = model_inputs['labels'].clone()
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs['labels'] = labels
    return model_inputs

In [10]:
def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure

    num_samples = len(decoded_preds)
    for key in rouge_scores:
        rouge_scores[key] = rouge_scores[key] / num_samples
    
    return rouge_scores

In [11]:
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
model.to(device)
print(f"Model Loaded: {model_name}")

Model Loaded: facebook/bart-large-cnn


In [12]:
tokenized_train_df = train_dataset.map(preprocessing, batched=True, batch_size=8)
tokenized_val_df = val_dataset.map(preprocessing, batched=True, batch_size=8)

print(f"training: {tokenized_train_df}")
print(f"training: {tokenized_train_df}")

Map: 100%|███████████████████████████████████████████████████████████████████| 400/400 [00:02<00:00, 152.19 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 145.99 examples/s]

training: Dataset({
    features: ['article', 'highlights', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})
training: Dataset({
    features: ['article', 'highlights', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})





In [13]:
gc.collect()
torch.cuda.empty_cache()

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,
    fp16=True,  
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_val_df,
    # tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting training...")
trainer.train()
print("Training completed!")

  trainer = Seq2SeqTrainer(


Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,No log,1.261993,0.414907,0.196623,0.27823
2,1.141600,1.28367,0.422065,0.206281,0.296459
3,1.141600,1.370357,0.426575,0.20354,0.287945


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed!


In [19]:
def generate_summary(article, max_length=150, min_length=40, num_beams=4):
    inputs = tokenizer(
        article,
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=3,  
        repetition_penalty=1.5,  
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [22]:
test_article='''ronda rousey recorded fastest-ever finish ufc title fight submitted cat zingano second los angeles', 'rousey expected face toughest examination reign bantamweight champion unbeaten zingano', 'avoided flying knee opening second rousey took opponent set work trying execute trademark armbar', 'scroll watch rousey beat zingano second', 'ronda rousey manoeuvre position submit cat zingano second fight', 'rousey attempt lock trademark arm bar finish defended bantamweight title', 'rousey console zingano stunning victory inside second staple center los angeles', 'rousey grapple zingano celebrating octagon record-breaking victory', 'ronda rousey bt cat zingano via sub', 'holly holm bt raquel pennington via sd', 'jake ellenberger bt josh koscheck via sub', 'alan jouban bt richard walsh via ko', 'tony ferguson bt gleison tibau via sub', 'roan carneiro bt mark munoz via sub', 'roman salazar bt norifumi yamamoto n/c', 'tim mean bt dhiego lima via tko', 'derrick lewis bt ruan potts via tko', 'valmir lazaro bt james krause via sd', 'masio fullen bt alexander torres via sd', "rousey landed head champion gracefully flipped zingano back got manoeuvred swiftly position wrench zingano 's arm grotesquely", 'rousey forced challenger tap', "'we expecting might come something flying right away rousey said", "'that 's usually land armbar angle work", 'lot like judo transition scramble second hit ground', 'made fly honest', "kind funny going toward ground kind reverted back judo mode thinking 'do n't touch back", "'s point", "'that 's acrobatic thing came thinking touching back judo", 'hard work stunned zingano sell-out staple center', "'she 's really good ... would n't happen beaten challenger said", "'it knee throw scramble wrapped around arm", 'got caught', 'ready million different thing', 'planned getting fist fight tonight', 'zingano look pain rousey move position execute armbar finish', 'rousey mixed martial art fight one inside first round', 'rousey celebrates zingano attended referee following early defeat los angeles', 'dublin featherweight conor mcgregor left light-heavyweight champion jon jones attendance', 'former ufc heavyweight champion brock lesnar octagon side rousey eased victory', "first time promotion 's history two woman 's fight headlined pay-per-view event boxer holly holm made debut split-decision victory raquel pennington co-main event", 'middleweight champion chris weidman originally scheduled fight vitor belfort withdraw injured', "zingano earned title shot two year ago upset victory miesha tate forced wait suffering serious knee injury hit estranged husband 's suicide last year", 'holm meanwhile ended pro boxing career concentrate mma two year ago looked far complete package pennington', 'fighter landed big shot stand-up fight holm finished bloody nose left pennington swollen left eye', 'ufc octagon girl vanessa hanson brittney palmer arianny celeste chrissy blair pose picture', 'vanessa brittney introduce first round respective fight ufc los angeles', 'holly holm right made winning debut split-decision victory raquel pennington', 'holm moved boxing career mixed martial art remains unbeaten', 'pennington land left hand holm battled hard lose split decision', 'actress mandy moore left minka kelly pose photograph ufc event', 'vin diesel also staple center left ufc president dana white pose mark wahlberg'''

summary = generate_summary(test_article)

In [23]:
summary

'ronda rousey recorded fastest-ever finish ufc title fight submitted cat zingano second los angeles .\n\'Zingano stunning victory inside second staple center\'\n\'Rousey\xa0described\xa0her armbar technique as \'lot like judo transition scramble second hit ground\'\n"Holly holm made winning debut split-decision victory raquel pennington co-main event .'

In [24]:
model.save_pretrained("Model&Preprocessor/textSummarizerModel")
tokenizer.save_pretrained("Model&Preprocessor/textSummarizerTokenizer")
print("Model saved to Model&Preprocessor")

Model saved to Model&Preprocessor
