# BART

https://huggingface.co/facebook/bart-base

## Import libraries

In [1]:
!pip install evaluate
!pip install rouge
!pip install rouge-score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, mu

In [2]:
import nltk # Imports the library
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import os

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rouge_score import rouge_scorer
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Read dataframe from drive folder

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/MyDrive/Text Mining Project/raw_data.csv')

In [5]:
df.head()

Unnamed: 0,category,article_id,text,summary
0,business,71,Japan turns to beer alternatives\n \n Japanese...,Asahi is predicting profits to rise 50% in 200...
1,business,394,US Airways staff agree to pay cut\n \n A union...,The seventh largest carrier in the US sought b...
2,business,129,Iraq to invite phone licence bids\n \n Iraq is...,The ministry said that it wanted to increase I...
3,business,463,US economy still growing says Fed\n \n Most ar...,Most areas of the US saw their economy continu...
4,business,177,Optimism remains over UK housing\n \n The UK p...,"Wimpey said the UK housing market had proved ""..."


## Tokenize and preprocess the text data

In [6]:
# Define the device for GPU usage (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize and preprocess the text data
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
max_length = 512  # Maximum sequence length

def tokenize_text(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True, padding='max_length', return_attention_mask=True)
    return inputs.to(device)  # Move the tokenized inputs to the GPU

def tokenize_summary(text):
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=280, truncation=True, padding='max_length', return_attention_mask=True)
    return inputs.to(device)  # Move the tokenized summaries to the GPU


df['TokenizedText'] = df['text'].apply(tokenize_text)
df['TokenizedSummary'] = df['summary'].apply(tokenize_summary)

# Split your data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert tokenized data to PyTorch tensors
X_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedText']])
Y_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedSummary']])
X_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedText']])
Y_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedSummary']])

# Define a DataLoader for batching data
train_dataset = TensorDataset(X_train, Y_train)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = TensorDataset(X_test, Y_test)
test_dataloader = DataLoader(test_dataset, batch_size=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## BART model

In [7]:
# Define the BART model
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

# Create a GradScaler for mixed-precision training
scaler = GradScaler()

# Define hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the GPU
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=len(train_dataloader) * 10)  # Add learning rate scheduler
early_stopping_rounds = 2
best_rouge_score = -1
current_round = 0

# Define gradient accumulation steps
accumulation_steps = 20

def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(dataloader, desc="Training")):
        inputs = batch[0].to(device)  # Move the input batch to the GPU
        attention_mask = (inputs != 0).float().to(device)  # Create attention mask
        targets = batch[1].to(device)  # Move the target batch to the GPU

        with autocast():
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=targets, labels=targets)
            loss = outputs.loss

        # Perform gradient accumulation
        loss = loss / accumulation_steps
        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            # Update gradients and optimizer once every accumulation_steps
            clip_grad_norm_(model.parameters(), max_norm=1.0)  # Optional gradient clipping
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def calculate_rouge1_precision(logits, targets):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    rouge1_precision = 0.0
    num_samples = len(logits)

    for i in range(num_samples):
        predicted_ids = logits[i].cpu().numpy()
        target_ids = targets[i].cpu().numpy()

        # Convert token IDs to strings
        predicted_text = tokenizer.decode(predicted_ids, skip_special_tokens=True)
        target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        # Calculate ROUGE-1 precision
        scores = scorer.score(predicted_text, target_text)
        rouge1_precision += scores['rouge1'].precision

    return rouge1_precision / num_samples

# Training loop
for epoch in range(10):  # Change the number of epochs as needed
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    print(f"Epoch {epoch+1}/{9}, Train Loss: {train_loss:.4f}")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Training: 100%|██████████| 445/445 [01:22<00:00,  5.40it/s]


Epoch 1/9, Train Loss: 0.6415


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 2/9, Train Loss: 0.6414


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 3/9, Train Loss: 0.6396


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 4/9, Train Loss: 0.6409


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 5/9, Train Loss: 0.6403


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 6/9, Train Loss: 0.6399


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 7/9, Train Loss: 0.6392


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 8/9, Train Loss: 0.6398


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 9/9, Train Loss: 0.6410


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]

Epoch 10/9, Train Loss: 0.6398





## Evaluation

In [8]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [9]:
def evaluate(model, dataloader):
    model.eval()

    test_articles = []
    actual_summaries = []
    predicted_summaries = []
    rouge1_precision_scores = []

    scorer = rouge_scorer.RougeScorer(['rouge1'])

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating Test"):
            inputs = batch[0].to(device)
            attention_mask = (inputs != 0).float().to(device)
            targets = batch[1].to(device)
            outputs = model.generate(input_ids=inputs, attention_mask=attention_mask, max_length=150, num_beams=17, length_penalty=2.0, early_stopping=False)

            for output, target, input_text in zip(outputs, targets, inputs):
                # Calculate ROUGE-1 precision for each sample
                output_text = tokenizer.decode(output, skip_special_tokens=True)
                target_text = tokenizer.decode(target, skip_special_tokens=True)
                rouge_scores = scorer.score(output_text, target_text)
                rouge1_precision_scores.append(rouge_scores['rouge1'].precision)

                # Append tokenized text, actual summaries, and predicted summaries
                test_articles.append(tokenizer.decode(input_text, skip_special_tokens=True))
                actual_summaries.append(target_text)
                predicted_summaries.append(output_text)

    return test_articles, actual_summaries, predicted_summaries, rouge1_precision_scores

# Evaluate the model
test_articles, actual_summaries, predicted_summaries, rouge1_precision_scores = evaluate(model, test_dataloader)

# Create a dictionary with the extracted data
data = {
    'Article': test_articles,
    'Actual Summary': actual_summaries,
    'Predicted Summary': predicted_summaries,
    'ROUGE-1 Precision': rouge1_precision_scores,
}

# Create a Pandas DataFrame from the dictionary
results_df = pd.DataFrame(data)

# Display the DataFrame
pd.set_option('display.max_colwidth', None)
results_df.head(5)

Evaluating Test: 100%|██████████| 112/112 [10:47<00:00,  5.78s/it]


Unnamed: 0,Article,Actual Summary,Predicted Summary,ROUGE-1 Precision
0,"summarize: Tsunami 'to hit Sri Lanka banks'\n \n Sri Lanka's banks face hard times following December's tsunami disaster, officials have warned.\n \n The Sri Lanka Banks Association said the waves which killed more than 30,000 people also washed away huge amounts of property which was securing loans. According to its estimate, as much as 13.6% of the loans made by private banks to clients in the disaster zone has been written off or damaged. State-owned lenders may be even worse hit, it said.\n \n The association estimates that the private banking sector has 25bn rupees ($250m; £135m) of loans outstanding in the disaster zone. On one hand, banks are dealing with the death of their customers, along with damaged or destroyed collateral. On the other, most are extending cheap loans for rebuilding and recovery, as well as giving their clients more time to repay existing borrowing. The combination means a revenue shortfall during 2005, SLBA chairman - and Commercial Bank managing director - AL Gooneratne told a news conference. ""Most banks have given moratoriums and will not be collecting interest, at least in this quarter,"" he said. In the public sector, more than one in ten of the state-owned People's Bank's customers in the south of Sri Lanka were affected, a bank spokesman told Reuters. He estimated the bank's loss at 3bn rupees.\n","According to its estimate, as much as 13.6% of the loans made by private banks to clients in the disaster zone has been written off or damaged.In the public sector, more than one in ten of the state-owned People's Bank's customers in the south of Sri Lanka were affected, a bank spokesman told Reuters.The association estimates that the private banking sector has 25bn rupees ($250m; £135m) of loans outstanding in the disaster zone.Sri Lanka's banks face hard times following December's tsunami disaster, officials have warned.""Most banks have given moratoriums and will not be collecting interest, at least in this quarter,"" he said.","summarize: Tsunami 'to hit Sri Lanka banks' to be even worse hit, it said. Please calm down, it doesn’t be as hard as when it’s on a rainy day in the capital, it is only worse hit. It is not as much as what is on the bank bank, it says. The Sri Lanka Banks Association said the waves which killed more than 30,000 people also washed away huge amounts of property which was securing loans. According to its estimate, a much as 13.6% of the loans made by private banks to clients in the disaster zone has been written off or damaged. State-owned lenders may be even bigger hit, its said. Poker ا�",0.5
1,"summarize: Jansen suffers a further setback\n \n Blackburn striker Matt Jansen faces three weeks out after surgery to treat a cartilage problem.\n \n But central defender Lorenzo Amoruso is moving closer to fitness following a knee operation. Rovers' assistant manager Mark Bowen said: ""Matt had a small operation to trim knee cartilage. ""It's a tiny piece of work, which should be a fairly quick recovery. Lorenzo is also jogging for the first time, along with kicking a ball."" Jansen's career has been dogged by injury since a freak scooter accident two years ago.\n \n He returned to first-team action soon after Mark Hughes' appointment as Blackburn boss and marked it with a goal against Portsmouth in his first appearance of the season. Bowen added: ""I'm guessing, but I reckon maybe two to three weeks before he is back in action completely."" The Rovers assistant boss forecast a longer time spell for Amoruso's availability for first-team duties. Bowen said: ""There's still some scar tissue present so it will be some weeks. ""It's a case of see how he goes. You can't put a real time on a comeback, we'll see how he progresses.""\n","Rovers' assistant manager Mark Bowen said: ""Matt had a small operation to trim knee cartilage.Bowen added: ""I'm guessing, but I reckon maybe two to three weeks before he is back in action completely.""The Rovers assistant boss forecast a longer time spell for Amoruso's availability for first-team duties.You can't put a real time on a comeback, we'll see how he progresses.""He returned to first-team action soon after Mark Hughes' appointment as Blackburn boss and marked it with a goal against Portsmouth in his first appearance of the season.","summarize: Jansen suffers a further setback, along with kicking a ball, as he is wearing a jersey wearing wearing a shirt wearing a football jersey. �� Blackburn striker Matt Jansen faces three weeks out after surgery to treat a cartilage problem. † �� But central defender Lorenzo Amoruso is moving closer to fitness following a knee operation. Rovers' assistant manager Mark Bowen said: ""Matt had a small operation to trim knee cartilage. ""It's a tiny piece of work, which should be a fairly quick recovery. Lorenzo is also jogging for the first time, along a rugby a ball."" Jansen's career has been dogged by injury since a freak scooter accident two years ago. He",0.412371
2,"summarize: China 'to overtake US net use'\n \n The Chinese net-using population looks set to exceed that of the US in less than three years, says a report.\n \n China's net users number 100m but this represents less than 8% of the country's 1.3 billion people. Market analysts Panlogic predicts that net users in China will exceed the 137 million US users of the net by 2008. The report says that the country's culture will mean that Chinese people will use the net for very different ends than in many other nations.\n \n Already net use in China has a very different character than in many Western nations, said William Makower, chief executive of Panlogic. In many Western nations desktop computers that can access the net are hard to escape at work. By contrast in China workplace machines are relatively rare. This, combined with the relatively high cost of PCs in China and the time it takes to get phone lines installed, helps to explains the huge number of net cafes in China. Only 36% of Chinese homes have telephones according to reports. ""Net usage tends to happen in the evening,"" said Mr Makower, ""they get access only when they go home and go off to the internet caf&#233;."" ""Its fundamentally different usage to what we have here,"" he said.\n \n Net use in China was still very much an urban phenomenon with most users living on the country's eastern seaboard or in its three biggest cities. The net is key to helping Chinese people keep in touch with friends, said Mr Makower. Many people use it in preference to the phone or arrange to meet up with friends at net cafes. What people can do on the net is also limited by aspects of Chinese life. For instance, said Mr Makower, credit cards are rare in China partly because of fears people have about getting in to debt. ""The most popular way to pay is Cash-On-Delivery,"" he said, ""and that's quite a brake to the development of e-commerce."" The arrival of foreign banks in China, due in 2006, could mean greater use of credit cards but for the moment they are rare, said Mr Makower. But if Chinese people are not spending cash online they are interested in the news they can get via the net and the view it gives them on Western ways of living. ""A large part of the attraction of the internet is that it goes below the radar,"" he said. ""Generally it's","The net is key to helping Chinese people keep in touch with friends, said Mr Makower.Already net use in China has a very different character than in many Western nations, said William Makower, chief executive of Panlogic.The report says that the country's culture will mean that Chinese people will use the net for very different ends than in many other nations.What people can do on the net is also limited by aspects of Chinese life.For instance, said Mr Makower, credit cards are rare in China partly because of fears people have about getting in to debt.Market analysts Panlogic predicts that net users in China will exceed the 137 million US users of the net by 2008.Government restrictions on how much advertising can appear on television means that the net is a source of many commercial messages Chinese people would not see anywhere else.Many people use it in preference to the phone or arrange to meet up with friends at net cafes.The arrival of foreign banks in China, due in 2006, could mean greater use of credit cards but for the moment they are rare, said Mr Makower.China's net users number 100m but this represents less than 8% of the country's 1.3 billion people.","summarize: China 'to overtake US net use'http Â �� The Chinese net-using population looks set to exceed that of the US in less than three years, says a report.http �� �� China's net users number 100m but this represents less than 8% of the country's 1.3 billion people. Market analysts Panlogic predicts that net users in China will exceed the 137 million US users of the net by 2008. The report says that the country’s culture will mean that Chinese people will use the net for very different ends than in many other nations.""It's very much a very different character than in China and the vast number of net cafes in China has a very",0.444444
3,"summarize: Virgin Radio offers 3G broadcast\n \n UK broadcaster Virgin Radio says it will become the first station in the world to offer radio via 3G mobiles.\n \n The radio station, in partnership with technology firm Sydus, will broadcast on selected 2G and high-speed 3G networks. Later this year listeners will be able to download software from the Virgin website which enables the service. James Cridland, head of new media at Virgin Radio, said: ""It places radio at the heart of the 3G revolution."" Virgin Radio will be the first station made available followed by two digital stations, Virgin Radio Classic Rock and Virgin Radio Groove.\n \n Mr Cridland said: ""This application will enable anyone, anywhere to listen to Virgin Radio simply with the phone in their pocket. ""This allows us to tap into a huge new audience and keep radio relevant for a new generation of listeners."" Saumil Nanavati, president of Sydus, said, ""This radio player is what the 3G network was built for, giving consumers high-quality and high-data products through a handset in their pocket."" Virgin says an hour's listening to the station via mobile would involve about 7.2MB of data, which could prove expensive for people using pay as you download GPRS or 3G services. Some networks, such as Orange, charge up to £1 for every one megabyte of data downloaded. Virgin says radio via 2G or 3G mobiles is therefore going to appeal to people with unlimited download deals. There are 30 compatible handsets available from major manufacturers including Nokia and Samsung while Virgin said more than 14.9 million consumers across the globe can use the service currently.\n","James Cridland, head of new media at Virgin Radio, said: ""It places radio at the heart of the 3G revolution.""UK broadcaster Virgin Radio says it will become the first station in the world to offer radio via 3G mobiles.Virgin Radio will be the first station made available followed by two digital stations, Virgin Radio Classic Rock and Virgin Radio Groove.Virgin says radio via 2G or 3G mobiles is therefore going to appeal to people with unlimited download deals.The radio station, in partnership with technology firm Sydus, will broadcast on selected 2G and high-speed 3G networks.","summarize: Virgin Radio offers 3G broadcast on selected 2G and high-speed 3G networks. Later this year listeners will be able to download software from the Virgin website which enables the service. James Cridland, head of new media at Virgin Radio, said: ""It places radio at the heart of the 3G revolution."" Virgin Radio will be the first station made available followed by two digital stations, Virgin Radio Classic Rock and Virgin Radio Groove. It’s the only radio station, in partnership with technology firm Sydus, will be “This application will enable anyone, anywhere to listen to Virgin Radio simply with the phone in their pocket. ""This allows us to tap into a huge new audience and",0.76
4,"summarize: Doves soar to UK album summit\n \n Manchester rock band Doves have entered the UK album chart at number one with their new release, Some Cities.\n \n The trio replace flamboyant US act Scissor Sisters at the top. The album follows single Black and White Town, which reached number six. R&B star Nelly has the new number one single with Over and Over, which sees him team up with Tim McGraw. Girls Aloud, Akon and Kaiser Chiefs all have new singles in the top ten, as do Futureheads and Usher.\n \n The latest Elvis Presley re-release, (Marie's The Name) His Latest Flame, entered the chart at number three, one place ahead of Girls Aloud's Wake Me Up. Hip-hop performer Akon's Locked Up is at number five, while hotly-tipped Leeds band Kaiser Chiefs have their second chart hit at number six with Oh My God.\n \n Futureheads' cover of Kate Bush's Hounds of Love entered the chart at number eight, while Usher's Caught Up was a new entry at number nine. In the album chart, operatic quartet Il Divo's eponymous debut rose 23 places to number six, while crooner Tony Christie's Definitive Collection is a new entry at number 10, making it the highest-charting album of the singer's career.\n","Manchester rock band Doves have entered the UK album chart at number one with their new release, Some Cities.Futureheads' cover of Kate Bush's Hounds of Love entered the chart at number eight, while Usher's Caught Up was a new entry at number nine.In the album chart, operatic quartet Il Divo's eponymous debut rose 23 places to number six, while crooner Tony Christie's Definitive Collection is a new entry at number 10, making it the highest-charting album of the singer's career.Hip-hop performer Akon's Locked Up is at number five, while hotly-tipped Leeds band Kaiser Chiefs have their second chart hit at number six with Oh My God.","summarize: Doves soar to UK album at number one with their new release, Some Cities. Please Â  Ak Akon, The trio replace flamboyant US act Scissor Sisters at the top. The album follows single Black and White Town, which reached number six. R&B star Nelly has the new number one single with Over and Over, which sees him team up with Tim McGraw. Players Aloud, Akon and Kaiser Chiefs all have new singles in the top ten, as do Futureheads and Usher. Get �� The latest Elvis Presley re-release, (Marie's The Name) His Latest Flame, entered the chart at number three, one place",0.330508


In [10]:
pd.set_option('display.max_colwidth', 100)

results_df.head(20)

Unnamed: 0,Article,Actual Summary,Predicted Summary,ROUGE-1 Precision
0,summarize: Tsunami 'to hit Sri Lanka banks'\n \n Sri Lanka's banks face hard times following Dec...,"According to its estimate, as much as 13.6% of the loans made by private banks to clients in the...","summarize: Tsunami 'to hit Sri Lanka banks' to be even worse hit, it said. Please calm down, it ...",0.5
1,summarize: Jansen suffers a further setback\n \n Blackburn striker Matt Jansen faces three weeks...,"Rovers' assistant manager Mark Bowen said: ""Matt had a small operation to trim knee cartilage.Bo...","summarize: Jansen suffers a further setback, along with kicking a ball, as he is wearing a jerse...",0.412371
2,summarize: China 'to overtake US net use'\n \n The Chinese net-using population looks set to exc...,"The net is key to helping Chinese people keep in touch with friends, said Mr Makower.Already net...",summarize: China 'to overtake US net use'http Â �� The Chinese net-using population looks set to...,0.444444
3,summarize: Virgin Radio offers 3G broadcast\n \n UK broadcaster Virgin Radio says it will become...,"James Cridland, head of new media at Virgin Radio, said: ""It places radio at the heart of the 3G...",summarize: Virgin Radio offers 3G broadcast on selected 2G and high-speed 3G networks. Later thi...,0.76
4,summarize: Doves soar to UK album summit\n \n Manchester rock band Doves have entered the UK alb...,"Manchester rock band Doves have entered the UK album chart at number one with their new release,...","summarize: Doves soar to UK album at number one with their new release, Some Cities. Please Â  ...",0.330508
5,summarize: Prodigy join V Festival line-up\n \n Essex act Prodigy are to headline the second sta...,"A month later at the V Festival, Prodigy will play at Weston Park on Saturday 20 August and Hyla...",summarize: Prodigy join V Festival line-up alongside Athlete and Green Day. The Manchester band ...,0.431373
6,"summarize: Nat Insurance to rise, say Tories\n \n National Insurance will be raised if Labour wi...",Tony Blair has said he does not want higher tax rates for top earners but on Wednesday said othe...,"summarize: Nat Insurance to rise, say Tories to lose if Labour wins the next election, Tory lead...",0.538071
7,summarize: Umbro profits lifted by Euro 2004\n \n UK sportswear firm Umbro has posted a 222% ris...,UK sportswear firm Umbro has posted a 222% rise in annual profit after sales of replica England ...,"summarize: Umbro profits lifted by Euro 2004 tournament.fc iced by UK sportswear firm Umbro, whi...",0.572581
8,"summarize: Mobile TV tipped as one to watch\n \n Scandinavians and Koreans, two of the most adve...","A speech on mobile TV by Angel Gambino of the BBC also drew a large crowd, suggesting that even ...","summarize: Mobile TV tipped as one to watch at the 3GSM World Congress, a mobile trade fair, in ...",0.298319
9,summarize: Moya sidesteps Davis Cup in 2005\n \n Carlos Moya has chosen not to help Spain try an...,"Moya led Spain to victory over the USA but wants to focus on the Grand Slams in 2005, although i...","summarize: Moya sidesteps Davis Cup in 2005, he doesn’t want to help Spain try and defend the Da...",0.69


In [11]:
# Evaluation using BLEU and Rouge
references = results_df['Actual Summary'].tolist()
hypotheses = results_df['Predicted Summary'].tolist()

# BLEU score
bleu_scores = [sentence_bleu([ref.split()], hyp.split()) for ref, hyp in zip(references, hypotheses)]
average_bleu = sum(bleu_scores) / len(bleu_scores)

# Rouge score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
average_rouge = {
    'rouge1': sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores),
    'rouge2': sum(score['rouge2'].fmeasure for score in rouge_scores) / len(rouge_scores),
    'rougeL': sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores),
}

# Print or use the scores as needed
print(f"Average BLEU Score: {average_bleu}")
print(f"Average Rouge Scores: {average_rouge}")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.2637898469437157
Average Rouge Scores: {'rouge1': 0.5223415920414372, 'rouge2': 0.36565577527877463, 'rougeL': 0.3328779792167499}


We save the dataframe with the summaries generated by pre-trained BART model.

In [12]:
df.to_csv('BART_summarized.csv', index=False)