# Setting up Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path= '/content/drive/MyDrive/project/merge_df.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0.1,Unnamed: 0,File_path,Articles,Summaries
0,0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


# Dataset cleaning

In [4]:
import re
import string

In [5]:
# Function to clean text by removing quotes and special characters
def clean_and_lowercase(text):
  return re.sub(r"[^a-zA-Z0-9]","", text).lower()
text_columns = ['Articles', 'Summaries']
for col in text_columns:
  data[col]= data[col].apply(clean_and_lowercase)
data.head()

Unnamed: 0.1,Unnamed: 0,File_path,Articles,Summaries
0,0,politics,budgettosetsceneforelectiongordonbrownwillseek...,increaseinthestampdutythresholdfrom60000afreez...
1,1,politics,armychiefsinregimentsdecisionmilitarychiefsare...,theyareverymuchnotforthegoodandwilldestroyscot...
2,2,politics,howarddeniessplitoveridcardsmichaelhowardhasde...,michaelhowardhasdeniedhisshadowcabinetwassplit...
3,3,politics,observerstomonitorukelectionministerswillinvit...,thereportsaidindividualregistrationshouldbetre...
4,4,politics,kilroynameselectionseattargetexchatshowhostrob...,ukipsleaderrogerknapmanhassaidheisgladtoseethe...


In [6]:
data.rename(columns= {'File_path': 'Catagories'}, inplace= True)
data.head()

Unnamed: 0.1,Unnamed: 0,Catagories,Articles,Summaries
0,0,politics,budgettosetsceneforelectiongordonbrownwillseek...,increaseinthestampdutythresholdfrom60000afreez...
1,1,politics,armychiefsinregimentsdecisionmilitarychiefsare...,theyareverymuchnotforthegoodandwilldestroyscot...
2,2,politics,howarddeniessplitoveridcardsmichaelhowardhasde...,michaelhowardhasdeniedhisshadowcabinetwassplit...
3,3,politics,observerstomonitorukelectionministerswillinvit...,thereportsaidindividualregistrationshouldbetre...
4,4,politics,kilroynameselectionseattargetexchatshowhostrob...,ukipsleaderrogerknapmanhassaidheisgladtoseethe...


In [7]:
data = data.drop(columns=['Unnamed: 0'])

In [8]:
from typing import Mapping
category_mapping = {
    'politics': 0,
    'business': 1,
    'sport': 2,
}

In [9]:
def encode_category(category):
    return category_mapping.get(category, 3)

# Apply the encoding to the 'Catagories' column
data['Encoded_Categories'] = data['Catagories'].apply(encode_category)
data

Unnamed: 0,Catagories,Articles,Summaries,Encoded_Categories
0,politics,budgettosetsceneforelectiongordonbrownwillseek...,increaseinthestampdutythresholdfrom60000afreez...,0
1,politics,armychiefsinregimentsdecisionmilitarychiefsare...,theyareverymuchnotforthegoodandwilldestroyscot...,0
2,politics,howarddeniessplitoveridcardsmichaelhowardhasde...,michaelhowardhasdeniedhisshadowcabinetwassplit...,0
3,politics,observerstomonitorukelectionministerswillinvit...,thereportsaidindividualregistrationshouldbetre...,0
4,politics,kilroynameselectionseattargetexchatshowhostrob...,ukipsleaderrogerknapmanhassaidheisgladtoseethe...,0
...,...,...,...,...
5444,accidents,hongkonghundredsofpilotwhalesthatswamintoashal...,morethan500rescuerstriedfranticallytosendthepi...,3
5445,sports,nicefrancerivreacceptsthecomplimentbutrejectst...,signingbalotelliwasnotjustawaytogarnerinternat...,3
5446,business,frankfurtgermanswhoneverreallywarmeduptotheeur...,althoughtherewasnoevidenceofthatthebundesbanka...,1
5447,sports,charlesoakleyhasstrongfeelingsaboutcompedticke...,hequestionedwhyanynbafreeagentwouldwanttosignw...,3


In [10]:
data.head()

Unnamed: 0,Catagories,Articles,Summaries,Encoded_Categories
0,politics,budgettosetsceneforelectiongordonbrownwillseek...,increaseinthestampdutythresholdfrom60000afreez...,0
1,politics,armychiefsinregimentsdecisionmilitarychiefsare...,theyareverymuchnotforthegoodandwilldestroyscot...,0
2,politics,howarddeniessplitoveridcardsmichaelhowardhasde...,michaelhowardhasdeniedhisshadowcabinetwassplit...,0
3,politics,observerstomonitorukelectionministerswillinvit...,thereportsaidindividualregistrationshouldbetre...,0
4,politics,kilroynameselectionseattargetexchatshowhostrob...,ukipsleaderrogerknapmanhassaidheisgladtoseethe...,0


In [11]:
unique_Catagories= data['Catagories'].unique()
print(unique_Catagories)

['politics' 'sport' 'tech' 'entertainment' 'business' 'crime' 'technology'
 'art' 'science' 'health' 'architecture' 'lifestyle' 'sports' 'law'
 'accidents' 'environment ']


In [12]:
data = data [data['Catagories'] == 'business']
data.head()

Unnamed: 0,Catagories,Articles,Summaries,Encoded_Categories
1714,business,usconsumerconfidenceupconsumersconfidenceinthe...,walmartthelargestusretailerhassaiditsdecembers...,1
1715,business,thetickingbudgetfacingtheusthebudgetproposalsl...,bruteforcebudgetcutsorspendingcapswouldillserv...,1
1716,business,mitsubishiinpeugeotlinktalkstroublehitmitsubis...,troublehitmitsubishimotorsisintalkswithfrenchc...,1
1717,business,bmwrevealsnewmodelspipelinebmwispreparingtoent...,typicallyittakesaboutthreeyearsfromwhenadecisi...,1
1718,business,worldleadersgathertofaceuncertaintymorethan200...,morethan2000businessandpoliticalleadersfromaro...,1


In [13]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
data.loc[:, 'Tokenized_Articles'] = data['Articles'].apply(lambda x: word_tokenize(x))
data.loc[:, 'Is_Business'] = data['Encoded_Categories'].apply(lambda x: 1 if x == 1 else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'Tokenized_Articles'] = data['Articles'].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'Is_Business'] = data['Encoded_Categories'].apply(lambda x: 1 if x == 1 else 0)


In [15]:
data = data.drop(columns=['Encoded_Categories'])

In [16]:
def remove_emojis_and_symbols(text):
    # Remove emojis
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    # Remove non-alphanumeric symbols
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [17]:
data['Articles'] = data['Articles'].apply(remove_emojis_and_symbols)
data['Summaries'] = data['Summaries'].apply(remove_emojis_and_symbols)

In [18]:
data.head()

Unnamed: 0,Catagories,Articles,Summaries,Tokenized_Articles,Is_Business
1714,business,usconsumerconfidenceupconsumersconfidenceinthe...,walmartthelargestusretailerhassaiditsdecembers...,[usconsumerconfidenceupconsumersconfidenceinth...,1
1715,business,thetickingbudgetfacingtheusthebudgetproposalsl...,bruteforcebudgetcutsorspendingcapswouldillserv...,[thetickingbudgetfacingtheusthebudgetproposals...,1
1716,business,mitsubishiinpeugeotlinktalkstroublehitmitsubis...,troublehitmitsubishimotorsisintalkswithfrenchc...,[mitsubishiinpeugeotlinktalkstroublehitmitsubi...,1
1717,business,bmwrevealsnewmodelspipelinebmwispreparingtoent...,typicallyittakesaboutthreeyearsfromwhenadecisi...,[bmwrevealsnewmodelspipelinebmwispreparingtoen...,1
1718,business,worldleadersgathertofaceuncertaintymorethan200...,morethan2000businessandpoliticalleadersfromaro...,[worldleadersgathertofaceuncertaintymorethan20...,1


In [19]:
data.head()

Unnamed: 0,Catagories,Articles,Summaries,Tokenized_Articles,Is_Business
1714,business,usconsumerconfidenceupconsumersconfidenceinthe...,walmartthelargestusretailerhassaiditsdecembers...,[usconsumerconfidenceupconsumersconfidenceinth...,1
1715,business,thetickingbudgetfacingtheusthebudgetproposalsl...,bruteforcebudgetcutsorspendingcapswouldillserv...,[thetickingbudgetfacingtheusthebudgetproposals...,1
1716,business,mitsubishiinpeugeotlinktalkstroublehitmitsubis...,troublehitmitsubishimotorsisintalkswithfrenchc...,[mitsubishiinpeugeotlinktalkstroublehitmitsubi...,1
1717,business,bmwrevealsnewmodelspipelinebmwispreparingtoent...,typicallyittakesaboutthreeyearsfromwhenadecisi...,[bmwrevealsnewmodelspipelinebmwispreparingtoen...,1
1718,business,worldleadersgathertofaceuncertaintymorethan200...,morethan2000businessandpoliticalleadersfromaro...,[worldleadersgathertofaceuncertaintymorethan20...,1


In [20]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup , TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_

In [21]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = 'facebook/bart-base'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [22]:
batch_size = 4
#NUM_PROCS = 4
epoch = 3
max_length = 512

In [23]:

# Define function to tokenize text
def tokenize_text(text):
    try:
        inputs = tokenizer.encode(text, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length', return_attention_mask=True)
        return inputs.to(device)
    except KeyboardInterrupt:
        print("Tokenization interrupted by user.")
        return None

# Define function to tokenize summary
def tokenize_summary(summary):
    try:
        inputs = tokenizer.encode(summary, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length', return_attention_mask=True)
        return inputs.to(device)
    except KeyboardInterrupt:
        print("Tokenization interrupted by user.")
        return None

# Apply tokenization functions to DataFrame columns
data['TokenizedText'] = data['Articles'].apply(tokenize_text)
data['TokenizedSummary'] = data['Summaries'].apply(tokenize_summary)


In [24]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

X_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedText']])
Y_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedSummary']])
X_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedText']])
Y_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedSummary']])

train_dataset = TensorDataset(X_train, Y_train)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = TensorDataset(X_test, Y_test)
test_dataloader = DataLoader(test_dataset, batch_size=4)



In [25]:
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 982
Testing set size: 246


In [26]:
# Create a GradScaler for mixed-precision training
scaler = GradScaler()

# Define training arguments
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the GPU
early_stopping_rounds = 2
best_rouge_score = -1
current_round = 0
accumulation_steps = 20  # Number of gradient accumulation steps
batch_size = 4
epoch = 3
max_length = 512

optimizer = AdamW(model.parameters(), lr=2e-4, weight_decay=0.01, correct_bias=False)  # Use correct_bias=False to match the behavior of Transformers AdamW
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 10)  # Add learning rate scheduler



In [27]:
def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(dataloader, desc="Training")):
        inputs = batch[0].to(device)  # Move the input batch to the GPU
        attention_mask = (inputs != 0).float().to(device)  # Create attention mask
        targets = batch[1].to(device)  # Move the target batch to the GPU

        with autocast():
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=targets, labels=targets)
            loss = outputs.loss

        # Perform gradient accumulation
        loss = loss / accumulation_steps
        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            # Update gradients and optimizer once every accumulation_steps
            clip_grad_norm_(model.parameters(), max_norm=1.0)  # Optional gradient clipping
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [29]:
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
import torch

# Dummy setup (replace with your actual model, optimizer, and scheduler)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=len(train_dataloader) * 10)  # Add learning rate scheduler
early_stopping_rounds = 2
scaler = GradScaler()

# Move model to the correct device
model.to(device)

# Your training function remains the same as provided

# Epoch loop
for epoch in range(10):
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}")
    if scheduler is not None:
        scheduler.step()  # Adjust or remove based on your scheduler's requirements


Training: 100%|██████████| 246/246 [00:21<00:00, 11.20it/s]


Epoch 1, Train Loss: 0.7235


Training: 100%|██████████| 246/246 [00:21<00:00, 11.21it/s]


Epoch 2, Train Loss: 0.7227


Training: 100%|██████████| 246/246 [00:21<00:00, 11.23it/s]


Epoch 3, Train Loss: 0.7143


Training: 100%|██████████| 246/246 [00:21<00:00, 11.24it/s]


Epoch 4, Train Loss: 0.6798


Training: 100%|██████████| 246/246 [00:22<00:00, 11.14it/s]


Epoch 5, Train Loss: 0.6282


Training: 100%|██████████| 246/246 [00:21<00:00, 11.22it/s]


Epoch 6, Train Loss: 0.5877


Training: 100%|██████████| 246/246 [00:21<00:00, 11.26it/s]


Epoch 7, Train Loss: 0.5640


Training: 100%|██████████| 246/246 [00:21<00:00, 11.24it/s]


Epoch 8, Train Loss: 0.5442


Training: 100%|██████████| 246/246 [00:21<00:00, 11.22it/s]


Epoch 9, Train Loss: 0.5264


Training: 100%|██████████| 246/246 [00:22<00:00, 11.15it/s]

Epoch 10, Train Loss: 0.5074





In [27]:
import pandas as pd

# Define your training arguments or hyperparameters as a dictionary
training_args = {
    "device": "cuda if torch.cuda.is_available() else cpu",
    "model_name": "facebook/bart-base",
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "epsilon": 1e-8,
    "num_warmup_steps": 50,
    "num_training_steps": "len(train_dataloader) * 10",
    "early_stopping_rounds": 2,
    "accumulation_steps": 20,
    "batch_size": 4,
    "max_length": 512,
    "optimizer": "AdamW",
    "scheduler": "get_linear_schedule_with_warmup",
    "scaler": "GradScaler",
    "clip_grad_norm_": "clip_grad_norm_",
    "epoch": 10,

}

# Convert the dictionary to a pandas DataFrame
training_df = pd.DataFrame.from_dict(training_args, orient='index', columns=['Value'])

# Add an index column for better visualization
training_df.index.name = 'Hyperparameter'

# Display the DataFrame
training_df


Unnamed: 0_level_0,Value
Hyperparameter,Unnamed: 1_level_1
device,cuda if torch.cuda.is_available() else cpu
model_name,facebook/bart-base
learning_rate,0.0002
weight_decay,0.01
epsilon,0.0
num_warmup_steps,50
num_training_steps,len(train_dataloader) * 10
early_stopping_rounds,2
accumulation_steps,20
batch_size,4


In [31]:
model.save_pretrained("/content/drive/MyDrive/project/bart_base_model")
tokenizer.save_pretrained("/content/drive/MyDrive/project/bart_base_tokenizer")

('/content/drive/MyDrive/project/bart_base_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/project/bart_base_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/project/bart_base_tokenizer/vocab.json',
 '/content/drive/MyDrive/project/bart_base_tokenizer/merges.txt',
 '/content/drive/MyDrive/project/bart_base_tokenizer/added_tokens.json')

# Evaluation(Rouge)

In [1]:
pip install rouge-score



In [2]:

from rouge_score import rouge_scorer

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = '/content/drive/MyDrive/project/bart_base_model'
tokenizer_path = '/content/drive/MyDrive/project/bart_base_tokenizer'

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
import torch

# Check if a GPU is available and set PyTorch to use the GPU, otherwise use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [4]:
!pip install datasets




In [47]:
!pip install --upgrade pyarrow




In [5]:
import pandas as pd
from datasets import load_metric
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = '/content/drive/MyDrive/project/bart_base_model'

tokenizer_path = '/content/drive/MyDrive/project/bart_base_tokenizer'

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
import torch

# Check if a GPU is available and set PyTorch to use the GPU, otherwise use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Ensure your model and tokenizer are correctly loaded and configured

# Assuming 'data' is your DataFrame with columns 'Articles' and 'Summaries'
# data = pd.read_csv('your_dataset.csv')
path= '/content/drive/MyDrive/project/merge_df.csv'
df= pd.read_csv(path)
df.head()

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summaries
def generate_summary(article, tokenizer, model, device):
    try:
        inputs = tokenizer(article, return_tensors="pt", max_length=1024, truncation=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        model.eval()
        with torch.no_grad():
            summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error in generating summary: {e}")
        return ""

# Initialize the rouge metric
rouge = load_metric('rouge')

# Function to compute ROUGE scores
def compute_detailed_rouge_scores(predictions, references):
    scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    detailed_scores = {
        "ROUGE-1 Precision": scores["rouge1"].mid.precision,
        "ROUGE-1 Recall": scores["rouge1"].mid.recall,
        "ROUGE-1 F-measure": scores["rouge1"].mid.fmeasure,
        "ROUGE-2 Precision": scores["rouge2"].mid.precision,
        "ROUGE-2 Recall": scores["rouge2"].mid.recall,
        "ROUGE-2 F-measure": scores["rouge2"].mid.fmeasure,
    }
    return detailed_scores


# Generate predictions for the first N articles for demonstration purposes
N = 10  # Adjust N to process more articles
predicted_summaries = [generate_summary(article, tokenizer, model, device) for article in df['Articles'].head(N)]

# Get the corresponding reference summaries
reference_summaries = df['Summaries'].head(N).tolist()

# Compute detailed ROUGE scores
rouge_results = compute_detailed_rouge_scores(predicted_summaries, reference_summaries)

# Convert the results to a pandas DataFrame for display
rouge_df = pd.DataFrame(list(rouge_results.items()), columns=['Metric', 'Score'])
rouge_df


  rouge = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Unnamed: 0,Metric,Score
0,ROUGE-1 Precision,0.688022
1,ROUGE-1 Recall,0.413103
2,ROUGE-1 F-measure,0.51398
3,ROUGE-2 Precision,0.486871
4,ROUGE-2 Recall,0.296194
5,ROUGE-2 F-measure,0.366684


In [8]:
excel_file_path = '/content/drive/MyDrive/project/rouge_scores.xlsx'  # Adjust the path as necessary
rouge_df.to_excel(excel_file_path, index=False)

# Inference

In [9]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = '/content/drive/MyDrive/project/bart_base_model'

tokenizer_path = '/content/drive/MyDrive/project/bart_base_tokenizer'

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
import torch

# Check if a GPU is available and set PyTorch to use the GPU, otherwise use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [19]:
article = """
Celebrities get to stay in jungleAll four contestants still remain in Im A Celebrity Get Me Out Of Here as no evictions were made on the television show on SaturdayContestants Paul Burrell Joe Pasquale Janet StreetPorter and Fran Cosgrave were told by hosts Ant and Dec Natalie Appletons decision to quit the show last Monday had given them all a stay of execution the group were told Model Sophie Anderton was the last person to be voted off the ITV1 show set in the Australian jungle The four remaining stars will do a joint Bushtucker Trial on SundayFormer All Saints singer Natalie Appleton31 walked out of the show after learning she would face a fifth socalled Bushtucker Trial The celebrities are chosen by the viewers to pass trials in order to win food for the rest of the camp Appleton had endured a torrid time during the programme including a wellpublicised row with Sophie Anderton And on 26 November singer Brian Harvey quit as a contestant after he had a blazing row with Janet StreetPorter
"""

def generate_summary(article, tokenizer, model, device):
    model.eval()
    inputs = tokenizer.encode(article, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(inputs, max_length=1024, min_length=40, length_penalty=2.0, num_beams=17, early_stopping=False)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

print(generate_summary(article, tokenizer, model, device))


Celebrities get to stay in jungleAll four contestants still remain in Im A Celebrity Get Me Out Of Here as no evictions were made on the television show on SaturdayContestants Paul Burrell Joe Pasquale Janet StreetPorter and Fran Cosgrave were told by hosts Ant and Dec Natalie Appletons decision to quit the show last Monday had given them all a stay of execution the group were told Model Sophie Anderton was the last person to be voted off the ITV1 show set in the Australian jungle The four remaining stars will do a joint Bushtucker Trial on SundayFormer All Saints singer Natalie Appleton has walked out of the show after learning she would face a fifth socalled Bushtucking Trial The celebrities are chosen by the viewers to pass trials in order to win food for the rest of the camp Appleton had endured a torrid time during the programme including a wellpublicised row with Sophie anderton And on 26 November singer Brian Harvey quit as a contestant after he had a blazing row with Janet StPo

In [20]:
generated_summary = generate_summary(article, tokenizer, model, device)

reference_summary = """Former All Saints singer Natalie Appleton31 walked out of the show after learning she would face a fifth socalled Bushtucker TrialAll four contestants still remain in Im A Celebrity Get Me Out Of Here as no evictions were made on the television show on SaturdayAnd on 26 November singer Brian Harvey quit as a contestant after he had a blazing row with Janet StreetPorterModel Sophie Anderton was the last person to be voted off the ITV1 show set in the Australian jungle"""


In [21]:
!pip install rouge_score




In [24]:
import pandas as pd
from datasets import load_metric
from transformers import BartForConditionalGeneration, BartTokenizer
import torch


# Assuming 'data' is your DataFrame with columns 'Articles' and 'Summaries'
# data = pd.read_csv('your_dataset.csv')
path= '/content/drive/MyDrive/project/merge_df.csv'
df= pd.read_csv(path)
df.head()

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summaries
def generate_summary(article, tokenizer, model, device):
    try:
        inputs = tokenizer(article, return_tensors="pt", max_length=1024, truncation=True)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        model.eval()
        with torch.no_grad():
            summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error in generating summary: {e}")
        return ""

# Initialize the rouge metric
rouge = load_metric('rouge')

# Function to compute ROUGE scores
def compute_detailed_rouge_scores(predictions, references):
    scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    detailed_scores = {
        "ROUGE-1 Precision": scores["rouge1"].mid.precision,
        "ROUGE-1 Recall": scores["rouge1"].mid.recall,
        "ROUGE-1 F-measure": scores["rouge1"].mid.fmeasure,
        "ROUGE-2 Precision": scores["rouge2"].mid.precision,
        "ROUGE-2 Recall": scores["rouge2"].mid.recall,
        "ROUGE-2 F-measure": scores["rouge2"].mid.fmeasure,
    }
    return detailed_scores


# Generate predictions for the first N articles for demonstration purposes
N = 10  # Adjust N to process more articles
predicted_summaries = [generate_summary(article, tokenizer, model, device) for article in df['Articles'].head(N)]

# Get the corresponding reference summaries
reference_summaries = df['Summaries'].head(N).tolist()

# Compute detailed ROUGE scores
rouge_results = compute_detailed_rouge_scores(predicted_summaries, reference_summaries)

# Convert the results to a pandas DataFrame for display
rouge_df = pd.DataFrame(list(rouge_results.items()), columns=['Metric', 'Score'])
rouge_df

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Unnamed: 0,Metric,Score
0,ROUGE-1 Precision,0.688022
1,ROUGE-1 Recall,0.413103
2,ROUGE-1 F-measure,0.51398
3,ROUGE-2 Precision,0.486871
4,ROUGE-2 Recall,0.296194
5,ROUGE-2 F-measure,0.366684
