In [1]:
import torch
import nltk
import numpy as np
import pandas as pd
from sympy import false
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download necessary NLP tools
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [57]:
# Set device (MPS if available, otherwise use CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

In [9]:

# Load BBC News Dataset 
df = pd.read_csv("bbc_news_summary_with_articles.csv")

In [11]:
df.head()

Unnamed: 0,Title,Article,Summary,Category
0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment
1,262,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment
2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment
3,60,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment
4,74,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment


In [12]:
df=df.sample(frac=0.05, replace=True, random_state=1)

In [13]:
df.size

444

In [14]:
# Preprocess text
def preprocess_text(text):
    return nltk.tokenize.sent_tokenize(text)

# Generate sentence embeddings
def get_sentence_embeddings(sentences):
    embeddings = sbert_model.encode(sentences, convert_to_tensor=True)
    return embeddings.to(device)  # Ensure embeddings are moved to the same device

# Define Reinforcement Learning Agent
class RLAgent(torch.nn.Module):
    def __init__(self, embedding_dim, num_sentences):
        super(RLAgent, self).__init__()
        self.fc = torch.nn.Linear(embedding_dim, num_sentences).to(device)  # Move to device
        self.softmax = torch.nn.Softmax(dim=0)
    
    def forward(self, embeddings):
        scores = self.fc(embeddings)
        return self.softmax(scores)

# Reward Function (ROUGE + Diversity Score)
def reward_function(selected_sentences, reference_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
    rouge_scores = np.mean([scorer.score(reference_summary, sent)['rouge1'].fmeasure for sent in selected_sentences])
    diversity_score = len(set(selected_sentences)) / max(1, len(selected_sentences))
    return rouge_scores + 0.5 * diversity_score  # Weighted sum

def compute_reward(generated_summary, reference_summary):
    rouge_score = compute_rouge(generated_summary, reference_summary)
    diversity_score = compute_diversity(generated_summary)
    readability_score = compute_readability(generated_summary)
    return 0.5 * rouge_score + 0.3 * diversity_score + 0.2 * readability_score


In [15]:
# Train RL Agent
def train_rl_agent(df, num_epochs=3, learning_rate=0.01):
    agent = RLAgent(384, 10).to(device)  # Ensure model runs on GPU if available
    optimizer = torch.optim.Adam(agent.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        total_reward = 0
        for _, row in tqdm(df.iterrows(), total=len(df)):
            article, reference_summary = row['Article'], row['Summary']
            sentences = preprocess_text(article)
            embeddings = get_sentence_embeddings(sentences)
            
            optimizer.zero_grad()
            
            probs = agent(embeddings)  # Get probability scores for each sentence
            probs = probs.squeeze()  # Remove extra dimensions if needed

            # ✅ Ensure probabilities are valid (Avoids division errors)
            if probs.numel() == 0 or torch.all(probs == 0):
                print("Skipping due to empty or zero probability distribution.")
                continue
            
            # ✅ Ensure num_samples is within a valid range
            num_sentences = len(sentences)
            num_samples = min(max(num_sentences // 3, 1), len(probs))

            # ✅ Check if we have enough elements to sample
            # if num_samples <= len(probs):  
            #     chosen_indices = torch.multinomial(probs, num_samples=num_samples, replacement=False)
            # else:
            #     chosen_indices = torch.arange(len(probs))  # Select all available indices
            chosen_indices = torch.arange(len(probs))
            # ✅ Convert chosen indices to a flat list
            chosen_indices = chosen_indices.cpu().numpy().flatten().tolist()
            selected_sentences = [sentences[i] for i in chosen_indices]
            
            reward = reward_function(selected_sentences, reference_summary)
            loss = -torch.log(probs[chosen_indices] + 1e-8).sum() * reward  # Avoid log(0)
            
            loss.backward()
            optimizer.step()
            total_reward += reward
        
        print(f"Epoch {epoch+1}: Avg Reward = {total_reward / len(df):.4f}")
    
    return agent




In [9]:
trained_agent = train_rl_agent(df)

100%|██████████| 2225/2225 [06:17<00:00,  5.89it/s]


Epoch 1: Avg Reward = 0.6729


100%|██████████| 2225/2225 [02:40<00:00, 13.87it/s]


Epoch 2: Avg Reward = 0.6729


100%|██████████| 2225/2225 [02:40<00:00, 13.90it/s]

Epoch 3: Avg Reward = 0.6729





In [13]:
# Evaluate model performance
def evaluate_model(agent, df,type):
    total_rouge_score = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        article, reference_summary = row['Article'], row['Summary']
        sentences = preprocess_text(article)
        embeddings = get_sentence_embeddings(sentences)

        probs = agent(embeddings).squeeze()  # Get probability scores
        probs = probs.detach().cpu()  # Move to CPU for processing

        # ✅ Ensure probabilities are valid
        if probs.numel() == 0 or torch.all(probs == 0):
            print("Skipping due to empty or zero probability distribution.")
            continue

        # ✅ Ensure valid number of sentences
        num_samples = min(max(len(sentences) // 3, 1), len(probs))

        # ✅ Get selected sentence indices
        # if num_samples <= len(probs):  
        #     chosen_indices = torch.multinomial(probs, num_samples=num_samples, replacement=False)
        # else:
        chosen_indices = torch.arange(len(probs))  # Select all available indices

        # ✅ Convert `chosen_indices` to a **flat list of integers**
        chosen_indices = chosen_indices.cpu().numpy().flatten().tolist()

        # ✅ Fix: Ensure `chosen_indices` are integers
        selected_sentences = [sentences[int(i)] for i in chosen_indices]  # Convert to int before indexing

        # Compute ROUGE score
        generated_summary = " ".join(selected_sentences)
        rouge_score = compute_rouge(generated_summary, reference_summary,type)
        total_rouge_score += rouge_score

    avg_rouge = total_rouge_score / len(df)
    print(f"Average ROUGE Score {type}: {avg_rouge:.4f}")
    return avg_rouge


In [14]:
from rouge_score import rouge_scorer

def compute_rouge(generated_summary, reference_summary,type):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)

    # Compute the average ROUGE score
    rouge1 = scores['rouge1'].fmeasure
    rouge2 = scores['rouge2'].fmeasure
    rougeL = scores['rougeL'].fmeasure
    avg_rouge = (rouge1 + rouge2 + rougeL) / 3  # Average ROUGE score
    if(type=="rouge1"):
        return rouge1
    if(type=="rouge2"):
        return rouge2
    if(type=="rougeL"):
        return rougeL
    
    return avg_rouge


In [15]:
evaluate_model(trained_agent, df,"rouge1")
evaluate_model(trained_agent, df, "rouge2")
evaluate_model(trained_agent, df, "rougeL")
evaluate_model(trained_agent, df, "")

100%|██████████| 2225/2225 [01:35<00:00, 23.28it/s]


Average ROUGE Score rouge1: 0.6162


100%|██████████| 2225/2225 [01:34<00:00, 23.50it/s]


Average ROUGE Score rouge2: 0.5949


100%|██████████| 2225/2225 [01:35<00:00, 23.30it/s]


Average ROUGE Score rougeL: 0.3843


100%|██████████| 2225/2225 [01:34<00:00, 23.59it/s]

Average ROUGE Score : 0.5318





0.531794009110535

In [None]:
# finetune SBERT    

In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, datasets

from datasets import load_dataset
import random
import numpy as np
from rouge_score import rouge_scorer

In [64]:
df.head()

Unnamed: 0,Title,Article,Summary,Category
1061,405,Ireland 19-13 England\n\nIreland consigned Eng...,O'Gara missed a penalty which would have put I...,sport
235,352,Vera Drake's Bafta triumph hope\n\nAt the Baft...,"""If Mike Leigh is going to win awards for anyt...",entertainment
1096,38,Radcliffe proves doubters wrong\n\nThis won't ...,And a lot of people were wondering what would ...,sport
905,100,Mido makes third apology\n\nAhmed 'Mido' Hossa...,Shalaby earlier said that after an apology Mid...,sport
960,502,Minister digs in over doping row\n\nThe Belgia...,Dechy said.The Belgian sports minister at the ...,sport


In [65]:
train_df=df

In [66]:
train_df=df[['Article', 'Summary']]

In [93]:
train_df.dropna(subset=['Article', 'Summary'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(subset=['Article', 'Summary'], inplace=True)


In [111]:
import torch
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import nltk
import os
# Download NLTK packages
nltk.download("punkt")

# Set the environment variable to allow more memory usage on MPS (Metal Performance Shaders)
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Load BBC News dataset (replace with actual dataset path if local)
dataset = load_dataset("csv", data_files="bbc_news_summary_with_articles.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [116]:
# Split dataset into train/test
dataset = dataset["train"].train_test_split(test_size=0.1)

# Load pre-trained BART tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Preprocess the dataset
def preprocess_function(examples):
    # Tokenize the input articles
    model_inputs = tokenizer(examples["Article"], max_length=512, truncation=True, padding="max_length")
    
    # Tokenize the summaries as target
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Summary"], max_length=100, truncation=True, padding="max_length")
    
    # Add labels to the inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the BART model for summarization
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Set device to MPS or CPU
device = torch.device("mps" if torch.has_mps else "cpu")  # Use MPS if available, otherwise fallback to CPU
model.to(device)

# Define training arguments with optimizations for memory and ensuring no fp16 usage
training_args = TrainingArguments(
    output_dir="./bart_summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduce batch size to fit in memory
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients over multiple steps
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    logging_dir="./logs",
    fp16=False,  # Ensure fp16 is disabled to avoid issues with MPS
    no_cuda=True,  # Disable CUDA (which is for GPU) completely
)

# Data collator for Seq2Seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()


Map: 100%|██████████| 1312/1312 [00:03<00:00, 349.91 examples/s]
Map: 100%|██████████| 146/146 [00:00<00:00, 307.29 examples/s]
  device = torch.device("mps" if torch.has_mps else "cpu")  # Use MPS if available, otherwise fallback to CPU
  loaded_dict = _convert_str_dict(loaded_dict)
  
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.320352
2,No log,0.303818
3,No log,0.319727


  


TrainOutput(global_step=492, training_loss=0.31143762231842287, metrics={'train_runtime': 7631.6303, 'train_samples_per_second': 0.516, 'train_steps_per_second': 0.064, 'total_flos': 4264861856956416.0, 'train_loss': 0.31143762231842287, 'epoch': 3.0})

In [117]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./bert_summarization_finetune/bbc_bart_summarization")
tokenizer.save_pretrained("./bert_summarization_finetune/bbc_bart_summarization")

('./bert_summarization_finetune/bbc_bart_summarization/tokenizer_config.json',
 './bert_summarization_finetune/bbc_bart_summarization/special_tokens_map.json',
 './bert_summarization_finetune/bbc_bart_summarization/vocab.json',
 './bert_summarization_finetune/bbc_bart_summarization/merges.txt',
 './bert_summarization_finetune/bbc_bart_summarization/added_tokens.json')

In [156]:
# Function to generate summaries
def generate_summary(article_text, model, tokenizer):
    inputs = tokenizer(article_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(model.device)
    with torch.no_grad():
        summary_ids = model.generate(inputs.input_ids, max_length=350, min_length=50, length_penalty=2.0, num_beams=4)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [157]:
#tokenizer = BartTokenizer.from_pretrained("./bert_summarization_finetune/bbc_bart_summarization")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Load the fine-tuned model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
# Test on a sample article from the test dataset
sample_article = dataset["train"][0]["Article"]
sample_summary=  dataset["train"][0]["Summary"]
generated_summary = generate_summary(sample_article, model, tokenizer)

print("Original Article:", sample_article)
print("\nGenerated Summary:", generated_summary)
print("\nSample Summary:", sample_summary)

Original Article: Musicians to tackle US red tape

Musicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic.

A singer hoping to perform in the US can expect to pay $1,300 (£680) simply for obtaining a visa. Groups including the Musicians' Union are calling for an end to the "raw deal" faced by British performers. US acts are not faced with comparable expense and bureaucracy when visiting the UK for promotional purposes.

Nigel McCune from the Musicians' Union said British musicians are "disadvantaged" compared to their US counterparts. A sponsor has to make a petition on their behalf, which is a form amounting to nearly 30 pages, while musicians face tougher regulations than athletes and journalists. "If you make a mistake on your form, you risk a five-year ban and thus the ability to further your career," says Mr McCune.

"The US is the world's biggest music market, which means something has to be done 

In [149]:

# Load the BBC news dataset (ensure it's correctly loaded)
dataset = load_dataset("csv", data_files="bbc_news_summary_with_articles.csv")
# Load the pre-trained BART tokenizer
tokenizer = BartTokenizer.from_pretrained("./bert_summarization_finetune/bbc_bart_summarization")
# Load the fine-tuned model
model = BartForConditionalGeneration.from_pretrained("./bert_summarization_finetune/bbc_bart_summarization")
# Se
# Set the device to CPU
device = torch.device("cpu")
model.to(device)

# Preprocess function for tokenizing input articles
def preprocess_function(examples):
    model_inputs = tokenizer(examples["Article"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Summary"], max_length=100, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset (only for evaluation)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
print(tokenized_datasets)
# Function to generate summaries using the model
def generate_summary(article_text, model, tokenizer):
    inputs = tokenizer(article_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(device)  # Ensure the inputs are on the CPU
    with torch.no_grad():
        summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=50, length_penalty=2.0, num_beams=4)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Evaluate manually and compute ROUGE scores
def evaluate_model(dataset, model, tokenizer):
    predictions = []
    references = []
    
    for article, summary in zip(dataset["Article"], dataset["Summary"]):
        generated_summary = generate_summary(article, model, tokenizer)
        predictions.append(generated_summary)
        references.append(summary)
    
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE scores
    rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge_scores["rouge1"].append(scores['rouge1'].fmeasure)
        rouge_scores["rouge2"].append(scores['rouge2'].fmeasure)
        rouge_scores["rougeL"].append(scores['rougeL'].fmeasure)

    # Average the scores
    avg_scores = {key: sum(value) / len(value) for key, value in rouge_scores.items()}
    return avg_scores

# Evaluate on the test set
validation_results = evaluate_model(tokenized_datasets["train"], model, tokenizer)
print("Validation ROUGE Scores:", validation_results)



DatasetDict({
    train: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2225
    })
})


  if max_position_embeddings is not None:


Validation ROUGE Scores: {'rouge1': 0.10676138348564734, 'rouge2': 0.0996934416848245, 'rougeL': 0.0900784962347142}
