In [26]:
import nltk
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from datasets import load_from_disk
# Load dataset (replace with your actual dataset loading code)
dataset = load_from_disk("bbc_dataset") 


In [29]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader
import nltk
nltk.download('punkt')

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Constants
MAX_SENTENCES = 8
SEQ_LENGTH = 128
BATCH_SIZE = 4

# 1. Preprocessing Function
def preprocess_function(examples):
    tokenized_articles = []
    tokenized_masks = []
    labels_list = []

    for article, summary in zip(examples["Article"], examples["extractive_summary"]):
        # Clean inputs
        article = " ".join(article) if isinstance(article, list) else article
        summary = " ".join(summary) if isinstance(summary, list) else summary
        
        # Tokenize sentences
        sentences = nltk.sent_tokenize(article)[:MAX_SENTENCES]
        num_sentences = len(sentences)
        
        # Tokenize all sentences
        tokenized = tokenizer(
            sentences,
            padding="max_length",
            truncation=True,
            max_length=SEQ_LENGTH,
            return_tensors="pt"
        )
        
        # Create labels (1 if sentence is in summary)
        labels = [1 if sent in summary else 0 for sent in sentences]
        labels += [0] * (MAX_SENTENCES - num_sentences)  # Pad labels
        
        # Pad tensors
        padded_input_ids = torch.zeros((MAX_SENTENCES, SEQ_LENGTH), dtype=torch.long)
        padded_attention_mask = torch.zeros((MAX_SENTENCES, SEQ_LENGTH), dtype=torch.long)
        
        padded_input_ids[:num_sentences] = tokenized["input_ids"]
        padded_attention_mask[:num_sentences] = tokenized["attention_mask"]
        
        tokenized_articles.append(padded_input_ids)
        tokenized_masks.append(padded_attention_mask)
        labels_list.append(torch.tensor(labels, dtype=torch.float))

    return {
        "input_ids": tokenized_articles,
        "attention_mask": tokenized_masks,
        "labels": labels_list
    }

# 2. Apply Preprocessing
train_dataset = dataset["train"].map(preprocess_function, batched=True)
val_dataset = dataset["validation"].map(preprocess_function, batched=True)

# Remove unnecessary columns
columns_to_remove = ["Title", "Article", "Summary", "Category", "extractive_summary"]
train_dataset = train_dataset.remove_columns([col for col in columns_to_remove if col in train_dataset.column_names])
val_dataset = val_dataset.remove_columns([col for col in columns_to_remove if col in val_dataset.column_names])

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 3. Custom Collate Function
def collate_fn(batch):
    return {
        "input_ids": torch.stack([item["input_ids"] for item in batch]),
        "attention_mask": torch.stack([item["attention_mask"] for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch])
    }

# 4. Initialize Model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,  # Binary classification per sentence
    problem_type="multi_label_classification"
)

# 5. Custom Trainer Class
class SentenceTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # Add **kwargs
        # Reshape inputs: [batch, sentences, seq_len] -> [batch*sentences, seq_len]
        batch_size, num_sentences, seq_len = inputs["input_ids"].shape
        flat_inputs = {
            "input_ids": inputs["input_ids"].view(-1, seq_len),
            "attention_mask": inputs["attention_mask"].view(-1, seq_len)
        }
        
        # Forward pass
        outputs = model(**flat_inputs)
        logits = outputs.logits.view(batch_size, num_sentences)
        
        # BCEWithLogitsLoss for multi-label classification
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, inputs["labels"])
        
        return (loss, outputs) if return_outputs else loss

# 6. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

# 7. Create Trainer
# 2. Initialize Trainer with the fixed class
trainer = SentenceTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
)

# 3. Start training
trainer.train()

# 9. Save the final model
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Map: 100%|██████████| 222/222 [00:00<00:00, 1077.55 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.3952,0.349781
1000,0.2644,0.339605


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [66]:
import torch

def generate_summary(model, tokenizer, text, device):
    model.eval()
    
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move inputs to the correct device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        output = model(**inputs)  # Forward pass on MPS
        logits = output.logits.squeeze(-1)  # Extract logits
        
        # Ensure logits are moved to CPU before processing
        logits = logits.cpu()

        # Select sentences using thresholding
        predicted_labels = (logits > 0.5).int()
        
        sentences = text.split(". ")  # Sentence tokenization
        min_length = min(len(sentences), len(predicted_labels))
        # print("Logits:", logits)
        # print("Predicted Labels:", predicted_labels)
        # print("Sentences:", sentences)
        selected_sentences = [sentences[i] for i in range(min_length) if predicted_labels[i] == 1]        
        summary = " ".join(selected_sentences)
        return summary

# Detect MPS device on Mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)  # Move model to MPS


# Example Usage:
text = """
Jamieson issues warning to bigots\n\nScotland's justice minister has warned bigoted soccer fans that she wants to hit them "where it hurts most" by banning them from matches.\n\nCathy Jamieson said exclusion orders are one of a series of measures being considered in the Scottish Executive campaign against sectarianism. She praised Celtic and Rangers for their work in tackling the problem. However, the minister said stopping sectarian abuse associated with Old Firm matches is a key objective. Ms Jamieson was speaking ahead of the third round Scottish Cup clash between the Glasgow clubs at Parkhead on Sunday. The sectarianism long associated with sections of the support from both clubs has become a significant target for the executive. Last week Ms Jamieson and First Minister Jack McConnell met supporters' representatives from both clubs to discuss the issue.\n\nThey plan to hold an anti-sectarian summit next month with officials from the clubs, church leaders, senior police officers and local authority chiefs among those to be invited. Speaking on BBC Radio Scotland's Sunday Live programme, Ms Jamieson described Friday's meeting as "very productive" and said putting the squeeze on the bigots would be a key aim. Ms Jamieson stressed that sectarianism has not been confined to football but it can act as a "trigger" for tensions and violence. Clubs have taken action in the past to ban troublesome fans and supporters' groups expressed their desire to ensure that the game is no longer tainted by the problem.\n\nMs Jamieson said the executive should have a role in tackling the soccer troublemakers. She said: "We can't get away from the fact that in some instances some of the religious hatred that some people try to associate with football boils over into violence. "That is the kind of thing we want to stop and that's the kind of thing supporters' groups are very clear they don't want to be part of either, and they will work with us to try and deal with that."\n\nMs Jamieson praised the police for their action and said: "The police do want to identify whether there are particular individuals who are going over the top and inciting hatred or violence - they will crack down very effectively on them. "We have of course already indicated that we will consider the introduction of banning orders to give additional powers to where there are people who are going over the top, who have made inappropriate behaviour at football matches, to be able to stop them attending the games. "That's the kind of thing that will hit those kind of people where it hurts the most in not allowing them to attend the games," she said. Praising Celtic and Rangers for their efforts, she said: "I don't think there is any doubt that we have seen some positive moves from the clubs. "Both Rangers and Celtic football clubs have been involved in working with the executive to produce, for example, an educational pack for  """
summary = generate_summary(model, tokenizer, text, device)
print("Generated Summary:", summary)



Generated Summary: 

Scotland's justice minister has warned bigoted soccer fans that she wants to hit them "where it hurts most" by banning them from matches.

Cathy Jamieson said exclusion orders are one of a series of measures being considered in the Scottish Executive campaign against sectarianism


In [30]:
# Evaluate the model on the validation dataset
results = trainer.evaluate()

# Print evaluation results (loss, accuracy, etc.)
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.33960482478141785, 'eval_runtime': 14.5197, 'eval_samples_per_second': 15.29, 'eval_steps_per_second': 3.857, 'epoch': 3.0}


In [36]:
# Run evaluation on validation set
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Keys in eval_results:
# - eval_loss: Average loss across validation set
# - eval_runtime: Time taken
# - eval_samples_per_second: Throughput
# - eval_steps_per_second

Evaluation results: {'eval_loss': 0.33960482478141785, 'eval_runtime': 14.9391, 'eval_samples_per_second': 14.86, 'eval_steps_per_second': 3.749, 'epoch': 3.0}


In [37]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Convert logits to binary predictions (0 or 1)
    preds = (torch.sigmoid(torch.tensor(predictions)) > 0.5).int().numpy()
    
    # Flatten all sentences across all articles
    flat_preds = preds.ravel()
    flat_labels = labels.ravel()
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        flat_labels, flat_preds, average='binary', zero_division=0
    )
    acc = accuracy_score(flat_labels, flat_preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Update trainer initialization:
trainer = SentenceTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,  # Add this line
)

In [58]:
def evaluate_model(dataset, trainer, tokenizer, max_sentences=8):
    """
    Fixed evaluation function that handles:
    - Sample alignment
    - Tensor/list conversions
    - Metric calculations
    """
    # 1. Preprocess dataset to match model inputs
    processed_dataset = dataset.map(
        lambda x: preprocess_function({'Article': [x['Article']], 
                                     'extractive_summary': [x['extractive_summary']]}),
        batched=True,
        remove_columns=dataset.column_names
    )
    
    # 2. Convert to torch tensors
    processed_dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels']
    )
    
    # 3. Get predictions in batches
    try:
        predictions = trainer.predict(processed_dataset)
        pred_logits = predictions.predictions
    except Exception as e:
        print(f"Prediction failed: {e}")
        return None
    
    # 4. Convert to binary predictions
    pred_probs = torch.sigmoid(torch.tensor(pred_logits))
    final_preds = (pred_probs > 0.5).int().numpy()
    
    # 5. Get original texts
    articles = dataset["Article"]
    true_summaries = dataset["extractive_summary"]
    
    # 6. Verify alignment
    if len(articles) != len(final_preds):
        print(f"Alignment warning: {len(articles)} articles vs {len(final_preds)} predictions")
        min_len = min(len(articles), len(final_preds))
        articles = articles[:min_len]
        true_summaries = true_summaries[:min_len]
        final_preds = final_preds[:min_len]
    
    # 7. Calculate metrics
    results = {
        'correct': 0,
        'total_predicted': 0,
        'total_actual': 0,
        'total_sentences': 0
    }
    
    for i, (article, true_summary) in enumerate(zip(articles, true_summaries)):
        sentences = nltk.sent_tokenize(str(article))[:max_sentences]
        true_labels = [1 if str(sent) in str(true_summary) else 0 for sent in sentences]
        true_labels += [0] * (max_sentences - len(true_labels))
        
        # Handle prediction alignment
        article_preds = final_preds[i][:len(true_labels)]
        
        for true, pred in zip(true_labels, article_preds):
            results['total_sentences'] += 1
            if pred == 1:
                results['total_predicted'] += 1
                if true == 1:
                    results['correct'] += 1
            if true == 1:
                results['total_actual'] += 1
    
    # 8. Calculate final metrics
    eps = 1e-9
    metrics = {
        'precision': results['correct'] / (results['total_predicted'] + eps),
        'recall': results['correct'] / (results['total_actual'] + eps),
        'f1': 2 * (precision * recall) / (precision + recall + eps),
        'accuracy': results['correct'] / (results['total_sentences'] + eps),
        'num_samples': len(articles)
    }
    
    # 9. Calculate ROUGE scores if possible
    try:
        pred_summaries = generate_summaries(articles, final_preds)
        rouge_scores = calculate_rouge(pred_summaries, true_summaries)
        metrics.update({
            'rouge1': rouge_scores['rouge1'],
            'rouge2': rouge_scores['rouge2'],
            'rougeL': rouge_scores['rougeL']
        })
    except Exception as e:
        print(f"ROUGE calculation failed: {e}")
    
    return metrics

In [59]:
evaluate_model(dataset["validation"], trainer, tokenizer)

Map: 100%|██████████| 222/222 [00:00<00:00, 4184.67 examples/s]


Prediction failed: Found input variables with inconsistent numbers of samples: [8, 1]


In [50]:
def evaluate_sentence_selection(articles, true_summaries, preds, tokenizer):
    results = {
        'correct_selections': 0,
        'total_selections': 0,
        'total_sentences': 0,
        'total_relevant': 0  # Added counter for relevant sentences
    }
    
    for article, true_summary, article_preds in zip(articles, true_summaries, preds):
        # Convert to string in case inputs aren't text
        article = str(article)
        true_summary = str(true_summary)
        
        sentences = nltk.sent_tokenize(article)[:MAX_SENTENCES]
        summary_sentences = nltk.sent_tokenize(true_summary)
        
        # Create binary labels (1 if sentence is in summary)
        true_labels = [1 if sent in true_summary else 0 for sent in sentences]
        
        # Pad to MAX_SENTENCES if needed
        true_labels += [0] * (MAX_SENTENCES - len(true_labels))
        
        # Ensure predictions match length
        article_preds = article_preds[:len(true_labels)]
        
        # Update counters
        for true, pred in zip(true_labels, article_preds):
            results['total_sentences'] += 1
            if pred == 1:
                results['total_selections'] += 1
                if true == 1:
                    results['correct_selections'] += 1
            if true == 1:
                results['total_relevant'] += 1
    
    # Calculate metrics with epsilon to avoid division by zero
    eps = 1e-9
    precision = results['correct_selections'] / (results['total_selections'] + eps)
    recall = results['correct_selections'] / (results['total_relevant'] + eps)  # Fixed recall calculation
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    
    return {
        'sentence_precision': precision,
        'sentence_recall': recall,
        'sentence_f1': f1,
        'selection_accuracy': results['correct_selections'] / (results['total_sentences'] + eps),
        'num_samples': len(articles)
    }

In [53]:
from rouge_score import rouge_scorer

def calculate_rouge(predicted_summaries, reference_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    
    for pred, ref in zip(predicted_summaries, reference_summaries):
        if isinstance(pred, list):
            pred = ' '.join(pred)
        if isinstance(ref, list):
            ref = ' '.join(ref)
        scores.append(scorer.score(ref, pred))
    
    # Average scores
    avg_scores = {
        'rouge1': {'precision':0, 'recall':0, 'fmeasure':0},
        'rouge2': {'precision':0, 'recall':0, 'fmeasure':0},
        'rougeL': {'precision':0, 'recall':0, 'fmeasure':0}
    }
    
    for score in scores:
        for key in avg_scores:
            for metric in ['precision', 'recall', 'fmeasure']:
                avg_scores[key][metric] += score[key][metric]
    
    for key in avg_scores:
        for metric in ['precision', 'recall', 'fmeasure']:
            avg_scores[key][metric] /= len(scores)
    
    return avg_scores

# Generate predicted summaries
def generate_summaries(articles, preds):
    summaries = []
    for article, article_preds in zip(articles, preds):
        sentences = nltk.sent_tokenize(article)[:MAX_SENTENCES]
        selected = [sent for sent, pred in zip(sentences, article_preds[:len(sentences)]) if pred == 1]
        summaries.append(' '.join(selected))
    return summaries
test_articles=dataset["test"]['Article']
test_summaries=dataset["test"]['extractive_summary']
final_preds=dataset["test"]['extractive_summary']
pred_summaries = generate_summaries(test_articles, final_preds)
rouge_scores = calculate_rouge(pred_summaries, test_summaries)
print("ROUGE scores:", rouge_scores)

TypeError: tuple indices must be integers or slices, not str

In [52]:
import matplotlib.pyplot as plt

def plot_metrics(metrics_dict):
    fig, ax = plt.subplots(1, 3, figsize=(15,5))
    
    # Precision-Recall-F1
    metrics = ['precision', 'recall', 'f1']
    values = [metrics_dict[m] for m in metrics]
    ax[0].bar(metrics, values)
    ax[0].set_title('Sentence Classification Metrics')
    ax[0].set_ylim(0,1)
    
    # ROUGE Scores
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    fmeasures = [metrics_dict['rouge'][t]['fmeasure'] for t in rouge_types]
    ax[1].bar(rouge_types, fmeasures)
    ax[1].set_title('ROUGE F1 Scores')
    ax[1].set_ylim(0,1)
    
    # Selection Distribution
    ax[2].pie(
        [metrics_dict['correct_selections'], 
        metrics_dict['incorrect_selections']],
        labels=['Correct', 'Incorrect'],
        autopct='%1.1f%%'
    )
    ax[2].set_title('Selection Accuracy')
    
    plt.tight_layout()
    plt.show()

# Combine all metrics
full_metrics = {
    **metrics,
    'rouge': rouge_scores,
    'correct_selections': results['correct_selections'],
    'incorrect_selections': results['total_selections'] - results['correct_selections']
}
plot_metrics(full_metrics)

NameError: name 'metrics' is not defined