In [1]:
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_from_disk
dataset = load_from_disk("bbc_dataset")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 1780
    })
    validation: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 222
    })
    test: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 223
    })
})


In [3]:


# Load a pre-trained BERTSUM tokenizer
model_name = "bert-base-uncased"  # or use another pre-trained model

import nltk
from transformers import BertTokenizer

# Example tokenizer and MAX_SENTENCES (you can replace with your specific tokenizer and MAX_SENTENCES value)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_SENTENCES = 8

def preprocess_function(examples):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for article, summary in zip(examples["Article"], examples["extractive_summary"]):  # Ensure proper unpacking
        sentences = nltk.sent_tokenize(article)[:MAX_SENTENCES]  # Sentence-level tokenization
        tokenized = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        
        input_ids = tokenized["input_ids"][:MAX_SENTENCES]  # Truncate to match MAX_SENTENCES
        attention_mask = tokenized["attention_mask"][:MAX_SENTENCES]  # Add attention_mask
        sent_labels = [1 if sent in summary else 0 for sent in sentences]
        
        # Ensure labels match MAX_SENTENCES
        sent_labels += [0] * (MAX_SENTENCES - len(sent_labels)) if len(sent_labels) < MAX_SENTENCES else sent_labels[:MAX_SENTENCES]

        input_ids_list.append(input_ids.squeeze(0).tolist())  # Remove batch dimension and convert to list
        attention_mask_list.append(attention_mask.squeeze(0).tolist())  # Remove batch dimension and convert to list
        labels_list.append(sent_labels)

    return {"input_ids": input_ids_list, "attention_mask": attention_mask_list, "labels": labels_list}


# Assuming you have a Hugging Face dataset (e.g., 'dataset')
train_dataset = dataset['train'].map(preprocess_function, batched=True)
val_dataset = dataset['validation'].map(preprocess_function, batched=True)




In [4]:
train_dataset


Dataset({
    features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1780
})

In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BERTSUM model
model = BertForSequenceClassification.from_pretrained(model_name)

# Configure the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    remove_unused_columns=False  # Allow unused columns to be ignored
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import evaluate

# Load the ROUGE metric using the new evaluate library
rouge_metric = evaluate.load("rouge")

# Define a function to compute the ROUGE score
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    # Compute ROUGE scores
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result


In [7]:
from torch.utils.data import Dataset
import torch
from transformers import BertTokenizer

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data  # Custom data (e.g., list of dicts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Tokenize your data here (e.g., Article as the input and Summary as the label)
        inputs = self.tokenizer(
            item['Article'], 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors='pt'
        )
        
        # Return the tokenized inputs and the label
        return {
            'input_ids': inputs['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': inputs['attention_mask'].squeeze(0),  # Remove the batch dimension
            'labels': torch.tensor(item['extractive_summary'], dtype=torch.long)  # Assuming 'Summary' is the target
        }
#train_dataset=CustomDataset(train_dataset, tokenizer, MAX_SENTENCES)
#val_dataset=CustomDataset(val_dataset,tokenizer, MAX_SENTENCES)

In [8]:
# Drop unwanted keys from each dictionary

train_dataset = train_dataset.remove_columns(['Title', 'Category','Article', 'Summary', 'extractive_summary'])
val_dataset=val_dataset.remove_columns(['Title', 'Category','Article', 'Summary', 'extractive_summary'])

print(train_dataset)



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1780
})


In [23]:
print(train_dataset)
print(val_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1780
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 222
})


In [14]:
print(type(val_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [19]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [15]:
from torch.utils.data import DataLoader

# Wrap data in a DataLoader to see how it's batched
train_loader = DataLoader(train_dataset, batch_size=4)

for batch in train_loader:
    print(batch)  # Print first batch and exit
    break

{'input_ids': [[tensor([101, 101, 101, 101]), tensor([ 6175,  4966, 21864,  7842]), tensor([ 3385,  6100,  5705, 16294]), tensor([3314, 3860, 4014, 3248]), tensor([ 5432, 13949,  6299,  2091]), tensor([ 2000, 22477,  5829, 13411]), tensor([2502, 2097, 2005, 8069]), tensor([12868,  2022, 18451,  4397]), tensor([ 3885,  6211, 26745,  1011]), tensor([ 1005,  2000,  2140, 10249]), tensor([1055, 6100, 6661, 2827]), tensor([3425, 4283, 1997, 2330]), tensor([2704, 2000, 8301, 3410]), tensor([ 2038,  2047,  2015, 13955]), tensor([ 7420,  3424, 18451,  2102]), tensor([ 2502,  1011, 26745,  7842]), tensor([27428, 24386,  2140, 16294]), tensor([4715, 5761, 1010, 2038]), tensor([ 4599, 14917,  1996,  5451]), tensor([2008, 2011, 2088, 2041]), tensor([2016, 6100, 1005, 2151]), tensor([4122, 3860, 1055, 3382]), tensor([2000, 3813, 2922, 1997]), tensor([ 2718, 26632,  8301,  3045]), tensor([ 2068, 17084,  1011, 13411]), tensor([1000, 1012, 9338, 1999]), tensor([2073,  102, 1010, 1996]), tensor([2009, 

In [26]:

from transformers import DataCollatorWithPadding

# Define the model (BERT in this case)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define the data collator (for padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def custom_collate_fn(features):
    batch = {
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": torch.stack([f["labels"] for f in features]),
    }
    return batch
train_loader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate_fn
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator, # Debugging wrapper
)

trainer.train()  # This will print what is actually received by Trainer



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: too many values to unpack (expected 2)

In [21]:
from inspect import signature
print(signature(model.forward))

(input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None) -> Union[Tuple[torch.Tensor], transformers.modeling_outputs.SequenceClassifierOutput]


In [None]:
# Evaluate the model
results = trainer.evaluate()

# Print the results (ROUGE scores)
print("Validation ROUGE Scores:", results)


In [None]:
# Save the model and tokenizer
model.save_pretrained('./bertsum_finetuned')
tokenizer.save_pretrained('./bertsum_finetuned')
