In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch
from sklearn.model_selection import train_test_split

# Define if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define arguments for training
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluation strategy to use
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=4,  # batch size for training
    per_device_eval_batch_size=4,   # batch size for evaluation
    num_train_epochs=3,             # number of training epochs
    weight_decay=0.01,              # strength of weight decay
    logging_dir='./logs',           # directory for storing logs
    logging_steps=10,
    report_to="none"  # Avoid logging to external platforms (optional)
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Load dataset from JSON
data = pd.read_json('formatted_dataset.json')

# Split into training and validation datasets
train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Create a DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})


In [3]:
# Initialize tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)  # Move model to GPU if available

def preprocess_function(examples):
    # Tokenize the inputs and targets
    inputs = tokenizer(examples['Body'], max_length=1024, truncation=True, padding='max_length')
    targets = tokenizer(examples['Abstract'], max_length=150, truncation=True, padding='max_length')
    
    # Create a dictionary with encoded inputs and targets
    model_inputs = inputs
    model_inputs['labels'] = targets['input_ids']
    
    return model_inputs

# Tokenize the dataset
encoded_datasets = datasets.map(preprocess_function, batched=True)


Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 19944/19944 [05:47<00:00, 57.45 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2217/2217 [00:39<00:00, 55.51 examples/s]


In [4]:
# Define a simple metric (ROUGE score for summarization)
from datasets import load_metric
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = metric.compute(predictions=predictions, references=labels)
    return result


  metric = load_metric("rouge")


ValueError: The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

In [6]:
# Define the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=encoded_datasets['train'],         # training dataset
    eval_dataset=encoded_datasets['validation']#,     # evaluation dataset
    #compute_metrics=compute_metrics       # metrics function
)


In [7]:
# Train the model
trainer.train()


  0%|          | 1/14958 [01:55<478:29:02, 115.17s/it]

ValueError: expected sequence of length 1024 at dim 1 (got 386)

In [None]:
# Generate summaries for some example texts
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU if available
    summary_ids = model.generate(inputs["input_ids"], max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Test the summarization
sample_text = "Your long article or body text goes here."
print(generate_summary(sample_text))
