In [1]:
# Load model directly
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset, load_from_disk, Dataset

# Fetch the tokenizer and the seq2seq model from the same pretrained checkpoint
# so that the vocabulary and the model weights match.
tokenizer, model = (
    loader.from_pretrained("facebook/bart-base")
    for loader in (BartTokenizer, BartForConditionalGeneration)
)

In [2]:
import pandas as pd
# Read the training and validation splits from their CSV files.
df_train, df_val = map(
    pd.read_csv,
    ('datasets/training.csv', 'datasets/validating.csv'),
)

In [3]:
# Discard every row that has at least one missing value; the raw frames
# (df_train / df_val) are left untouched.
cleaned_training_df, cleaned_validate_df = (
    raw_frame.dropna() for raw_frame in (df_train, df_val)
)

In [4]:
# Tuning knobs gathered in one place.
batch_size = 48  # intended number of examples per batch
# Token caps used when tokenizing: articles first, summaries second.
max_input, max_target = 1024, 128

In [5]:
# Draw a fixed-size random subset (without replacement, rows come back in
# shuffled order) from each cleaned split. The fixed seed keeps the draw
# reproducible between runs.
train_data, val_data = (
    frame.sample(n=subset_size, random_state=56)
    for frame, subset_size in (
        (cleaned_training_df, 15000),
        (cleaned_validate_df, 3000),
    )
)

In [6]:
# Renumber the rows 0..n-1 after sampling and discard the old index labels.
# The result is rebound rather than mutated with inplace=True, so the cell
# reads as a plain transformation and chains cleanly with other pandas calls.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [7]:
# Wrap each sampled pandas frame in a Hugging Face `Dataset`.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, val_data)
)

In [8]:
def preprocess_data(batch_data_to_process):
    """Tokenize a batch of articles and summaries for seq2seq fine-tuning.

    Args:
        batch_data_to_process: Mapping with an ``'article'`` entry and a
            ``'highlights'`` entry, each an iterable of strings of the same
            length (one article and one summary per example).

    Returns:
        The tokenizer output for the articles, padded/truncated to
        ``max_input`` tokens, with an extra ``'labels'`` entry holding the
        summary token ids padded/truncated to ``max_target`` tokens.
        Padding positions in ``'labels'`` are replaced by -100.
    """
    # Pull the raw text columns out of the batch
    inputs = list(batch_data_to_process['article'])
    targets = list(batch_data_to_process['highlights'])

    # Tokenize articles
    model_inputs = tokenizer(
        inputs,
        max_length=max_input,
        padding='max_length',
        truncation=True
    )

    # Tokenize summaries. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target,
        padding='max_length',
        truncation=True
    )

    # Mask the padding in the labels with -100, the index ignored by the
    # cross-entropy loss. Otherwise every summary shorter than `max_target`
    # would also train the model to emit pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels['input_ids']
    ]

    # Return preprocessed data
    return model_inputs

In [9]:
# Run `preprocess_data` over both splits, one batch of rows at a time.
tokenized_train, tokenized_validation = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Training configuration. The run name doubles as the output directory.
args = Seq2SeqTrainingArguments(
    output_dir='epoch4_lrate2e_b48_s15000v3000',  # save directory
    eval_strategy='steps',
    eval_steps=500,
    logging_steps=500,
    learning_rate=2e-5,
    num_train_epochs=4,
    warmup_steps=500,
    # Apply the configured batch size. Without this argument the
    # `batch_size` constant is never used and training silently falls back
    # to the library's default per-device batch size.
    # The evaluation batch size is deliberately left at its default.
    per_device_train_batch_size=batch_size,
    predict_with_generate=True,
    dataloader_num_workers=4,  # 4 worker subprocesses for loading data
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    fp16=False,  # available only with CUDA
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    # NOTE(review): newer transformers releases appear to deprecate
    # `tokenizer=` in favour of `processing_class=` - confirm against the
    # installed version before switching.
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Seq2SeqTrainer(


In [11]:
# Persist the fine-tuned weights and the tokenizer files side by side in the
# run directory, so the pair can be reloaded together from that one path.
model.save_pretrained(save_directory='epoch4_lrate2e_b48_s15000v3000')
tokenizer.save_pretrained(save_directory='epoch4_lrate2e_b48_s15000v3000')

('epoch4_lrate1e_b48_s15000v3000\\tokenizer_config.json',
 'epoch4_lrate1e_b48_s15000v3000\\special_tokens_map.json',
 'epoch4_lrate1e_b48_s15000v3000\\vocab.json',
 'epoch4_lrate1e_b48_s15000v3000\\merges.txt',
 'epoch4_lrate1e_b48_s15000v3000\\added_tokens.json')