In [1]:
import torch
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Device selection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Load the train / test / validation splits, one Excel workbook per split.
# Workbooks are read in the order listed and unpacked into the matching names.
train_df, test_df, val_df = [
    pd.read_excel(workbook)
    for workbook in ('TrainData.xlsx', 'TestData.xlsx', 'ValidationData.xlsx')
]

In [4]:
# Data Preprocessing
def preprocess_text(text):
    """Remove list-style brackets and quote characters from a summary cell.

    Leading/trailing ``[`` and ``]`` characters are stripped, then every
    single quote and double quote anywhere in the text is removed.

    Args:
        text: The raw cell value. Normally a ``str``; ``None`` and float NaN
            (what pandas typically yields for an empty spreadsheet cell) are
            treated as missing, and any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``""`` when the value is missing.
    """
    # NaN is the only float that is not equal to itself, so `text != text`
    # detects it without needing numpy/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        # e.g. a numeric cell: clean its textual form instead of raising
        # AttributeError on the missing `.strip` method.
        text = str(text)
    # NOTE(review): this also removes apostrophes inside the text itself
    # (e.g. "don't" -> "dont") — confirm that is intended.
    return text.strip("[]").replace("'", "").replace("\"", "")

# Clean the 'summary' column of every split with the same routine.
for split_df in (train_df, test_df, val_df):
    split_df['summary'] = split_df['summary'].apply(preprocess_text)

In [5]:
# Load the tokenizer and the seq2seq model from the same pretrained
# checkpoint, then move the model onto the selected device.
model_checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [6]:
# Wrap each pandas DataFrame in a `datasets.Dataset` (train, test, validation).
train_dataset, test_dataset, val_dataset = [
    Dataset.from_pandas(frame) for frame in (train_df, test_df, val_df)
]

In [7]:
# Tokenization
def get_feature(batch):
    """Tokenize one batch of examples into model inputs and labels.

    Args:
        batch: Mapping with a 'text' entry (passed as the tokenizer input)
            and a 'summary' entry (passed as ``text_target``).

    Returns:
        A plain dict holding only 'input_ids', 'attention_mask' and 'labels'
        taken from the tokenizer output.
    """
    tokenized = tokenizer(
        batch['text'],
        text_target=batch['summary'],
        max_length=1024,
        truncation=True,
    )
    # Keep just the three fields used downstream, in a fixed order.
    kept_keys = ('input_ids', 'attention_mask', 'labels')
    return {key: tokenized[key] for key in kept_keys}

In [8]:
# Tokenize every split in batches; the splits are processed in the order
# train, test, validation and rebound to their original names.
train_dataset, test_dataset, val_dataset = [
    split.map(get_feature, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
]

Map: 100%|██████████| 96926/96926 [00:32<00:00, 2941.21 examples/s]
Map: 100%|██████████| 13448/13448 [00:05<00:00, 2620.56 examples/s]
Map: 100%|██████████| 14411/14411 [00:04<00:00, 2905.18 examples/s]


In [9]:
# Have each split return the listed columns as torch tensors.
columns = ['input_ids', 'labels', 'attention_mask']
for split in (train_dataset, test_dataset, val_dataset):
    split.set_format(type='torch', columns=columns)

In [10]:
# Seq2seq batch collator, built from the tokenizer and model loaded earlier.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='bart_summaryflow',
    # Schedule: a single pass over the training data.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching: 4 examples per device per forward pass, accumulated over
    # 16 passes, i.e. 4 * 16 = 64 examples per optimizer step per device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    # Cadence (in steps) for logging, evaluation and checkpointing.
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
)

In [12]:
# Assemble the Trainer: model + tokenizer, the training configuration,
# the batch collator, and the train / validation datasets.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [13]:
# Model Training: fine-tune the model with the datasets and arguments held by
# `trainer`. The call is left as the cell's last expression so its return
# value is displayed as the cell output.
trainer.train()

  0%|          | 0/1514 [00:00<?, ?it/s]

In [None]:
# Persist the trained model to the 'bart_summaryflow_model' directory.
trainer.save_model(output_dir='bart_summaryflow_model')

In [None]:
# Score the trained model on the held-out test split and print the metrics.
results = trainer.evaluate(test_dataset)
print(results)