In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer

class SummarizationDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.dataset = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.iloc[idx, 0]
        summary = self.dataset.iloc[idx, 1]
        
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer.encode_plus(
            summary,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

train_dataset = SummarizationDataset('/home/mohan/infy/data/merged/final/train.csv', tokenizer)
val_dataset = SummarizationDataset('/home/mohan/infy/data/merged/final/validation.csv', tokenizer)
test_dataset = SummarizationDataset('/home/mohan/infy/data/merged/final/test.csv', tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)


In [2]:
import torch
from transformers import BartForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model = model.to(device)


In [3]:
import time
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=0.001)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()

start_time = time.time()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    epoch_start_time = time.time()
    total_loss = 0
    
    for step, batch in enumerate(train_loader):
        
        batch_start_time = time.time()    
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
               
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        progress_bar.update(1)
        
        total_loss += loss.item()
        
        current_lr = lr_scheduler.get_last_lr()[0]
        batch_time = time.time() - batch_start_time
        print(f"Epoch {epoch + 1} | Step {step + 1}/{len(train_loader)} | "
              f"Batch Loss: {loss.item():.4f} | Learning Rate: {current_lr:.6f} | "
              f"Batch Time: {batch_time:.2f}s")
    
    avg_loss = total_loss / len(train_loader)
    epoch_time = time.time() - epoch_start_time
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f} | "
          f"Epoch Time: {epoch_time:.2f}s")
    
total_training_time = time.time() - start_time
print(f"Training completed in {total_training_time:.2f}s")




  0%|          | 0/37626 [00:00<?, ?it/s]

Epoch 1/3
Epoch 1 | Step 1/12542 | Batch Loss: 13.9620 | Learning Rate: 0.001000 | Batch Time: 1.29s
Epoch 1 | Step 2/12542 | Batch Loss: 15.4104 | Learning Rate: 0.001000 | Batch Time: 0.89s
Epoch 1 | Step 3/12542 | Batch Loss: 15.1106 | Learning Rate: 0.001000 | Batch Time: 0.72s
Epoch 1 | Step 4/12542 | Batch Loss: 16.2638 | Learning Rate: 0.001000 | Batch Time: 0.72s
Epoch 1 | Step 5/12542 | Batch Loss: 12.4789 | Learning Rate: 0.001000 | Batch Time: 0.75s
Epoch 1 | Step 6/12542 | Batch Loss: 11.4378 | Learning Rate: 0.001000 | Batch Time: 0.73s
Epoch 1 | Step 7/12542 | Batch Loss: 10.5441 | Learning Rate: 0.001000 | Batch Time: 0.72s
Epoch 1 | Step 8/12542 | Batch Loss: 9.8666 | Learning Rate: 0.001000 | Batch Time: 0.72s
Epoch 1 | Step 9/12542 | Batch Loss: 44.8217 | Learning Rate: 0.001000 | Batch Time: 0.72s
Epoch 1 | Step 10/12542 | Batch Loss: 8.7507 | Learning Rate: 0.001000 | Batch Time: 0.73s
Epoch 1 | Step 11/12542 | Batch Loss: 7.2297 | Learning Rate: 0.001000 | Batch Ti

KeyboardInterrupt: 

In [None]:
model.eval()
total_eval_loss = 0
eval_start_time = time.time()

with torch.no_grad():
    for step, batch in enumerate(val_loader):
        batch_start_time = time.time()
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_eval_loss += loss.item()
        
        batch_time = time.time() - batch_start_time
        print(f"Validation Step {step + 1}/{len(val_loader)} | "
              f"Batch Loss: {loss.item():.4f} | Batch Time: {batch_time:.2f}s")

avg_eval_loss = total_eval_loss / len(val_loader)

eval_time = time.time() - eval_start_time

print(f"Validation Loss: {avg_eval_loss:.4f} | Evaluation Time: {eval_time:.2f}s")


In [5]:
model.save_pretrained("/home/mohan/infy/models/fine_tuned_bart")
tokenizer.save_pretrained("/home/mohan/infy/models/fine_tuned_bart")


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('/home/mohan/infy/models/fine_tuned_bart/tokenizer_config.json',
 '/home/mohan/infy/models/fine_tuned_bart/special_tokens_map.json',
 '/home/mohan/infy/models/fine_tuned_bart/vocab.json',
 '/home/mohan/infy/models/fine_tuned_bart/merges.txt',
 '/home/mohan/infy/models/fine_tuned_bart/added_tokens.json')