In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, EncoderDecoderModel, AdamW
import matplotlib.pyplot as plt
from tqdm import tqdm

# New Imports for BLEU and ROUGE
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import os

# Ensure NLTK packages are downloaded
nltk.download('punkt')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the datasets
train_data = pd.read_csv("processed_train_data.csv").dropna(subset=['bert_description', 'bert_abstract'])
val_data = pd.read_csv("processed_val_data.csv").dropna(subset=['bert_description', 'bert_abstract'])

# Use a smaller subset of the data to ensure it runs
# train_data = train_data.sample(500, replace=True, random_state=42)
# val_data = val_data.sample(100, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize model (same model, no smaller one)
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.to(device)

# Enable gradient checkpointing to reduce memory footprint
if hasattr(model, 'gradient_checkpointing_enable'):
    model.config.use_cache = False
    model.gradient_checkpointing_enable()

optimizer = AdamW(model.parameters(), lr=3e-5)

class TextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(
            row['bert_description'],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        labels = self.tokenizer(
            row['bert_abstract'],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
        }

batch_size = 2
train_dataset = TextDataset(train_data, tokenizer)
val_dataset = TextDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

epochs = 50  # Fewer epochs just to test stability
train_losses = []
val_losses = []

# Initialize lists to store BLEU and ROUGE scores
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Directory to save models
save_dir = "model_checkpoints"
os.makedirs(save_dir, exist_ok=True)

for epoch in range(epochs):
    model.train()
    total_train_loss = 0.0

    for step, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}")):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Clear cache after each step (for debugging)
        torch.cuda.empty_cache()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}: Avg Train Loss = {avg_train_loss:.4f}")

    # Validation loop (only loss)
    model.eval()
    total_val_loss = 0.0
    references = []
    predictions = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            labels_for_loss = labels.clone()
            labels_for_loss[labels_for_loss == tokenizer.pad_token_id] = -100

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_for_loss)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Generate predictions
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            decoded_refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

            predictions.extend(decoded_preds)
            references.extend(decoded_refs)

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}: Avg Val Loss = {avg_val_loss:.4f}")

    # Calculate BLEU score
    # Tokenize the references and predictions for BLEU
    tokenized_refs = [nltk.word_tokenize(ref.lower()) for ref in references]
    tokenized_preds = [nltk.word_tokenize(pred.lower()) for pred in predictions]
    # For corpus_bleu, references need to be a list of lists
    bleu = corpus_bleu([[ref] for ref in tokenized_refs], tokenized_preds)
    bleu_scores.append(bleu)
    print(f"Epoch {epoch+1}: BLEU Score = {bleu:.4f}")

    # Calculate ROUGE scores
    rouge1_total = 0.0
    rouge2_total = 0.0
    rougeL_total = 0.0
    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref, pred)
        rouge1_total += scores['rouge1'].fmeasure
        rouge2_total += scores['rouge2'].fmeasure
        rougeL_total += scores['rougeL'].fmeasure

    avg_rouge1 = rouge1_total / len(references)
    avg_rouge2 = rouge2_total / len(references)
    avg_rougeL = rougeL_total / len(references)

    rouge_scores['rouge1'].append(avg_rouge1)
    rouge_scores['rouge2'].append(avg_rouge2)
    rouge_scores['rougeL'].append(avg_rougeL)

    print(f"Epoch {epoch+1}: ROUGE-1 = {avg_rouge1:.4f}, ROUGE-2 = {avg_rouge2:.4f}, ROUGE-L = {avg_rougeL:.4f}")

    # Save the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        save_path = os.path.join(save_dir, f"model_epoch_{epoch+1}.pt")
        torch.save(model.state_dict(), save_path)
        print(f"Model saved to {save_path}")

# Plotting Loss
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs+1), train_losses, label="Training Loss", marker='o')
plt.plot(range(1, epochs+1), val_losses, label="Validation Loss", marker='s')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()
plt.grid()
plt.show()

# Plotting BLEU Score
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs+1), bleu_scores, label="BLEU Score", marker='^', color='green')
plt.xlabel("Epochs")
plt.ylabel("BLEU Score")
plt.title("BLEU Score Over Epochs")
plt.legend()
plt.grid()
plt.show()

# Plotting ROUGE Scores
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs+1), rouge_scores['rouge1'], label="ROUGE-1", marker='x')
plt.plot(range(1, epochs+1), rouge_scores['rouge2'], label="ROUGE-2", marker='x')
plt.plot(range(1, epochs+1), rouge_scores['rougeL'], label="ROUGE-L", marker='x')
plt.xlabel("Epochs")
plt.ylabel("ROUGE Score")
plt.title("ROUGE Scores Over Epochs")
plt.legend()
plt.grid()
plt.show()


  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\senth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  torch.utils._pytree._register_pytree_node(
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.6.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.8.crossattention.self.value.bias', 'bert.encoder.layer.10.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.5.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.2.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.3.crossattention.self.query.bias', 'bert.encoder.layer.11.crossattention.self.value.weight', 'bert.encoder.layer.6.crossattention.self.value.weight', 'bert.encoder.layer.3.crossattention.self.value.we

Epoch 1: Avg Train Loss = 6.6772


Validation Epoch 1/50: 100%|██████████| 50/50 [18:45<00:00, 22.51s/it]


Epoch 1: Avg Val Loss = 5.4404


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Epoch 1: BLEU Score = 0.0000
Epoch 1: ROUGE-1 = 0.0346, ROUGE-2 = 0.0054, ROUGE-L = 0.0292


Training Epoch 2/50: 100%|██████████| 125/125 [01:10<00:00,  1.78it/s]


Epoch 2: Avg Train Loss = 5.1324


Validation Epoch 2/50: 100%|██████████| 50/50 [15:53<00:00, 19.06s/it]


Epoch 2: Avg Val Loss = 4.3330
Epoch 2: BLEU Score = 0.0000
Epoch 2: ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000


Training Epoch 3/50: 100%|██████████| 125/125 [01:08<00:00,  1.82it/s]


Epoch 3: Avg Train Loss = 4.3352


Validation Epoch 3/50: 100%|██████████| 50/50 [17:38<00:00, 21.16s/it]


Epoch 3: Avg Val Loss = 3.7221
Epoch 3: BLEU Score = 0.0000
Epoch 3: ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000


Training Epoch 4/50: 100%|██████████| 125/125 [01:11<00:00,  1.75it/s]


Epoch 4: Avg Train Loss = 3.8313


Validation Epoch 4/50: 100%|██████████| 50/50 [36:02<00:00, 43.25s/it]


Epoch 4: Avg Val Loss = 3.2275
Epoch 4: BLEU Score = 0.0000
Epoch 4: ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000


Training Epoch 5/50: 100%|██████████| 125/125 [01:10<00:00,  1.79it/s]


Epoch 5: Avg Train Loss = 3.3855


Validation Epoch 5/50: 100%|██████████| 50/50 [15:43<00:00, 18.87s/it]


Epoch 5: Avg Val Loss = 2.7459
Epoch 5: BLEU Score = 0.0000
Epoch 5: ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000
Model saved to model_checkpoints\model_epoch_5.pt


Training Epoch 6/50: 100%|██████████| 125/125 [01:09<00:00,  1.79it/s]


Epoch 6: Avg Train Loss = 3.0059


Validation Epoch 6/50: 100%|██████████| 50/50 [17:47<00:00, 21.35s/it]


Epoch 6: Avg Val Loss = 2.4019
Epoch 6: BLEU Score = 0.0001
Epoch 6: ROUGE-1 = 0.0048, ROUGE-2 = 0.0010, ROUGE-L = 0.0038


Training Epoch 7/50: 100%|██████████| 125/125 [01:08<00:00,  1.81it/s]


Epoch 7: Avg Train Loss = 2.6578


Validation Epoch 7/50: 100%|██████████| 50/50 [15:44<00:00, 18.89s/it]


Epoch 7: Avg Val Loss = 2.0291
Epoch 7: BLEU Score = 0.0000
Epoch 7: ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000


Training Epoch 8/50: 100%|██████████| 125/125 [01:08<00:00,  1.84it/s]


Epoch 8: Avg Train Loss = 2.3284


Validation Epoch 8/50: 100%|██████████| 50/50 [17:30<00:00, 21.01s/it]


Epoch 8: Avg Val Loss = 1.7063
Epoch 8: BLEU Score = 0.0003
Epoch 8: ROUGE-1 = 0.0043, ROUGE-2 = 0.0012, ROUGE-L = 0.0035


Training Epoch 9/50: 100%|██████████| 125/125 [01:09<00:00,  1.81it/s]


Epoch 9: Avg Train Loss = 2.0504


Validation Epoch 9/50: 100%|██████████| 50/50 [16:09<00:00, 19.39s/it]


Epoch 9: Avg Val Loss = 1.4096
Epoch 9: BLEU Score = 0.0057
Epoch 9: ROUGE-1 = 0.0115, ROUGE-2 = 0.0041, ROUGE-L = 0.0098


Training Epoch 10/50: 100%|██████████| 125/125 [01:10<00:00,  1.77it/s]


Epoch 10: Avg Train Loss = 1.7375


Validation Epoch 10/50:   2%|▏         | 1/50 [00:20<16:25, 20.10s/it]


KeyboardInterrupt: 