# Loading Data

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score
from transformers import get_scheduler
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np
import matplotlib.pyplot as plt
import pickle
from torch.optim import AdamW
from tqdm import tqdm
import time


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
train_df = pd.read_json('../data/train.json')
val_df = pd.read_json('../data/val.json')
test_df = pd.read_json('../data/test.json')

In [3]:
# How long did this take
start_time = time.time()
print("Training started at:", time.strftime("%H:%M:%S", time.localtime()))

Training started at: 23:23:54


# Preprocessing Data

In [4]:
# Function to concatenate references
def concatenate_references(row):
    references = []
    num_refs = int(row.get('num_references', 0))  # Avoid KeyError
    for i in range(num_refs):
        ref_key = f'ref_abstract.cite_{i}.abstract'
        if ref_key in row:  # Ensure key exists
            references.append(row.get(ref_key, ''))  # Avoid KeyError
    return ' '.join(references).strip()

In [5]:
# Apply function to create a new column with concatenated references
train_df['references'] = train_df.apply(concatenate_references, axis=1)
val_df['references'] = val_df.apply(concatenate_references, axis=1)
test_df['references'] = test_df.apply(concatenate_references, axis=1)

# Bert Tokenization

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
MAX_LENGTH = 512


# Function to tokenize text
def tokenize_function(text):
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )


# Custom Dataset Class
class ReferenceDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]

        # Tokenize abstract and references
        abstract_tokens = tokenize_function(row['abstract'])
        references_tokens = tokenize_function(row['references'])

        return {
            "input_ids": abstract_tokens["input_ids"].squeeze(0),
            "attention_mask": abstract_tokens["attention_mask"].squeeze(0),
            "labels": references_tokens["input_ids"].squeeze(0)  # Treat references as labels
        }


# Create dataset instances
train_dataset = ReferenceDataset(train_df)
val_dataset = ReferenceDataset(val_df)
test_dataset = ReferenceDataset(test_df)

# Create DataLoaders
BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model Preparation

In [7]:
# pip install sentencepiece

In [8]:
# pip install hf_xet

In [9]:
# Load T5 tokenizer & model
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Define maximum token lengths
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 128

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
class ReferenceDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_length, max_output_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]

        # Tokenize abstract (input)
        input_encoding = self.tokenizer(
            row["abstract"],
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt"
        )

        # Tokenize references (target output)
        target_encoding = self.tokenizer(
            row["references"],
            padding="max_length",
            truncation=True,
            max_length=self.max_output_length,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encoding["input_ids"].squeeze(0),
            "attention_mask": input_encoding["attention_mask"].squeeze(0),
            "labels": target_encoding["input_ids"].squeeze(0)
        }


# Create dataset instances
train_dataset = ReferenceDataset(train_df, tokenizer, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH)
val_dataset = ReferenceDataset(val_df, tokenizer, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH)
test_dataset = ReferenceDataset(test_df, tokenizer, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH)

# Create DataLoaders
BATCH_SIZE = 8

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Model Training

In [11]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define learning rate scheduler
num_training_steps = len(train_loader) * 1
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [12]:
EPOCHS = 1

for epoch in range(EPOCHS):
    model.train()
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Print loss
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

  0%|          | 0/3797 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 3797/3797 [2:02:38<00:00,  1.94s/it, loss=4.28e-6]      


In [13]:
def evaluate(model, val_loader, device):
    model.eval()
    val_loss = 0
    val_steps = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, leave=True):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            val_steps += 1

            # Get predictions
            preds = torch.argmax(outputs.logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    avg_val_loss = val_loss / val_steps
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Loss: {avg_val_loss}")
    print(f"Validation Accuracy: {accuracy}")

    return avg_val_loss, accuracy


evaluate(model, val_loader, device)

100%|██████████| 634/634 [03:43<00:00,  2.84it/s]

Validation Loss: 1.0160178238103768e-05
Validation Accuracy: 0.9996052112120016





(1.0160178238103768e-05, 0.9996052112120016)

- validation loss is very low (3.82e-05)
- validation accuracy is very high (99.84%)
- performing exceptionally well on the validation dataset

In [19]:
def find_learning_rate(model, train_loader, device, start_lr=1e-7, end_lr=1, num_steps=100):
    model.train()
    optimizer = AdamW(model.parameters(), lr=start_lr)
    lr_factor = (end_lr / start_lr) ** (1 / num_steps)
    learning_rates = []
    losses = []

    best_loss = float('inf')
    print("Finding optimal learning rate...")
    max_steps = min(num_steps, len(train_loader))
    loop = tqdm(train_loader, total=max_steps)

    for i, batch in enumerate(loop):
        if i >= max_steps:
            break

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        try:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            current_lr = optimizer.param_groups[0]['lr']
            learning_rates.append(current_lr)

            current_loss = loss.item()

            if current_loss > 4 * best_loss:
                break

            if current_loss < best_loss:
                best_loss = current_loss

            losses.append(current_loss)

            for param_group in optimizer.param_groups:
                param_group['lr'] *= lr_factor

            loop.set_description(f"LR: {current_lr:.8f}, Loss: {current_loss:.6f}")
        except Exception as e:
            print(f"Error during learning rate finding: {e}")
            break

    min_len = min(len(learning_rates), len(losses))
    return learning_rates[:min_len], losses[:min_len]

In [20]:
def plot_learning_rate(learning_rates, losses):
    plt.figure(figsize=(10, 6))

    min_len = min(len(learning_rates), len(losses))
    learning_rates = learning_rates[:min_len]
    losses = losses[:min_len]

    plt.semilogx(learning_rates, losses)
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.title('Learning Rate Finder')
    plt.grid(True)

    if len(losses) > 2:
        try:
            gradients = np.gradient(losses)
            min_gradient_idx = np.argmin(gradients)

            if 0 <= min_gradient_idx < len(learning_rates):
                optimal_lr = learning_rates[min_gradient_idx]
                plt.axvline(x=optimal_lr, color='r', linestyle='--')
                plt.text(optimal_lr, min(losses), f'Suggested LR: {optimal_lr:.8f}',
                         verticalalignment='bottom', horizontalalignment='right')
            else:
                optimal_lr = learning_rates[len(learning_rates) // 2]
        except Exception as e:
            print(f"Error calculating gradient: {e}")
            optimal_lr = learning_rates[len(learning_rates) // 2]
    else:
        print("Not enough data points to calculate gradient")
        optimal_lr = learning_rates[0] if learning_rates else 1e-4

    plt.savefig('learning_rate_finder.png')
    plt.close()

    print(f"Selected learning rate: {optimal_lr:.8f}")
    return optimal_lr

In [21]:
def save_model_to_pickle(model, tokenizer, file_path="t5_reference_model.pkl"):
    """
    Save the model and tokenizer to a pickle file
    """

    model = model.to("cpu")  # Move model to CPU before saving. needed bc GPU cannot be pickled

    model_data = {
        "model": model,
        "tokenizer": tokenizer,
        "model_name": "t5-small",
        "max_input_length": 512,
        "max_output_length": 128
    }

    # Save to pickle file
    with open(file_path, "wb") as f:
        pickle.dump(model_data, f)

    print(f"Model saved to {file_path}")

    return file_path

In [22]:
def load_model_from_pickle(file_path="t5_reference_model.pkl"):
    """
    Load the model and tokenizer from a pickle file
    """
    with open(file_path, "rb") as f:
        model_data = pickle.load(f)

    model = model_data["model"]
    tokenizer = model_data["tokenizer"]

    return model, tokenizer, model_data

## Learning Rate Finder

In [23]:
print("Running learning rate finder...")
learning_rates, losses = find_learning_rate(model, train_loader, device)
optimal_lr = plot_learning_rate(learning_rates, losses)
print(f"Optimal learning rate found: {optimal_lr:.8f}")
optimizer = AdamW(model.parameters(), lr=optimal_lr)

num_training_steps = len(train_loader) * 3  # For 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training with the optimal learning rate
EPOCHS = 3  # 1 to 3 for better results
training_losses = []
validation_losses = []

for epoch in range(EPOCHS):  # epoch
    model.train()
    epoch_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Update progress
        current_loss = loss.item()
        epoch_loss += current_loss
        loop.set_description(f"Epoch {epoch + 1}/{EPOCHS}")
        loop.set_postfix(loss=current_loss, lr=optimizer.param_groups[0]['lr'])

    avg_train_loss = epoch_loss / len(train_loader)
    training_losses.append(avg_train_loss)

    # Validation phase
    val_loss, val_accuracy = evaluate(model, val_loader, device)
    validation_losses.append(val_loss)

    print(f"Epoch {epoch + 1}/{EPOCHS}:")
    print(f"  Train Loss: {avg_train_loss:.6f}")
    print(f"  Validation Loss: {val_loss:.6f}")
    print(f"  Validation Accuracy: {val_accuracy:.6f}")

Running learning rate finder...
Finding optimal learning rate...


LR: 0.00000010, Loss: 0.000010:   1%|          | 1/100 [00:03<06:14,  3.78s/it]


Not enough data points to calculate gradient
Selected learning rate: 0.00000010
Optimal learning rate found: 0.00000010


Epoch 1/3: 100%|██████████| 3797/3797 [2:30:22<00:00,  2.38s/it, loss=1.72e-5, lr=6.67e-8]      
100%|██████████| 634/634 [03:58<00:00,  2.66it/s]


Validation Loss: 9.363724728300382e-06
Validation Accuracy: 0.9996052112120016
Epoch 1/3:
  Train Loss: 0.000376
  Validation Loss: 0.000009
  Validation Accuracy: 0.999605


Epoch 2/3: 100%|██████████| 3797/3797 [1:50:21<00:00,  1.74s/it, loss=0.0174, lr=3.33e-8]  
100%|██████████| 634/634 [04:06<00:00,  2.57it/s]


Validation Loss: 8.922521432287e-06
Validation Accuracy: 0.9996052112120016
Epoch 2/3:
  Train Loss: 0.000320
  Validation Loss: 0.000009
  Validation Accuracy: 0.999605


Epoch 3/3: 100%|██████████| 3797/3797 [1:58:59<00:00,  1.88s/it, loss=2.21e-6, lr=0]         
100%|██████████| 634/634 [04:06<00:00,  2.57it/s]

Validation Loss: 8.826780238078476e-06
Validation Accuracy: 0.9996052112120016
Epoch 3/3:
  Train Loss: 0.000327
  Validation Loss: 0.000009
  Validation Accuracy: 0.999605





In [24]:
plt.figure(figsize=(10, 6))
plt.plot(training_losses, label='Training Loss')
plt.plot(validation_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.savefig('training_validation_loss.png')
plt.close()

In [25]:
model_path = save_model_to_pickle(model, tokenizer)
print(f"Model saved to {model_path}")

Model saved to t5_reference_model.pkl
Model saved to t5_reference_model.pkl


In [26]:
# commenting out to avoid running each time from when running
# ## Load model from pkle and generate references
# def generate_references(abstract_text, model, tokenizer, device, max_length=128):
#     """
#     Generate references for a given abstract
#     """
#     # Prepare input
#     inputs = tokenizer(
#         abstract_text,
#         padding="max_length",
#         truncation=True,
#         max_length=512,
#         return_tensors="pt"
#     ).to(device)
#
#     # Generate output
#     output_ids = model.generate(
#         input_ids=inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         max_length=max_length,
#         num_beams=4,
#         early_stopping=True
#     )
#
#     # Decode output
#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#
#     return generated_text

In [27]:
# Ex usage
"""
# Test with a sample abstract
sample_abstract = "This paper explores the use of transformer models for natural language generation tasks."

generated_references = generate_references(
    sample_abstract, model, tokenizer, device
)

print("Sample Abstract:")
print(sample_abstract)
print("\nGenerated References:")
print(generated_references)
"""

# To load and use the saved model:
"""
# Load model from pickle
loaded_model, loaded_tokenizer, _ = load_model_from_pickle("t5_reference_model.pkl")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)

# Gen references with loaded model
sample_abstract = "This paper explores the use of transformer models for natural language generation tasks."
generated_references = generate_references(sample_abstract, loaded_model, loaded_tokenizer, device)
print(generated_references)
"""

'\n# Load model from pickle\nloaded_model, loaded_tokenizer, _ = load_model_from_pickle("t5_reference_model.pkl")\n\n# Move model to device\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\nloaded_model = loaded_model.to(device)\n\n# Generate references with loaded model\nsample_abstract = "This paper explores the use of transformer models for natural language generation tasks."\ngenerated_references = generate_references(sample_abstract, loaded_model, loaded_tokenizer, device)\nprint(generated_references)\n'

In [28]:
# end
end_time = time.time()
execution_time = end_time - start_time
hours, remainder = divmod(execution_time, 3600)
minutes, seconds = divmod(remainder, 60)

print("Training completed at:", time.strftime("%H:%M:%S", time.localtime()))
print(f"Total execution time: {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d} (HH:MM:SS)")
print(f"Total seconds: {execution_time:.2f}")

Training completed at: 13:51:14
Total execution time: 14:27:20 (HH:MM:SS)
Total seconds: 52040.77
