# Loading Everything
We will load our model and our dataset.

In [None]:
# Load the pretrained BART summarization checkpoint together with its tokenizer.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# NOTE: the tokenizer's special tokens are intentionally left untouched.
# The checkpoint ships with its own padding ("<pad>") and end-of-sequence
# ("</s>") tokens, and the model config stores their ids. Replacing them with
# strings the checkpoint does not define (e.g. "[PAD]" / "[EOS]") would leave
# the tokenizer and the model's pad/eos ids out of sync.

In [None]:
import pandas as pd
# Read the combined dataset from the CSV file into a DataFrame.
df = pd.read_csv('csvFiles/combinedDF.csv')
# Preview the first rows to check that the file loaded as expected.
df.head()

# Prepare Fine-tuning
Now that we have our dataset loaded in, we can begin getting the data ready to be input into our model.

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset

# Abstracts are the text the model reads; contributions are the text it
# should learn to produce (see tokenize_inputs).
X = df['abstract']
y = df['contribution']

def train_eval_split(X, y, train_fraction=0.8):
    """Split inputs and targets sequentially into train and eval datasets.

    The first ``round(len(X) * train_fraction)`` rows become the training
    set and the remaining rows the evaluation set. Rows are not shuffled.

    Args:
        X: Sized, sliceable collection of abstracts.
        y: Sliceable collection of contributions, aligned row by row with
            ``X``.
        train_fraction: Share of the rows assigned to the training set.
            Defaults to 0.8.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple of ``Dataset`` objects,
        each with an ``abstract`` and a ``contribution`` column.

    Raises:
        ValueError: If ``train_fraction`` is not within ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )

    split_at = round(len(X) * train_fraction)

    train_dataset = Dataset.from_dict(
        {"abstract": X[:split_at], "contribution": y[:split_at]}
    )
    eval_dataset = Dataset.from_dict(
        {"abstract": X[split_at:], "contribution": y[split_at:]}
    )

    return train_dataset, eval_dataset

In [None]:
# Split the abstracts and contributions into training and evaluation datasets.
train_dataset, eval_dataset = train_eval_split(X,y)

In [None]:
# Inspect the first training example.
train_dataset[0]

In [None]:
def tokenize_inputs(example):
    """Tokenize a batch of abstracts and contributions for seq2seq training.

    Each abstract is wrapped in a summarization prompt and tokenized; each
    contribution is tokenized as the target. Both are truncated and padded
    to 512 tokens.

    Args:
        example: Batch mapping with ``abstract`` and ``contribution`` keys,
            each holding a list of strings (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The same batch with ``input_ids``, ``attention_mask`` and ``labels``
        added.
    """
    start_prompt = "Summarize the following research abstract. Focus on main contributions. \n\n"
    end_prompt = "\n\n Summary: "

    prompts = [start_prompt + abstract + end_prompt for abstract in example['abstract']]

    # Plain Python lists are returned here: Dataset.map stores the values
    # itself, so building torch tensors first would only add a conversion.
    model_inputs = tokenizer(prompts,
                             padding='max_length',
                             truncation=True,
                             max_length=512)
    example['input_ids'] = model_inputs['input_ids']
    # Keep the mask so the model can tell real tokens from the padding that
    # fills every sequence up to max_length.
    example['attention_mask'] = model_inputs['attention_mask']

    targets = tokenizer(example['contribution'],
                        padding='max_length',
                        truncation=True,
                        max_length=512)
    example['labels'] = targets['input_ids']

    return example

# Tokenize the training split in batches, then drop the raw text columns so
# that only the tokenized fields remain.
tokenized_trainDataset = (
    train_dataset
    .map(tokenize_inputs, batched=True)
    .remove_columns(['abstract', 'contribution'])
)

# Same treatment for the evaluation split.
tokenized_evalDataset = (
    eval_dataset
    .map(tokenize_inputs, batched=True)
    .remove_columns(['abstract', 'contribution'])
)

# Fine Tune the Model
We will fine-tune the model on the custom datasets we created and included with this project.

In [None]:
!pip install evaluate

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import nltk
import torch
import gc
import numpy as np

# Download required NLTK data
nltk.download('punkt')

# Load the BLEU metric from the evaluate library
bleu_metric = evaluate.load("bleu")

# Label id that the model's cross-entropy loss skips.
IGNORE_INDEX = -100


def _to_numpy(values):
    """Convert a torch tensor to a NumPy array; return anything else as-is."""
    if isinstance(values, torch.Tensor):
        return values.detach().cpu().numpy()
    return values


def _decodable_ids(token_ids, pad_id):
    """Return token ids as nested lists with IGNORE_INDEX replaced by pad_id.

    The tokenizer cannot decode negative ids, so ignored positions are turned
    into padding, which ``skip_special_tokens=True`` then drops.
    """
    if isinstance(token_ids, np.ndarray):
        return np.where(token_ids == IGNORE_INDEX, pad_id, token_ids).tolist()
    return [
        [pad_id if token == IGNORE_INDEX else token for token in sequence]
        for sequence in token_ids
    ]


def compute_bleu(eval_preds):
    """Compute the corpus BLEU score of the predictions against the labels.

    Args:
        eval_preds: ``(predictions, references)`` pair. ``predictions`` may
            be a tuple whose first item is the model output, a 3-D array of
            logits, or a 2-D array of token ids. ``references`` holds the
            label ids and may contain ``IGNORE_INDEX``.

    Returns:
        ``{"bleu": score}`` with the BLEU score as a float.
    """
    predictions, references = eval_preds

    # Extract logits if predictions is a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Only 3-D logits need an argmax. 2-D input already holds token ids, and
    # reducing it again would collapse every sequence to a single number.
    from_logits = (
        isinstance(predictions, (np.ndarray, torch.Tensor))
        and predictions.ndim == 3
    )
    if from_logits:
        predictions = predictions.argmax(axis=-1)

    predictions = _to_numpy(predictions)
    references = _to_numpy(references)
    pad_id = tokenizer.pad_token_id

    # Teacher-forced output lines up with the labels position by position.
    # Where the label is ignored the model was never trained, so blank those
    # predictions instead of scoring arbitrary tokens.
    if (
        from_logits
        and isinstance(references, np.ndarray)
        and references.shape == predictions.shape
    ):
        predictions = np.where(references == IGNORE_INDEX, pad_id, predictions)

    # Decode predictions and references into text
    decoded_preds = tokenizer.batch_decode(
        _decodable_ids(predictions, pad_id), skip_special_tokens=True
    )
    decoded_refs = tokenizer.batch_decode(
        _decodable_ids(references, pad_id), skip_special_tokens=True
    )

    # Prepare data for BLEU metric
    bleu_preds = [pred.strip() for pred in decoded_preds]  # list of strings
    bleu_refs = [[ref.strip()] for ref in decoded_refs]  # list of lists of strings

    # BLEU is not meaningful when no prediction contains any text; report 0.0
    # rather than risk a division error inside the metric.
    if not any(bleu_preds):
        return {"bleu": 0.0}

    # Compute BLEU score
    bleu_result = bleu_metric.compute(predictions=bleu_preds, references=bleu_refs)
    return {"bleu": bleu_result["bleu"]}


def mask_label_padding(batch):
    """Replace padding ids in ``labels`` with IGNORE_INDEX.

    Labels are padded to a fixed length, so without this step most of the
    loss would come from predicting padding. Already masked labels pass
    through unchanged.
    """
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [IGNORE_INDEX if token == pad_id else token for token in sequence]
        for sequence in batch["labels"]
    ]
    return batch


def preprocess_logits_for_metrics(logits, labels):
    """Reduce each evaluation batch's logits to token ids before caching.

    Keeping the full vocabulary-sized logits of the whole evaluation set in
    memory is very expensive; only the argmax is needed for BLEU.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    token_ids = logits.argmax(dim=-1)
    if labels is not None:
        # Blank positions that have no target (see compute_bleu).
        token_ids = token_ids.masked_fill(
            labels == IGNORE_INDEX, tokenizer.pad_token_id
        )
    return token_ids


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # Output directory
    num_train_epochs=4,              # Total number of training epochs
    per_device_train_batch_size=4,   # Batch size per device during training
    save_steps=1000,
    save_total_limit=2,              # Limit the total amount of saved checkpoints
    logging_steps=5,                 # Print losses every 5 steps
    learning_rate=1e-5,
    bf16=True,                       # Enable mixed precision training
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    # NOTE: BLEU is computed on teacher-forced predictions. Passing
    # predict_with_generate=True here would score generated summaries
    # instead; compute_bleu accepts both kinds of predictions.
    )


trainer = Seq2SeqTrainer(
    model=model,                     # The model to be trained
    args=training_args,              # Training arguments
    train_dataset=tokenized_trainDataset.map(mask_label_padding, batched=True),  # Training dataset
    eval_dataset=tokenized_evalDataset.map(mask_label_padding, batched=True),    # Evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_bleu,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

trainer.train()

# Save the fine-tuned model and tokenizer
model_path = "./fine_tuned_BART_summarization"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print("Fine-tuned model saved")

# Clear memory. Drop the references first, then run the garbage collector so
# the objects are actually freed, and only afterwards release the cached GPU
# memory: empty_cache() can only return blocks that are no longer occupied.
del model
del tokenizer
del trainer
gc.collect()
torch.cuda.empty_cache()

