# 1 Dataset

In [None]:
!pip install -q datasets

In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader
import datasets
# import evaluate
from sklearn.model_selection import train_test_split
import tqdm

# Fetch PubMedQA through the `datasets` library, selecting its "pqa_artificial" configuration
dataset = datasets.load_dataset(path="qiaojin/PubMedQA", name="pqa_artificial")

# Preprocessing function
def preprocess_data(example):
    """Reshape one raw record into question / context / answer fields.

    Args:
        example: mapping that provides "question", "long_answer" and
            "final_decision".

    Returns:
        dict with keys "question", "context" and "answer". The context is the
        record's long answer; the answer is the final decision, then ". ",
        then that same long answer.
    """
    query = example["question"]
    passage = example["long_answer"]  # the long answer doubles as the context
    verdict = example["final_decision"]
    return dict(
        question=query,
        context=passage,
        answer="{}. {}".format(verdict, passage),
    )

# Fixed seed so the train / validation / test partition is identical on every run.
# Without it each session draws a different split, so a model reloaded from disk
# could be validated on examples it was trained on in an earlier session.
SPLIT_SEED = 42

# Replace the raw columns with the question / context / answer fields
processed_dataset = dataset.map(preprocess_data, remove_columns=dataset["train"].column_names)

# First cut: 80% training, 20% held out
split_dataset = processed_dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split_dataset["train"] # Training dataset
temp_dataset = split_dataset["test"]

# Second cut: halve the held-out part into validation and test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=SPLIT_SEED)
val_dataset = val_test_split["train"] # Validation dataset
test_dataset = val_test_split["test"] # Testing dataset

# Define dataloader and collate function
def collate_fn(batch):
    """Turn a list of examples into a dict of parallel string lists.

    Args:
        batch: iterable of mappings with "question", "context" and "answer".

    Returns:
        dict with "questions", "contexts" and "answers", each a list in batch
        order. Every context is cut to its first 400 characters; questions and
        answers are passed through unchanged.
    """
    char_limit = 400  # character-level cap applied before any tokenization
    questions, contexts, answers = [], [], []
    for example in batch:
        questions.append(example["question"])
        contexts.append(example["context"][:char_limit])
        answers.append(example["answer"])
    return {"questions": questions, "contexts": contexts, "answers": answers}

# Batch size and collate function shared by all three loaders
_loader_options = dict(batch_size=8, collate_fn=collate_fn)

# Only the training loader requests shuffling
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_dataloader = DataLoader(val_dataset, **_loader_options)
test_dataloader = DataLoader(test_dataset, **_loader_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

# 2 BERT Model

In [None]:
!pip install -q transformers

In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering
# Load the BERT tokenizer and a question-answering model from the same pretrained checkpoint.
# NOTE(review): `quiet=True` does not look like a documented tokenizer argument — confirm it has any effect.
BERT_CHECKPOINT = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, verbose=False, quiet=True)
bert_model = BertForQuestionAnswering.from_pretrained(BERT_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import warnings
warnings.filterwarnings("ignore", message=".*overflowing tokens.*")


In [None]:
from tqdm import tqdm

# AdamW over every BERT parameter, learning rate 5e-5
bert_optimizer = torch.optim.AdamW(params=bert_model.parameters(), lr=5e-5)

def train_bert():
    """Run one pass over ``train_dataloader``, updating ``bert_model`` in place.

    Each batch's contexts and questions are tokenized as sequence pairs
    (context first), padded, truncated to 512 tokens and fed through the
    model. The quantity minimized with ``bert_optimizer`` is the mean start
    logit plus the mean end logit.

    NOTE(review): this objective uses no target spans, so it only pushes the
    logits downward; it needs a supervised start/end loss before the model
    can be expected to pick meaningful spans.
    """
    bert_model.train()
    progress = tqdm(train_dataloader, desc="Training BERT", leave=True)
    for batch in progress:
        # Tokenize (context, question) pairs into padded PyTorch tensors
        encoded = bert_tokenizer(
            batch["contexts"],
            batch["questions"],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Move every tensor onto the device that holds the model
        model_inputs = {name: tensor.to(bert_model.device) for name, tensor in encoded.items()}
        result = bert_model(**model_inputs)

        # Simplified loss for illustration: mean start logit + mean end logit
        loss = result.start_logits.mean() + result.end_logits.mean()

        # Backward pass, parameter update, then clear gradients for the next batch
        loss.backward()
        bert_optimizer.step()
        bert_optimizer.zero_grad()

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)

# Three full passes over the training data
for epoch in (1, 2, 3):
    print(f"Epoch {epoch}")
    train_bert()

Epoch 1


Training BERT: 100%|██████████| 21127/21127 [15:44<00:00, 22.37it/s]


Epoch 2


Training BERT: 100%|██████████| 21127/21127 [15:43<00:00, 22.39it/s]


Epoch 3


Training BERT: 100%|██████████| 21127/21127 [15:44<00:00, 22.36it/s]


In [None]:
# Write the BERT model and its tokenizer to a folder on Google Drive
from google.colab import drive
drive.mount('/content/drive')

bert_save_dir = "/content/drive/MyDrive/rag-bert-2"
bert_model.save_pretrained(bert_save_dir)
bert_tokenizer.save_pretrained(bert_save_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/rag-bert-2/tokenizer_config.json',
 '/content/drive/MyDrive/rag-bert-2/special_tokens_map.json',
 '/content/drive/MyDrive/rag-bert-2/vocab.txt',
 '/content/drive/MyDrive/rag-bert-2/added_tokens.json')

In [None]:
# Load the model
# The rag-bert-2 checkpoint was written from a BertForQuestionAnswering model, and the
# validation loop reads `start_logits` / `end_logits` from the model output. Loading it as
# BertForSequenceClassification would drop the trained QA head, initialise a fresh
# classification head, and return outputs without those attributes.
# BertForSequenceClassification stays imported only so that name remains available.
from transformers import BertForQuestionAnswering, BertForSequenceClassification, BertTokenizerFast
from google.colab import drive
drive.mount('/content/drive')

bert_model = BertForQuestionAnswering.from_pretrained("/content/drive/MyDrive/rag-bert-2")
bert_tokenizer = BertTokenizerFast.from_pretrained("/content/drive/MyDrive/rag-bert-2")

# 3 GPT Model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Pretrained GPT-2 language model and its tokenizer
GPT2_CHECKPOINT = "gpt2"
gpt2_model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT, verbose=False)
# Reuse the end-of-sequence token as the padding token so batches can be padded
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

In [None]:
from tqdm import tqdm

# AdamW over every GPT-2 parameter, learning rate 5e-5
gpt2_optimizer = torch.optim.AdamW(params=gpt2_model.parameters(), lr=5e-5)

def train_gpt2():
    """Fine-tune ``gpt2_model`` for one pass over ``train_dataloader``.

    Each example is laid out as ``<question> <context> <answer><eos>`` in a
    single sequence. The labels are a copy of that sequence in which the
    prompt tokens and the padding are set to -100, so the loss is computed
    only on the answer tokens and the closing end-of-sequence token.

    The previous version tokenized the answer separately and used it as
    ``labels`` for the prompt. The model shifts labels by one position
    internally, so that paired prompt token ``i`` with answer token ``i + 1``
    and also scored the padding. The old debug counter (batches whose labels
    needed extra padding) no longer applies and is replaced by the mean loss.

    Sequences are capped at 512 tokens; when a pair is too long the prompt is
    shortened from its end so the answer stays intact.

    Returns:
        float: mean loss over the batches seen (0.0 if there were none).
    """
    max_length = 512
    ignore_index = -100  # label value the loss skips
    eos_id = gpt2_tokenizer.eos_token_id
    pad_id = gpt2_tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id

    gpt2_model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(train_dataloader, desc="Training GPT-2", leave=True, unit="batch"):
        prompts = [q + " " + c for q, c in zip(batch["questions"], batch["contexts"])]
        # Leading space keeps the answer a separate word from the end of the prompt
        continuations = [" " + a for a in batch["answers"]]

        prompt_ids = gpt2_tokenizer(prompts, truncation=True, max_length=max_length)["input_ids"]
        # One slot is reserved for the end-of-sequence token appended below
        answer_ids = gpt2_tokenizer(continuations, truncation=True, max_length=max_length - 1)["input_ids"]

        token_rows, label_rows = [], []
        for p_ids, a_ids in zip(prompt_ids, answer_ids):
            target = a_ids + [eos_id]
            prompt = p_ids[: max_length - len(target)]
            token_rows.append(prompt + target)
            label_rows.append([ignore_index] * len(prompt) + target)

        # Right-pad every row to the longest sequence in the batch
        width = max(len(row) for row in token_rows)
        input_ids = torch.tensor([row + [pad_id] * (width - len(row)) for row in token_rows])
        attention_mask = torch.tensor([[1] * len(row) + [0] * (width - len(row)) for row in token_rows])
        labels = torch.tensor([row + [ignore_index] * (width - len(row)) for row in label_rows])

        model_device = gpt2_model.device
        gpt_outputs = gpt2_model(
            input_ids=input_ids.to(model_device),
            attention_mask=attention_mask.to(model_device),
            labels=labels.to(model_device),
        )
        loss = gpt_outputs.loss

        # Backpropagation for GPT-2
        loss.backward()
        gpt2_optimizer.step()
        gpt2_optimizer.zero_grad()

        total_loss += loss.item()
        num_batches += 1

    mean_loss = total_loss / num_batches if num_batches else 0.0
    print(f"Average training loss: {mean_loss:.4f}")
    return mean_loss

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
gpt2_model.to(device)

# Three full passes over the training data
for epoch in (1, 2, 3):
    print(f"Epoch {epoch}")
    train_gpt2()

Epoch 1


Training GPT-2: 100%|██████████| 21127/21127 [20:10<00:00, 17.45batch/s]


16515
Epoch 2


Training GPT-2: 100%|██████████| 21127/21127 [20:09<00:00, 17.47batch/s]


16513
Epoch 3


Training GPT-2: 100%|██████████| 21127/21127 [20:08<00:00, 17.48batch/s]

16495





In [None]:
# Write the GPT-2 model and tokenizer into the same Drive folder
gpt2_model_save_path = "/content/drive/MyDrive/rag-gpt"
for _artifact in (gpt2_model, gpt2_tokenizer):
    _artifact.save_pretrained(gpt2_model_save_path)
print(f"Model and tokenizer saved to {gpt2_model_save_path}")

Model and tokenizer saved to /content/drive/MyDrive/rag-gpt


In [None]:
import os

# Target folder on Drive; created first if it is not there yet
save_path = "/content/drive/MyDrive/rag-gpt-model"
os.makedirs(name=save_path, exist_ok=True)

# Model first, then tokenizer, into the same folder
for _artifact in (gpt2_model, gpt2_tokenizer):
    _artifact.save_pretrained(save_path)

print(f"Models saved successfully to {save_path}")


Models saved successfully to /content/drive/MyDrive/rag-gpt-model


In [None]:
# Mount Drive, then write the GPT-2 model and tokenizer to the rag-gpt-model folder
from google.colab import drive
drive.mount('/content/drive')

gpt2_drive_dir = "/content/drive/MyDrive/rag-gpt-model"
gpt2_model.save_pretrained(gpt2_drive_dir)
gpt2_tokenizer.save_pretrained(gpt2_drive_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/rag-gpt-model/tokenizer_config.json',
 '/content/drive/MyDrive/rag-gpt-model/special_tokens_map.json',
 '/content/drive/MyDrive/rag-gpt-model/vocab.json',
 '/content/drive/MyDrive/rag-gpt-model/merges.txt',
 '/content/drive/MyDrive/rag-gpt-model/added_tokens.json')

In [None]:
# Restore the GPT-2 model and tokenizer from the Drive folder they were saved to.
# NOTE(review): nothing in this cell moves the reloaded model to `device`.
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from google.colab import drive
drive.mount('/content/drive')

gpt2_checkpoint_dir = "/content/drive/MyDrive/rag-gpt-model"
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint_dir)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint_dir)

In [None]:
!pip install -q evaluate

In [None]:
import evaluate

# Load the "f1" metric through the `evaluate` package
metric = evaluate.load(path="f1")

In [None]:
from tqdm import tqdm

# Validation loop
def evaluate():
    """Score the BERT -> GPT-2 pipeline on ``val_dataloader`` and print the F1.

    For every batch, BERT picks a span from each (context, question) pair via
    the argmax of its start and end logits. GPT-2 then continues the prompt
    ``<question> <span>`` for up to 50 new tokens, and only that continuation
    is kept as the prediction.

    The "f1" metric works on integer class ids, not free text, so references
    and predictions are reduced to their leading decision: the lower-cased
    text before the first period, matching the ``<decision>. <context>``
    layout of the answers. Decisions seen in the references get ids in sorted
    order; a prediction with any other leading text gets one extra "unknown"
    id. F1 is macro-averaged over these classes.

    NOTE(review): this function's name rebinds the module-level name
    ``evaluate``, which the notebook also uses for the imported ``evaluate``
    package; ``metric`` must already exist when this function is called.

    Returns:
        dict: the result of ``metric.compute`` (contains the key "f1").
    """
    def _leading_decision(text):
        """Return the lower-cased text that precedes the first period."""
        return text.split(".", 1)[0].strip().lower()

    max_new_tokens = 50
    # Leave room for the generated tokens inside the model's position limit
    max_prompt_tokens = gpt2_model.config.n_positions - max_new_tokens

    bert_model.eval()
    gpt2_model.eval()
    all_preds, all_labels = [], []

    # A decoder-only model continues from the last position of each row, so
    # padding has to sit on the left while generating. The previous setting is
    # restored afterwards because the training code relies on it.
    previous_padding_side = gpt2_tokenizer.padding_side
    gpt2_tokenizer.padding_side = "left"
    try:
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Evaluating"):
                questions = batch["questions"]
                contexts = batch["contexts"]
                answers = batch["answers"]

                # Retrieve relevant context using BERT
                inputs = bert_tokenizer(contexts, questions, padding=True, truncation=True, return_tensors="pt")
                inputs = {key: val.to(bert_model.device) for key, val in inputs.items()}
                outputs = bert_model(**inputs)

                # Use start and end logits to extract the most relevant span
                start_idx = torch.argmax(outputs.start_logits, dim=1)
                end_idx = torch.argmax(outputs.end_logits, dim=1)

                retrieved_contexts = []
                for i in range(len(contexts)):
                    # An end index before the start index yields an empty span
                    span_ids = inputs["input_ids"][i][start_idx[i]:end_idx[i] + 1]
                    tokens = bert_tokenizer.convert_ids_to_tokens(span_ids)
                    retrieved_contexts.append(bert_tokenizer.convert_tokens_to_string(tokens))

                # Generate answers with GPT-2
                prompts = [q + " " + rc for q, rc in zip(questions, retrieved_contexts)]
                gpt_inputs = gpt2_tokenizer(
                    prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_prompt_tokens,
                )
                gpt_inputs = {key: val.to(gpt2_model.device) for key, val in gpt_inputs.items()}
                prompt_width = gpt_inputs["input_ids"].shape[1]
                gpt_outputs = gpt2_model.generate(
                    **gpt_inputs,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=gpt2_tokenizer.pad_token_id,
                )

                # With left padding every prompt fills the first `prompt_width`
                # positions, so what follows is exactly the generated text.
                preds = [
                    gpt2_tokenizer.decode(output[prompt_width:], skip_special_tokens=True)
                    for output in gpt_outputs
                ]
                all_preds.extend(preds)
                all_labels.extend(answers)
    finally:
        gpt2_tokenizer.padding_side = previous_padding_side

    # Map leading decisions to integer class ids for the metric
    reference_decisions = [_leading_decision(answer) for answer in all_labels]
    predicted_decisions = [_leading_decision(pred) for pred in all_preds]
    label_to_id = {label: idx for idx, label in enumerate(sorted(set(reference_decisions)))}
    unknown_id = len(label_to_id)  # any prediction that matches no reference decision
    reference_ids = [label_to_id[decision] for decision in reference_decisions]
    prediction_ids = [label_to_id.get(decision, unknown_id) for decision in predicted_decisions]

    # Calculate F1 score
    results = metric.compute(predictions=prediction_ids, references=reference_ids, average="macro")
    print(f"Validation F1 Score: {results['f1']:.4f}")
    return results

# No training happens between evaluation calls, so looping over "epochs" here only
# repeats the same pass over the validation set. Evaluate once.
validation_results = evaluate()