In [33]:
import gdown
import zipfile
import os
import faiss
import datasets
import torch
import typing
from typing import Tuple, Dict
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import load_dataset
import json

from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration, RagConfig, AutoConfig, AutoModel, \
    RagTokenizer, BartForConditionalGeneration, AlbertModel, Trainer, TrainingArguments

from transformers.modeling_outputs import BaseModelOutputWithPooling

from reqs.lightning_base import BaseTransformer
from reqs.utils import parse_sh_args

## Loading the Dataset

In [12]:
# Google Drive location of the zipped retrieval index, and the local folder it
# is unpacked into. `filepath` is reused further down to locate the passages
# dataset and the FAISS index.
url = "https://drive.google.com/uc?id=18xMA2wGPDXArwLyVWN3HXQaF0XnjtugF"
filepath = "data/gold"

# Temporary location of the downloaded archive; removed again after extraction.
zip_path = "index.zip"

# Check if index exists
if os.path.isfile(os.path.join(filepath, "index.faiss")):
    print("File already exists")
else:
    # Create the target directory up front; exist_ok avoids the race between
    # a separate "exists?" check and the creation.
    os.makedirs(filepath, exist_ok=True)

    try:
        # Download zip file using gdown
        gdown.download(url, zip_path, quiet=False)

        # Unzip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(filepath)
    finally:
        # Remove zip file, even when the download or extraction failed, so a
        # partial archive is never left behind for the next run.
        if os.path.exists(zip_path):
            os.remove(zip_path)

File already exists


## Creating the Model

In [13]:
# Checkpoint identifiers and model-type tags for the two halves of the pipeline.
encoder_model_name, encoder_model_type = (
    "sentence-transformers/paraphrase-albert-base-v2",
    "albert",
)
generator_model_name, generator_model_type = "facebook/bart-base", "bart"

# Pretrained configurations; the encoder is additionally asked to expose its
# hidden states.
encoder_config = AutoConfig.from_pretrained(
    encoder_model_name,
    output_hidden_states=True,
)
generator_config = AutoConfig.from_pretrained(generator_model_name)

In [14]:
# Combine the two pretrained sub-configurations into one RAG configuration.
#
# The dedicated classmethod is used instead of hand-written
# {"model_type": ..., "config": ...} dicts: RagConfig treats every entry of
# those dicts as a keyword argument for the sub-config, so a nested "config"
# object is not unpacked and the sub-configs would fall back to their default
# hyperparameters rather than the pretrained ones loaded above.
rag_config = RagConfig.from_question_encoder_generator_configs(
    encoder_config,
    generator_config,
    # Retrieve from the locally extracted index rather than a hosted one.
    index_name="custom",
    passages_path=filepath + "/dataset",
    index_path=filepath + "/index.faiss",
)

In [26]:

# Retriever built from `rag_config` (which names the custom index and passages
# paths). It is handed two tokenizers, each loaded from the corresponding
# pretrained checkpoint name: one for the question encoder, one for the generator.
rag_retriever = RagRetriever(
    config=rag_config,
    question_encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_model_name),
    generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name),
)

In [16]:
class CustomQuestionEncoder(AlbertModel):
    """ALBERT encoder whose ``pooler_output`` is a mask-aware mean of token embeddings.

    The parent ``AlbertModel.forward`` runs unchanged. Its first output element
    (the per-token embeddings) is then averaged over the positions selected by
    the attention mask, and that average is returned as ``pooler_output``.
    """

    def forward(self, *args, **kwargs):
        """Run ALBERT and mean-pool the token embeddings.

        Args:
            *args, **kwargs: forwarded verbatim to ``AlbertModel.forward``. The
                attention mask is picked up from the ``attention_mask`` keyword
                or, failing that, from the second positional argument.

        Returns:
            BaseModelOutputWithPooling: ``pooler_output`` holds the pooled
            embedding; ``hidden_states`` and ``attentions`` are passed through
            from the parent output when it provides them (``None`` otherwise).
            ``last_hidden_state`` is left unset.
            NOTE(review): presumably left unset so that integer indexing of the
            result yields the pooled vector first - confirm against the caller.
        """
        # Call the original forward method
        outputs = super().forward(*args, **kwargs)

        # First element of the model output contains all token embeddings
        token_embeddings = outputs[0]

        # Locate the attention mask however the caller supplied it. Looking only
        # at kwargs would ignore a positional mask and average padding tokens in.
        attention_mask = kwargs.get("attention_mask")
        if attention_mask is None and len(args) > 1:
            # AlbertModel.forward(input_ids, attention_mask, ...)
            attention_mask = args[1]
        if attention_mask is None:
            # No mask given: treat every position as a real token.
            attention_mask = torch.ones(token_embeddings.shape[:2], device=token_embeddings.device)

        # Broadcast the mask over the embedding dimension, zero out masked
        # positions and divide by the number of kept positions. The clamp
        # guards against division by zero for an all-masked row.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        pooler_output = summed_embeddings / token_counts

        # getattr keeps this working when the parent returned a plain tuple
        # (return_dict=False), which has no named fields.
        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )

    # NOTE: training/validation step hooks do not belong on the encoder itself;
    # the commented-out draft of them was dropped from this class.


# Use the custom question encoder: load the pretrained encoder checkpoint into
# the CustomQuestionEncoder subclass so its forward pass returns the mean-pooled
# embedding as `pooler_output`.
question_encoder_model = CustomQuestionEncoder.from_pretrained(encoder_model_name)

In [17]:
# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Assemble the RAG model from the already-built parts: the shared config, the
# retriever, the custom question encoder and a pretrained BART generator.
rag_model = RagSequenceForGeneration(
    generator=BartForConditionalGeneration.from_pretrained(generator_model_name),
    question_encoder=question_encoder_model,
    retriever=rag_retriever,
    config=rag_config,
)

# Matching tokenizer pair: one tokenizer per sub-model.
rag_tokenizer = RagTokenizer(
    question_encoder=AutoTokenizer.from_pretrained(encoder_model_name),
    generator=AutoTokenizer.from_pretrained(generator_model_name),
)

## Create dataset

In [18]:
# Create pytorch dataset
class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question per item.

    Args:
        data: pandas DataFrame holding the questions (rows are accessed with ``.iloc``).
        tokenizer: object exposing a ``question_encoder`` tokenizer callable.
        max_length: every item is truncated/padded to exactly this many tokens
            so that items can be stacked into a batch without a custom collator.
        question_column: name of the DataFrame column containing the question text.
    """

    def __init__(self, data, tokenizer, max_length=512, question_column="question"):
        self.data = data
        self.tokenizer = tokenizer

        # Fixed sequence length applied to every encoded question.
        self.max_length = max_length
        self.question_column = question_column

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the question at positional index ``idx``.

        Returns:
            dict: one entry per tokenizer output field, each with the leading
            batch dimension of size 1 removed.
        """
        question = self.data.iloc[idx][self.question_column]
        question_encoding = self.tokenizer.question_encoder(
            question,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a batch of one; drop that dimension so
        # batching produces (batch, seq) rather than (batch, 1, seq).
        return {name: tensor.squeeze(0) for name, tensor in question_encoding.items()}

In [19]:


# Example: Load a dataset
questions_dataset = load_dataset("web_questions")

# Accessing data
print(questions_dataset["train"][0])

# Use pd.json_normalize to convert the JSON to a DataFrame
questions_df = pd.json_normalize(questions_dataset["train"], meta=['url','question', 'answers'])

# The records printed above carry the question text under "qText", while the
# rest of the notebook reads a "question" column. Expose it under that name
# when it is not already present.
if "question" not in questions_df.columns and "qText" in questions_df.columns:
    questions_df = questions_df.rename(columns={"qText": "question"})

# Split the dataset into training and validation. The seed is fixed so that a
# fresh kernel reproduces the same split.
train_df, val_df = train_test_split(questions_df, test_size=0.3, random_state=42)

Generating train split: 0 examples [00:00, ? examples/s]

{'qText': 'what is the name of justin bieber brother?', 'qId': 'wqr000000', 'answers': ['Jazmyn Bieber', 'Jaxon Bieber']}


In [20]:
# Stand-alone encoder tokenizer (the datasets below use `rag_tokenizer` instead).
tokenizer = AutoTokenizer.from_pretrained(encoder_model_name)

# Wrap the training and validation frames, in that order, in QuestionDataset.
train_dataset, val_dataset = (
    QuestionDataset(split_df, rag_tokenizer) for split_df in (train_df, val_df)
)

## Fine tuning

In [21]:
# Define training arguments: checkpoints go to ./trainer, and evaluation runs
# once at the end of every epoch.
training_args = TrainingArguments(
    output_dir="trainer",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    evaluation_strategy="epoch",
)

# Instantiate the Trainer.
# Evaluation uses the held-out validation split; evaluating on the training
# split would only measure how well the model fits data it has already seen.
# NOTE(review): Trainer expects the model's forward pass to produce a loss, but
# CustomQuestionEncoder.forward returns only pooled embeddings - confirm the
# training objective before relying on this cell.
trainer = Trainer(
    model=question_encoder_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

## Testing the Model

In [23]:
# Move the assembled RAG model onto the selected device before generating.
rag_model.to(device)

# Single smoke-test question, tokenized with the question-encoder tokenizer and
# placed on the same device as the model.
question = "What is the capital of the Netherlands"
inputs = rag_tokenizer.question_encoder(question, return_tensors="pt").to(device)

# Beam search over at most 50 new tokens; only the ids and the mask are passed on.
generated = rag_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    num_beams=4,
    early_stopping=False,
)

# Decode the batch and keep the first sequence.
generated_string = rag_tokenizer.batch_decode(
    generated,
    skip_special_tokens=True,
)[0]

print("Question:", question)
print("Answer:", generated_string)

Question: What is the capital of the Netherlands
Answer: Netherlands / The Netherlands, informally Holland, is a country located in northwestern Europe with overseas territories in the Caribbean. It is the largest of the four constituent countries of the Kingdom of the Netherlands. The Netherlands consists of twelve provinces;


In [28]:
# Adapted from https://github.com/huggingface/transformers/blob/main/examples/research_projects/rag/finetune_rag.py#L97
# This deliberately relies on notebook-level globals (rag_retriever, rag_config,
# rag_tokenizer, rag_model) to keep the code simple.

class GenerativeQAModule(BaseTransformer):
    """Module wrapping the notebook's RAG model for generative QA fine-tuning.

    The constructor takes only the hyperparameters; the config, tokenizer,
    model and retriever are read from the notebook globals built in earlier cells.
    """

    mode = "generative_qa"
    loss_names = ["loss"]
    metric_names = ["em"]  # exact match
    val_metric = "em"

    def __init__(self, hparams, **kwargs):
        """Attach the global retriever and initialise the base module.

        Args:
            hparams: hyperparameters forwarded to ``BaseTransformer``.
            **kwargs: accepted for call compatibility; not forwarded.
        """
        # Set before the base initialiser runs, as in the original cell.
        self.retriever = rag_retriever

        super().__init__(hparams, config=rag_config, tokenizer=rag_tokenizer, model=rag_model)

In [39]:
# Build the hyperparameters from the fine-tuning shell script using the project
# helper. NOTE(review): presumably this parses the command-line flags found in
# the script - confirm in reqs.utils.
hparams = parse_sh_args("fine_tune_rag.sh")
# Instantiate the QA module with those hyperparameters.
QAModule = GenerativeQAModule(hparams)