In [31]:
from datasets import load_dataset
from langchain_community.vectorstores import FAISS
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
import pandas as pd


In [32]:
import warnings
warnings.filterwarnings("ignore")


Load the retriever with a similarity-score threshold of 0.78, so that fine-tuning data can be selected based on whether related contexts are available for each question.

In [33]:
# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
# MPS is probed with `is_available()` rather than `is_built()`: a PyTorch binary can
# be built with MPS support and still run on a machine that has no usable MPS device.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Single source of truth for the embedding model: the same name selects the model
# and the sub-directory of the saved FAISS index.
EMBEDDING_MODEL_NAME = "thenlper/gte-base"
FAISS_INDICES_DIR = "../embeddings/faiss_indices/"
# Minimum relevance score a document needs in order to be returned by the retriever.
SCORE_THRESHOLD = 0.78

# Use the detected device instead of a hard-coded "mps", so the cell also runs on
# CUDA and CPU-only machines.
model_kwargs = {"device": device.type}
encode_kwargs = {"normalize_embeddings": True}
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

faiss_index_path = f"{FAISS_INDICES_DIR}{EMBEDDING_MODEL_NAME}"
# NOTE(review): newer langchain_community releases may require passing
# `allow_dangerous_deserialization=True` here — confirm against the installed version.
db = FAISS.load_local(faiss_index_path, embedding_model)
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": SCORE_THRESHOLD},
)

In [34]:
# Load the "pqa_artificial" configuration of the qiaojin/pubmed_qa dataset.
pubmed_dataset = load_dataset(path="qiaojin/pubmed_qa", name="pqa_artificial")

In [35]:
# Display the loaded dataset object to inspect its splits, features and row count.
pubmed_dataset

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 211269
    })
})

In [36]:
# Keep only the question and its long-form answer; the latter is renamed to "answer".
unused_columns = ["pubid", "context", "final_decision"]
pubmed_dataset = (
    pubmed_dataset
    .remove_columns(unused_columns)
    .rename_column("long_answer", "answer")
)
pubmed_dataset


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 211269
    })
})

In [37]:
# Upper bound on the number of question-answer pairs to collect.
MAX_DATA_COUNT = 10_000
# Accumulator for the selected question-answer pairs.
data_for_finetuning = list()

In [39]:
# Walk the dataset and keep only the question-answer pairs whose question retrieves
# at least one related context (similarity score: 0.78), stopping as soon as
# MAX_DATA_COUNT pairs have been collected.
for example in pubmed_dataset["train"]:
    # Check the cap before doing any work, so that re-running this cell with an
    # already-full list (or with MAX_DATA_COUNT = 0) never overshoots the limit
    # and never pays for a retrieval whose result would be discarded.
    if len(data_for_finetuning) >= MAX_DATA_COUNT:
        break
    # Retrieve related contexts; an empty result means the question is skipped.
    related_contexts = retriever.invoke(example["question"])
    if not related_contexts:
        continue
    data_for_finetuning.append(example)
    # Progress report every 1000 accepted pairs.
    if len(data_for_finetuning) % 1000 == 0:
        print(f"{len(data_for_finetuning)} data were added")

1000 data were added
2000 data were added
3000 data were added
4000 data were added
5000 data were added
6000 data were added
7000 data were added
8000 data were added
9000 data were added
10000 data were added


Save fine-tuning data

In [40]:
# Turn the collected records into a table and write it to disk without the index column.
df = pd.DataFrame(data=data_for_finetuning)
df.to_csv(path_or_buf='hypothetical_fine-tuning_data.csv', index=False)

Load the fine-tuning data. Do not forget to convert each row into formatted text before fine-tuning the model.

In [44]:
def format_medical_query(data):
    """Build the fine-tuning text for a single question-answer record.

    Args:
        data: A mapping-like record (e.g. a dict or a DataFrame row) that can be
            indexed with the keys 'question' and 'answer'.

    Returns:
        One string holding the instruction wrapped in ``[INST]`` tags, followed by
        the medical query and its answer, enclosed in ``<s>`` / ``</s>`` markers.
    """
    instruction = "Provide a concise and informative answer to the following medical query:"

    # Assemble the prompt piece by piece; spacing and line breaks are part of the format.
    header = f"<s>[INST] {instruction}[/INST] "
    query_part = f"\nMedical query: {data['question']}  "
    answer_part = f"\nAnswer: {data['answer']} </s>"
    return header + query_part + answer_part

In [42]:
# Read the previously saved question-answer pairs back from disk.
loaded_data = pd.read_csv(filepath_or_buffer='hypothetical_fine-tuning_data.csv')

In [45]:
# Preview the formatted training text for the first saved example.
first_example = loaded_data.iloc[0]
print(format_medical_query(first_example))

<s>[INST] Provide a concise and informative answer to the following medical query:[/INST] 
Medical query: Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?  
Answer: As ILC2s are elevated in patients with CRSwNP, they may drive nasal polyp formation in CRS. ILC2s are also linked with high tissue and blood eosinophilia and have a potential role in the activation and survival of eosinophils during the Th2 immune response. The association of innate lymphoid cells in CRS provides insights into its pathogenesis. </s>
