In [2]:
# Mount Google Drive at /content/drive; later cells read and write files under this path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Use %pip (not !pip) so packages are installed into the running kernel's environment.
%pip install faiss-cpu
# NOTE(review): scispacy is unpinned while the model below is the v0.5.3 release;
# confirm the two are compatible, or pin scispacy to the matching release.
%pip install scispacy
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz

# Install other dependencies in a single command so pip resolves them together
%pip install torch torchvision torchaudio numpy tqdm transformers spacy jsonlines scikit-learn pyyaml sentence-transformers datasets


In [18]:
import os
import zipfile

# Where the RAG2 archive lives on Drive, and the folder it is unpacked into
zip_path = "/content/drive/MyDrive/RAG2/RAG2.zip"
extract_path = "/content/drive/MyDrive/RAG2/"

# Unpack the whole repository archive into the target folder
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# List the target folder so the result of the extraction is visible
extracted_entries = os.listdir(extract_path)
print("Extracted files:", extracted_entries)


Extracted files: ['RAG2.zip', 'RAG2.ipynb', 'RAG2']


# Step 1: Prepare the data and environment
Before running main.py, we need to have the following:
* MedQA dataset (JSON format)
* Biomedical corpora (PubMed, PMC, CPG, Textbooks) in text format
* A conda environment (retriever.yml)
* FAISS embeddings for the corpora (need to precompute)
* Preprocessed articles (snippets) stored in the corresponding folders

In [12]:
# Core retrieval dependencies (%pip installs into the running kernel's environment)
%pip install faiss-cpu scispacy transformers torch numpy tqdm
%pip install spacy
# SciSpacy model: `spacy download` only serves spaCy's own model releases, so
# en_core_sci_scibert has to be installed from its release tarball instead.
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz

[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/

# Step 2: Precompute Embeddings
First, download the embeddings and articles and save them in the corresponding directories:
* /content/drive/MyDrive/RAG2/RAG2/retriever/embeddings/
* /content/drive/MyDrive/RAG2/RAG2/retriever/articles/

## 2.1 Preprocessing Articles
* For PubMed abstracts (raw corpus >200GB), we can use the pre-computed MedCPT embeddings, eliminating the need to download, embed, or store the corpus. These embeddings enable efficient FAISS index construction.

In [1]:
# Download the MedCPT embeddings of PubMed articles (latest 1M articles to test)
# -nc (no-clobber) skips files already present in the working directory, so re-running
# this cell neither re-downloads the ~2.8G embeddings nor creates ".1" duplicate files.
!wget -nc https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/embeds_chunk_36.npy # these are the embeddings
!wget -nc https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_36.json # these are the corresponding PMIDs (PubMed article IDs)
!wget -nc https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pubmed_chunk_36.json # this is the article content for those PMIDs

--2025-02-10 14:27:36--  https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/embeds_chunk_36.npy
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 130.14.250.10, 130.14.250.11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3051946112 (2.8G)
Saving to: ‘embeds_chunk_36.npy’


2025-02-10 14:32:23 (10.2 MB/s) - ‘embeds_chunk_36.npy’ saved [3051946112/3051946112]

--2025-02-10 14:32:23--  https://ftp.ncbi.nlm.nih.gov/pub/lu/MedCPT/pubmed_embeddings/pmids_chunk_36.json
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.12, 130.14.250.13, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11921664 (11M) [application/json]
Saving to: ‘pmids_chunk_36.json’


2025-02-10 14:32:27 (5.38 MB/s) - ‘pmids_chunk_36.json’ saved [11921664/119

## 2.2 Encoding Articles into Embeddings
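To make the articles searchable with FAISS:

* Use the pre-computed MedCPT article embeddings downloaded above; a Transformer-based retriever model (MedCPT Query Encoder) is loaded later to convert query text into dense vectors.
* Store the embeddings in the embeddings directory so the retriever can load them into its FAISS index.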

In [2]:
# Make sure the destination folders exist; mv fails when a target directory is missing
!mkdir -p /content/drive/MyDrive/RAG2/RAG2/retriever/embeddings/pubmed
!mkdir -p /content/drive/MyDrive/RAG2/RAG2/retriever/articles/pubmed

# Move the embeddings
!mv embeds_chunk_36.npy /content/drive/MyDrive/RAG2/RAG2/retriever/embeddings/pubmed/PubMed_Embeds_36.npy

# Move the corresponding articles
!mv pubmed_chunk_36.json /content/drive/MyDrive/RAG2/RAG2/retriever/articles/pubmed/PubMed_Articles_36.json
!mv pmids_chunk_36.json /content/drive/MyDrive/RAG2/RAG2/retriever/articles/pubmed/PubMed_PMIDs_36.json


# Step 3: Run the Retriever
The retriever folder of RAG2 contains the following key scripts:


* main.py: The main script orchestrating retrieval across multiple sources.
* retrieve.py: Loads FAISS indices and decodes retrieved documents.
* query_encode.py: Converts input queries into embeddings.
* rerank.py: Reranks retrieved documents using a cross-encoder model.
* retriever.yml: Lists dependencies.


Now that we have precomputed embeddings and stored them in the embeddings/ directory, we can run main.py to retrieve relevant evidence for MedQA queries.


In [8]:
# Install SciSpacy and dependencies
# %pip (not !pip) installs into the running kernel's environment.
# NOTE(review): scispacy is unpinned while the model below is the v0.5.3 release;
# confirm the two are compatible, or pin scispacy to the matching release.
%pip install scispacy
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz

# Install other dependencies in a single command so pip resolves them together
%pip install torch torchvision torchaudio numpy tqdm transformers spacy jsonlines scikit-learn pyyaml sentence-transformers datasets



Collecting scispacy
  Downloading scispacy-0.5.5-py3-none-any.whl.metadata (18 kB)
Collecting conllu (from scispacy)
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting nmslib-metabrainz==2.1.3 (from scispacy)
  Downloading nmslib_metabrainz-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (956 bytes)
Collecting pybind11>=2.2.3 (from nmslib-metabrainz==2.1.3->scispacy)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading scispacy-0.5.5-py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nmslib_metabrainz-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading conllu

In [9]:
# Download spaCy's general-purpose English model (en_core_web_sm).
# Note: this is a spaCy model, not a SciSpacy one.
!python -m spacy download en_core_web_sm

2025-02-10 14:58:01.888620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739199482.134885    3203 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739199482.203254    3203 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 14:58:02.719579: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_we

In [7]:
# Run the retriever's main script from the unpacked repository, passing `--corpus pubmed`
!python /content/drive/MyDrive/RAG2/RAG2/retriever/main.py --corpus pubmed


2025-02-10 15:26:08.496503: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739201168.516991   10740 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739201168.523206   10740 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 15:26:08.544558: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
query encoding: 100% 1273/1273 [00:17<00:00, 74.81it/s]
pubmed load and add: 100% 1/1 [00:23<00:00, 23.47s/it]
PubMed

In [4]:
# Show the MedQA input folder, then preview the first 20 lines of the query file
!ls -lh /content/drive/MyDrive/RAG2/RAG2/retriever/input/medqa/
!head -n 20 /content/drive/MyDrive/RAG2/RAG2/retriever/input/medqa/medqa_llama_cot.json


total 1.3M
-rw------- 1 root root 1.3M Feb 10 14:20 medqa_llama_cot.json
{"question": "A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?", "answer": "Tell the attending that he cannot fail to disclose this mistake", "options": {"A": "Disclose the error to the patient but leave it out of the operative report", "B": "Disclose the error to the patient and put it in the operative report", "C": "Tell the attending that he cannot fail to discl

In [None]:
import faiss
import torch
import numpy as np
import json
from pathlib import Path
from transformers import AutoTokenizer, AutoModel

# Number of nearest articles to return per query
TOP_K = 10

# The chunk-36 files may still be in the working directory (fresh download) or may
# already have been moved into the retriever folders on Drive; check both places.
RETRIEVER_DIR = Path("/content/drive/MyDrive/RAG2/RAG2/retriever")
EMBEDS_CANDIDATES = [
    Path("embeds_chunk_36.npy"),
    RETRIEVER_DIR / "embeddings" / "pubmed" / "PubMed_Embeds_36.npy",
]
PMIDS_CANDIDATES = [
    Path("pmids_chunk_36.json"),
    RETRIEVER_DIR / "articles" / "pubmed" / "PubMed_PMIDs_36.json",
]


def first_existing(candidates):
    """Return the first path in `candidates` that exists.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    for candidate in candidates:
        if candidate.exists():
            return candidate
    tried = ", ".join(str(candidate) for candidate in candidates)
    raise FileNotFoundError(f"None of these files exist: {tried}")


# building the Faiss index of PubMed articles, let's use the flat inner product index
# (FAISS expects contiguous float32 data; this is a no-op when the array already is)
pubmed_embeds = np.ascontiguousarray(
    np.load(first_existing(EMBEDS_CANDIDATES)), dtype=np.float32
)
index = faiss.IndexFlatIP(pubmed_embeds.shape[1])  # dimension taken from the data
index.add(pubmed_embeds)

# these are the corresponding pmids for the article embeddings
with open(first_existing(PMIDS_CANDIDATES)) as pmids_file:
    pmids = json.load(pmids_file)

# Row i of the embeddings must describe pmids[i]; a length mismatch means the files are not a pair
assert len(pmids) == index.ntotal, (
    f"{len(pmids)} PMIDs but {index.ntotal} embeddings - files do not match"
)

model = AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder")
tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder")

queries = [
    "How to treat diabetes with COVID-19?",
    "Are mRNA vaccines safe for children?"
]

with torch.no_grad():
    # tokenize the queries
    encoded = tokenizer(
        queries,
        truncation=True,
        padding=True,
        return_tensors='pt',
        max_length=64,
    )

    # encode the queries (use the [CLS] last hidden states as the representations)
    embeds = model(**encoded).last_hidden_state[:, 0, :]

# search the Faiss index (hand FAISS an explicit float32 NumPy array, not a torch tensor)
query_embeds = np.ascontiguousarray(embeds.cpu().numpy(), dtype=np.float32)
scores, inds = index.search(query_embeds, TOP_K)

# print the search results
for idx, query in enumerate(queries):
    print(f"Query: {query}")

    for score, ind in zip(scores[idx], inds[idx]):
        if ind < 0:
            # FAISS pads with -1 when fewer than k neighbours are available;
            # without this guard pmids[-1] would silently report the last PMID.
            continue
        print(f"PMID: {pmids[ind]}; Score: {score}")