In [None]:
import os

# Read the HPC account id once (closing the file afterwards) and strip the
# trailing newline a text file usually ends with, so it cannot end up inside
# the paths built below.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_file:
    hpc_account_id = account_file.read().strip()

# HF_HOME has to be exported BEFORE the Hugging Face libraries are imported:
# they resolve their default cache locations at import time, so setting it
# afterwards has no effect on them.
os.environ['HF_HOME'] = '/scratch/' + hpc_account_id
cache_dir = '/scratch/' + hpc_account_id + '/cache'

from datasets import load_dataset
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hugging Face access token, read from a local file; surrounding whitespace is trimmed.
hf_api_key = ""
with open("../tokens/HF_TOKEN.txt", "r") as token_file:
    hf_api_key = token_file.read().strip()

# Run tensor work on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
def _merge_batch(question_embs, best_scores, best_docs, batch_embs, batch_docs, top_k):
    """Scores one batch of passages and folds it into the running top-k of every question."""
    passage_embs = torch.tensor(batch_embs, dtype=torch.bfloat16).to(question_embs.device)
    batch_scores = torch.mm(question_embs, passage_embs.T)

    # Candidate columns [0, num_kept) are the hits kept so far; the remaining
    # columns are the passages of this batch, in batch order.
    num_kept = best_scores.size(1)
    candidates = torch.cat((best_scores, batch_scores), 1)

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, candidates.size(1))
    best_scores, hits = torch.topk(candidates, k)

    # Map each hit index straight to its document instead of first copying the
    # whole batch document list once per question.
    best_docs = [
        [kept[j] if j < num_kept else batch_docs[j - num_kept] for j in row]
        for kept, row in zip(best_docs, hits.tolist())
    ]
    return best_scores, best_docs


def retrieve_relevant_docs(question_set, passages, batch_size = 1000, top_k = 20):
    """Retrieves the top k relevant documents for each question.

    Relevance is the dot product between a question embedding and a passage
    embedding, computed in bfloat16 on the module-level `device`.

    Args:
        question_set: object where `question_set['emb']` yields one embedding
            per question and `len(question_set)` is the number of questions.
        passages: iterable of records with 'emb', 'title' and 'text' entries.
            It does not need to support `len()`.
        batch_size: number of passages scored per matrix multiplication.
            Reduce it if out-of-memory/process is killed.
        top_k: number of documents kept per question. Fewer are returned when
            fewer passages are available.

    Returns:
        One list per question holding "<title> <text>" strings, ordered from
        highest to lowest score.

    Raises:
        ValueError: if `batch_size` or `top_k` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    num_questions = len(question_set)
    if num_questions == 0:
        return []

    question_embs = torch.tensor(question_set['emb'], dtype=torch.bfloat16).to(device)

    # Running top-k state: scores as a (num_questions, <=top_k) tensor and the
    # matching documents as parallel Python lists.
    best_scores = torch.empty((num_questions, 0), dtype=torch.bfloat16, device=question_embs.device)
    best_docs = [[] for _ in range(num_questions)]

    batch_embs = []
    batch_docs = []
    for passage in tqdm(passages):
        batch_embs.append(passage['emb'])
        batch_docs.append({"title": passage['title'], "text": passage['text']})

        if len(batch_embs) == batch_size:
            best_scores, best_docs = _merge_batch(
                question_embs, best_scores, best_docs, batch_embs, batch_docs, top_k
            )
            batch_embs = []
            batch_docs = []

    # Flush the final, possibly partial batch. Doing this after the loop avoids
    # needing len(passages), so unsized iterables work too.
    if batch_embs:
        best_scores, best_docs = _merge_batch(
            question_embs, best_scores, best_docs, batch_embs, batch_docs, top_k
        )

    return [[doc['title'] + " " + doc['text'] for doc in docs] for docs in best_docs]

In [None]:
# Question set; retrieve_relevant_docs reads its 'emb' column.
question_set = load_dataset("LeoZotos/" + "immu" + "_full", split='train', token = hf_api_key, cache_dir=cache_dir)

# Passage corpus ('simple' configuration) with 'emb', 'title' and 'text' fields.
passages = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", 'simple', split="train", cache_dir=cache_dir, token=hf_api_key)

relevant_docs = retrieve_relevant_docs(question_set, passages)

# After a previous run the hub copy already carries this column, and
# add_column rejects duplicate names. Drop the stale column first so the cell
# can be re-run safely.
docs_column = "relevant_docs" + "_" + 'simple'
if docs_column in question_set.column_names:
    question_set = question_set.remove_columns(docs_column)
question_set = question_set.add_column(docs_column, relevant_docs)

# upload to hf
question_set.push_to_hub(
    repo_id="LeoZotos/immu_full",
    commit_message="Added relevant documents from Wiki Simple",
    token=hf_api_key,
    private=True
)

Generating train split: 100%|██████████| 843/843 [00:00<00:00, 27701.99 examples/s]
Generating train split: 100%|██████████| 646424/646424 [00:08<00:00, 79052.17 examples/s]
100%|██████████| 646424/646424 [09:24<00:00, 1144.83it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 28.47ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LeoZotos/immu_full/commit/967e13cfe8d94b796e42251b0f471225ff8a290e', commit_message='Added relevant documents from Wiki Simple', commit_description='', oid='967e13cfe8d94b796e42251b0f471225ff8a290e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LeoZotos/immu_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LeoZotos/immu_full'), pr_revision=None, pr_num=None)

In [10]:
# Spot-check one example: the question with its options, a divider, then the retrieved passages.
example = question_set[1]
print(
    example['Question_with_options'],
    "-----",
    example['relevant_docs_simple'],
    sep="\n",
)

Severe Covid19 can be treated with covalescent serum of people who cleared the infection with SARS-CoV-2 successfully.  What does this treatment resemble?
A) Passive vaccination
B) Active vaccination
C) Blood transfusion
D) CAR-T therapy

-----
['Severe acute respiratory syndrome coronavirus 2 There are no fixed cures for the treatment of COVID-19,  but there are various drugs that have been approved for use such as Hydroxychloroquine and Remdesivir which are detailed below. Other antiviral drugs, interferon therapy and combination of anti-viral and interferons are also being experimented on to get the best possible outcome for patients. These treatments are used to reduce the symptoms and to keep the patients comfortable.', 'COVID-19 pandemic Since there is no exact cure for Covid-19, treatment has focused on treating the symptoms of the disease such as giving oxygen and using machines to aid breathing, giving pain killers to relieve pain, supportive treatment such as giving fluids, f