In [1]:
import os
import sys

# The Hugging Face libraries read HF_HOME when they are imported, so the variable
# has to be exported BEFORE the transformers / sentence_transformers / datasets
# imports below; setting it afterwards does not relocate their caches.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_file:
    # strip() removes the trailing newline that would otherwise end up inside the paths
    hpc_account_id = account_file.read().strip()

os.environ['HF_HOME'] = '/scratch/' + hpc_account_id
cache_dir = '/scratch/' + hpc_account_id + '/cache'

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub dataset repository id: loaded below and used again as the
# target repository of the final push_to_hub call.
DATASET = "LeoZotos/immu_full"

In [3]:
# Read the Hugging Face access token from a local file so it never appears in
# the notebook itself. No "" placeholder is assigned first: if the read fails,
# the name stays undefined instead of silently carrying an empty token forward.
with open("../tokens/HF_TOKEN.txt", "r") as f:
    hf_api_key = f.read().strip()
if not hf_api_key:
    # Fail here rather than later, at the dataset load / push, with an empty token.
    raise ValueError("../tokens/HF_TOKEN.txt is empty; a Hugging Face token is required")

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Sentence-embedding checkpoint used to compare answer choices with documents.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
sentence_bert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
# Load the 'train' split of DATASET, authenticating with the token read above
# and storing the files under cache_dir.
data = load_dataset(
    DATASET,
    split='train',
    cache_dir=cache_dir,
    token=hf_api_key,
)

Generating train split: 100%|██████████| 843/843 [00:00<00:00, 4953.44 examples/s]


In [6]:
def print_similarities(similarities, sentences1, sentences2):
    """Print a similarity table: each sentence of ``sentences1`` followed by its scores.

    Args:
        similarities: 2-D indexable where ``similarities[i][j]`` is the score
            between ``sentences1[i]`` and ``sentences2[j]``.
        sentences1: Sentences printed as group headers, one per row of scores.
        sentences2: Sentences listed under every header, left-aligned and padded
            to 30 characters, each followed by its score with 4 decimals.
    """
    for row_no, header in enumerate(sentences1):
        print(header)
        for col_no, candidate in enumerate(sentences2):
            label = format(candidate, " <30")
            score = format(similarities[row_no][col_no], ".4f")
            print(" - " + label + ": " + score)


def classify_docs_per_distractor(row, sentence_bert_model):
    """Assign each document of a question to the answer choice it is most similar to.

    Args:
        row: Mapping holding the answer texts under 'Answer_A' .. 'Answer_D' and a
            list of document strings under 'Relevant_Docs_Simple'.
        sentence_bert_model: Object exposing ``encode(list_of_str)`` and
            ``similarity(choice_embeddings, doc_embeddings)``; the similarity
            result is indexed as ``[choice, doc]``.

    Returns:
        Dict mapping '<answer key>_docs' (e.g. 'Answer_A_docs') to the list of
        documents whose highest-similarity choice is that answer. Only choices
        with a non-empty text get a key; a choice that attracts no document maps
        to an empty list. A row without documents yields only empty lists, and a
        row without any non-empty choice yields an empty dict.
    """
    # Keep only the choices that actually have text; empty ones cannot be embedded
    # meaningfully and must not receive documents.
    choice_keys = [
        key for key in ['Answer_A', 'Answer_B', 'Answer_C', 'Answer_D']
        if row[key] not in ("", None)
    ]
    docs_per_choice = {key + '_docs': [] for key in choice_keys}

    docs = row['Relevant_Docs_Simple']
    # Nothing to embed or nothing to assign to: skip the model calls, which are
    # not guaranteed to accept empty inputs.
    if not choice_keys or docs is None or len(docs) == 0:
        return docs_per_choice

    choices_content = [row[key] for key in choice_keys]
    embeddings_choices = sentence_bert_model.encode(choices_content)
    embeddings_docs = sentence_bert_model.encode(docs)
    similarities = sentence_bert_model.similarity(embeddings_choices, embeddings_docs)

    # might be useful for debugging
    # print_similarities(similarities, choices_content, docs)

    # Row order of `similarities` follows `choice_keys`, hence also the insertion
    # order of `docs_per_choice`; build the lookup once instead of per document.
    bucket_names = list(docs_per_choice)

    # We now add each doc to the choice with the highest similarity
    for i, doc in enumerate(docs):
        max_sim_index = int(np.argmax(similarities[:, i]))
        docs_per_choice[bucket_names[max_sim_index]].append(doc)

    return docs_per_choice


In [7]:
# One output column per answer choice; each holds, per row, the list of documents
# assigned to that choice (empty list when the choice is absent or got no document).
column_names = [f"Answer_{choice}_docs" for choice in ['A', 'B', 'C', 'D']]
docs_by_choice = {name: [] for name in column_names}

for row in tqdm(data):
    docs_per_choice_for_row = classify_docs_per_distractor(row, sentence_bert_model)
    for name in column_names:
        docs_by_choice[name].append(docs_per_choice_for_row.get(name, []))

# Drop whichever of the target columns already exist (e.g. from a previous run) so
# the cell can be re-executed. Checking every name, not just the first, keeps this
# working when only some of the columns are present.
stale_columns = [name for name in column_names if name in data.column_names]
if stale_columns:
    data = data.remove_columns(stale_columns)

for name, column_data in docs_by_choice.items():
    data = data.add_column(name, column_data)

  0%|          | 0/843 [00:00<?, ?it/s]

100%|██████████| 843/843 [00:12<00:00, 67.03it/s]


In [8]:
# Inspect an instance manually to see if it makes sense.
# The keys use the lowercase '_docs' suffix of the columns this notebook adds;
# 'Answer_X_Docs' (capital D) would read columns already present in the loaded
# dataset instead of the ones just computed.
example_idx = 22  # not named `id`, which would shadow the builtin
example = data[example_idx]  # fetch the row once instead of once per field
print(example['Question_With_Options'], ":", "\n A:", example['Answer_A_docs'], "\n B:", example['Answer_B_docs'], "\n C:", example['Answer_C_docs'], "\n D:", example['Answer_D_docs'])

M cells are crucial for the initiation of mucosal immunity. Where can these M cells be found?
A) In Peyer's patches
B) In GALT and in between intestinal epithelial cells
C) In parts of the entire ileum that have contact with mesenteric lymph nodes
D) In lymph nodes
 : 
 A: [] 
 B: ['Mucus A layer of mucus along the inner walls of the stomach is vital to protect the cell linings from the highly acidic environment inside the stomach.  Mucus is not digested in the intestinal tract.', 'Duodenal cancer The duodenum is the first part of the small intestine. It is located between the stomach and the jejunum. After foods combine with stomach acid, they go down into the duodenum where they mix with bile from the gall bladder and digestive juices from the pancreas.', 'Mucous membrane A mucous membrane (or mucosae; singular mucosa) is a skin-like lining. A mucus membrane is covered in epithelium. They secrete mucus, and in the alimentary canal they absorb nutrients. They line cavities that are ex

In [9]:
# Upload the dataset, including the per-choice document columns added above, to
# the Hugging Face Hub. The target is the same repository the data was loaded
# from (repo_id=DATASET), and the repository is requested to be private.
data.push_to_hub(
    repo_id=DATASET,
    commit_message="Classified docs per choice",
    token=hf_api_key,
    private=True
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 21.84ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LeoZotos/immu_full/commit/4df6f3bc9370a4bcd4f36bd651772142ed0aeabb', commit_message='Classified docs per choice', commit_description='', oid='4df6f3bc9370a4bcd4f36bd651772142ed0aeabb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LeoZotos/immu_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LeoZotos/immu_full'), pr_revision=None, pr_num=None)