In [1]:
import sys
import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np

# Point the Hugging Face cache at this account's scratch space.
# The account id is read once inside a `with` block (so the handle is closed)
# and stripped, so a trailing newline in the file cannot leak into the paths.
# NOTE(review): this runs after the Hugging Face libraries were imported above;
# confirm they still honour HF_HOME when it is set post-import, otherwise move
# this assignment above those imports.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_file:
    hpc_account_id = account_file.read().strip()
scratch_root = '/scratch/' + hpc_account_id
os.environ['HF_HOME'] = scratch_root
cache_dir = scratch_root + '/cache'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub dataset repo id. It is both the source passed to
# load_dataset and the target repo_id of the final push_to_hub call.
DATASET = "LeoZotos/bio_full"

In [3]:
# Read the Hugging Face access token from the local token file (whitespace
# trimmed), then pick the torch device: CUDA when available, CPU otherwise.
hf_api_key = ""
with open("../tokens/HF_TOKEN.txt") as token_file:
    hf_api_key = token_file.read().strip()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Sentence-embedding model (all-MiniLM-L6-v2 checkpoint). It is handed to
# classify_docs_per_distractor, which uses it to embed answer choices and docs
# and to score their similarity.
sentence_bert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [5]:
# Load the 'train' split of DATASET, authenticating with the HF token and
# using cache_dir (under the scratch directory) as the download cache.
data = load_dataset(DATASET, split='train', token=hf_api_key, cache_dir=cache_dir)

Generating train split: 100%|██████████| 778/778 [00:00<00:00, 7290.53 examples/s]


In [11]:
def print_similarities(similarities, sentences1, sentences2):
    """Print a similarity table for debugging.

    For every entry of ``sentences1`` the entry itself is printed on its own
    line, followed by one line per entry of ``sentences2`` showing that entry
    (left-aligned in a 30-character field) and the matching score formatted
    to four decimals.

    Args:
        similarities: 2-D indexable of scores, addressed as
            ``similarities[i][j]`` for ``sentences1[i]`` vs ``sentences2[j]``.
        sentences1: Iterable of strings forming the table rows.
        sentences2: Iterable of strings forming the table columns.
    """
    for row, left in enumerate(sentences1):
        print(left)
        for col, right in enumerate(sentences2):
            score = similarities[row][col]
            print(" - {: <30}: {:.4f}".format(right, score))


def classify_docs_per_distractor(
    row,
    sentence_bert_model,
    choice_keys=('Answer_A', 'Answer_B', 'Answer_C', 'Answer_D'),
    docs_key='Relevant_Docs_Simple',
):
    """Assign each retrieved doc of a question to its most similar answer choice.

    Every non-empty answer choice and every doc is embedded with
    ``sentence_bert_model``; each doc is then appended to the bucket of the
    choice with the highest similarity score (the first such choice on ties).

    Args:
        row: Mapping holding the answer-choice texts under ``choice_keys`` and
            a list of doc strings under ``docs_key``.
        sentence_bert_model: Object exposing ``encode(list_of_str)`` and
            ``similarity(choice_embeddings, doc_embeddings)``; the latter must
            return a 2-D (choices x docs) score matrix.
        choice_keys: Names of the answer-choice fields to consider.
        docs_key: Name of the field holding the docs to distribute.

    Returns:
        dict mapping ``"<choice_key>_Docs"`` to the list of docs assigned to
        that choice. Choices whose text is ``""`` or ``None`` get no entry.
        When the row has no docs, every present choice maps to an empty list;
        when it has no non-empty choices, the result is an empty dict.
    """
    # Only choices that actually carry text take part in the matching.
    present_keys = [
        key for key in choice_keys
        if row[key] is not None and row[key] != ""
    ]
    docs_per_choice = {key + '_Docs': [] for key in present_keys}

    docs = row[docs_key]
    if docs is None:
        docs = []
    # Nothing to match: skip the encoder, which is not meant for empty input,
    # and avoid an argmax over an empty axis.
    if not present_keys or len(docs) == 0:
        return docs_per_choice

    choices_content = [row[key] for key in present_keys]
    embeddings_choices = sentence_bert_model.encode(choices_content)
    embeddings_docs = sentence_bert_model.encode(docs)
    similarities = sentence_bert_model.similarity(embeddings_choices, embeddings_docs)

    # might be useful for debugging
    # print_similarities(similarities, choices_content, docs)

    # Tensor-like results are moved to host memory before the NumPy conversion.
    if hasattr(similarities, "detach"):
        similarities = similarities.detach().cpu()
    # Row index of the best-scoring choice for every doc (column).
    best_choice_per_doc = np.asarray(similarities).argmax(axis=0)

    # Bucket names are in the same order as the rows of the similarity matrix.
    bucket_names = list(docs_per_choice)
    for doc, choice_idx in zip(docs, best_choice_per_doc):
        docs_per_choice[bucket_names[int(choice_idx)]].append(doc)

    return docs_per_choice


In [12]:
# Build one new dataset column per answer choice, holding the docs that were
# assigned to that choice for each question.
column_names = [f"Answer_{choice}_Docs" for choice in ['A', 'B', 'C', 'D']]
docs_by_choice = {name: [] for name in column_names}

for row in tqdm(data):
    docs_per_choice_for_row = classify_docs_per_distractor(row, sentence_bert_model)
    for name in column_names:
        # Choices that are empty for this question have no entry in the result;
        # store an empty list so every column has one value per row.
        docs_by_choice[name].append(docs_per_choice_for_row.get(name, []))

# Keep the cell re-runnable: drop whichever target columns already exist.
# Each name is checked (not only the first) so a dataset that carries just
# some of the columns is handled as well.
stale_columns = [name for name in column_names if name in data.column_names]
if stale_columns:
    data = data.remove_columns(stale_columns)

for name, column_data in docs_by_choice.items():
    data = data.add_column(name, column_data)

100%|██████████| 778/778 [00:12<00:00, 63.19it/s]


In [13]:
# Inspect an instance manually to see if it makes sense.
# The index is kept in `example_idx` rather than `id` so the builtin `id()`
# is not shadowed for the rest of the kernel session, and the row is fetched
# once instead of once per printed field.
example_idx = 22
example = data[example_idx]
print(example['Question_With_Options'], ":", "\n A:", example['Answer_A_Docs'], "\n B:", example['Answer_B_Docs'], "\n C:", example['Answer_C_Docs'], "\n D:", example['Answer_D_Docs'])

what is one of the contradictions in "paradoxical" sleep?
a) the brain is very active, while many of the muscles are deeply relaxed
b) subcortical structures are very active, while the cerebral cortex is inactive
c) the frequency of the brain waves is low, while the amplitude is high
d) postural muscles are tense, while heart rate and breathing rate are very low
 : 
 A: ['Sleep Stage 4: The dreaming stage in which brain waves are more vigorous with rapid eye movement. Awakenings are more common in REM (Rapid Eye Movement) sleep as opposed to NREM.', 'Sleep paralysis Little is known about the physiology of sleep paralysis. However, some have suggested that it may be linked to post-synaptic (neurons sending signals to other neurons) inhibition (restraint) of nerves in the pons (back) region of the brain. In particular, low levels of melatonin may stop the depolarization current in the nerves, which stops stimulation of the muscles.', 'Sleep Sleep is a state of resting, which happens in a

In [14]:
# Upload the dataset (now carrying the Answer_*_Docs columns) to the Hugging Face Hub.
# The target repo is DATASET, i.e. the same repo id the data was loaded from,
# and private=True is passed so the repo is requested as private.
data.push_to_hub(
    repo_id=DATASET,
    commit_message="Classified docs per choice",
    token=hf_api_key,
    private=True
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 19.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LeoZotos/bio_full/commit/00d86c049741d0643a15a88b6bf8c2d0001381f8', commit_message='Classified docs per choice', commit_description='', oid='00d86c049741d0643a15a88b6bf8c2d0001381f8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LeoZotos/bio_full', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LeoZotos/bio_full'), pr_revision=None, pr_num=None)