In [10]:
import os
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Read the HPC account id once and close the file; strip() removes any trailing
# newline/whitespace, which would otherwise be embedded in both paths below.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_file:
    hpc_account_id = account_file.read().strip()

# NOTE(review): huggingface_hub appears to resolve HF_HOME when it is first imported,
# so setting it after the sentence_transformers/datasets imports above may have no
# effect — confirm, and if so move this assignment above those imports.
os.environ['HF_HOME'] = '/scratch/' + hpc_account_id
# Cache directory passed explicitly to load_dataset further down.
cache_dir = '/scratch/' + hpc_account_id + '/cache'

In [11]:
# --- Experiment configuration ---
DATASET = "LeoZotos/bio_full"
WIKI = "En"  # 'En' or 'Simple'
SOURCE_TEXT = ""  # '_Only_Options' or '' for full text
NUM_DOCS_RETRIEVED = 60  # 20 or 60

# Key of the row field that holds the retrieved documents for this configuration.
RETRIEVED_DOCS_COL_NAME = f"Relevant_Docs_{WIKI}{SOURCE_TEXT}_{NUM_DOCS_RETRIEVED}"

In [12]:
# Hugging Face access token, read from a local file rather than hard-coded here.
hf_api_key = ""
with open("../tokens/HF_TOKEN.txt", "r") as token_file:
    hf_api_key = token_file.read().strip()

# Use the GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
# Sentence-embedding model used below to compare answer choices with retrieved documents.
sentence_bert_model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
)

In [14]:
# Load the 'train' split of the configured dataset, authenticating with the HF token
# and caching under the scratch cache directory.
data = load_dataset(
    DATASET,
    split='train',
    token=hf_api_key,
    cache_dir=cache_dir,
)

In [15]:
def print_similarities(similarities, sentences1, sentences2):
    """Print each sentence of ``sentences1`` followed by its score against every sentence of ``sentences2``.

    ``similarities[i][j]`` is read as the score between ``sentences1[i]`` and
    ``sentences2[j]``. Each score is printed with four decimals, next to the
    second sentence left-aligned in a 30-character field.
    """
    score_line = " - {: <30}: {:.4f}"
    for row_idx, anchor_sentence in enumerate(sentences1):
        print(anchor_sentence)
        for col_idx, other_sentence in enumerate(sentences2):
            print(score_line.format(other_sentence, similarities[row_idx][col_idx]))


def classify_docs_per_distractor(row, sentence_bert_model, docs_col_name=None):
    """Assign every retrieved document of a question to its most similar answer choice.

    Args:
        row: Mapping with the keys 'Answer_A' .. 'Answer_D' and the retrieved-documents
            field named by ``docs_col_name``.
        sentence_bert_model: Object exposing ``encode(list_of_texts)`` and
            ``similarity(choice_embeddings, doc_embeddings)``; the similarity result is
            indexed as [choice, document].
        docs_col_name: Key of the retrieved-documents field in ``row``. Defaults to the
            module-level ``RETRIEVED_DOCS_COL_NAME``.

    Returns:
        Dict mapping '<answer key>_Docs' to the list of documents whose highest
        similarity is with that answer. Answers whose text is '' or None get no entry;
        ties go to the first answer in A-D order.
    """
    if docs_col_name is None:
        docs_col_name = RETRIEVED_DOCS_COL_NAME

    # Only answers that actually carry text take part; a None answer cannot be encoded.
    choice_keys = [
        key for key in ('Answer_A', 'Answer_B', 'Answer_C', 'Answer_D')
        if row[key] not in (None, "")
    ]
    docs_per_choice = {key + '_Docs': [] for key in choice_keys}

    docs = row[docs_col_name]
    # Nothing to classify: avoid encoding empty inputs / arg-maxing over an empty axis.
    if not choice_keys or not docs:
        return docs_per_choice

    embeddings_choices = sentence_bert_model.encode([row[key] for key in choice_keys])
    embeddings_docs = sentence_bert_model.encode(docs)
    similarities = sentence_bert_model.similarity(embeddings_choices, embeddings_docs)

    # For every document (column) pick the choice (row) with the highest similarity.
    best_choice_per_doc = np.asarray(similarities).argmax(axis=0)
    # Dict order equals choice_keys order, so bucket i belongs to similarity row i.
    buckets = list(docs_per_choice.values())
    for doc, choice_idx in zip(docs, best_choice_per_doc):
        buckets[choice_idx].append(doc)

    return docs_per_choice

In [16]:
column_names = [f"Answer_{choice}_Docs" for choice in ['A', 'B', 'C', 'D']]
docs_by_choice = {name: [] for name in column_names}

# Classify the retrieved documents of every question and collect them column-wise.
for row in tqdm(data):
    docs_per_choice_for_row = classify_docs_per_distractor(row, sentence_bert_model)
    for name in column_names:
        # Answers missing from the per-row result get an empty list so that all
        # four columns stay aligned with the dataset rows.
        docs_by_choice[name].append(docs_per_choice_for_row.get(name, []))

# Drop whichever target columns already exist (e.g. from an earlier run of this cell).
# Checking each name avoids asking remove_columns for a column that is not there
# when only some of them are present.
stale_columns = [name for name in column_names if name in data.column_names]
if stale_columns:
    data = data.remove_columns(stale_columns)

for name, column_data in docs_by_choice.items():
    data = data.add_column(name, column_data)

  0%|          | 0/778 [00:00<?, ?it/s]

100%|██████████| 778/778 [00:30<00:00, 25.22it/s]


In [17]:
# Inspect an instance manually to see if it makes sense.
# The index is not called `id`, so the builtin id() stays usable in later cells.
example_idx = 22
example = data[example_idx]  # fetch the row once instead of once per printed field
print(
    example['Question_With_Options'], ":",
    "\n A:", example['Answer_A_Docs'],
    "\n B:", example['Answer_B_Docs'],
    "\n C:", example['Answer_C_Docs'],
    "\n D:", example['Answer_D_Docs'],
)

what is one of the contradictions in "paradoxical" sleep?
a) the brain is very active, while many of the muscles are deeply relaxed
b) subcortical structures are very active, while the cerebral cortex is inactive
c) the frequency of the brain waves is low, while the amplitude is high
d) postural muscles are tense, while heart rate and breathing rate are very low
 : 
 A: ['Rapid eye movement sleep Generally speaking, the body suspends homeostasis during paradoxical sleep. Heart rate, cardiac pressure, cardiac output, arterial pressure, and breathing rate quickly become irregular when the body moves into REM sleep. In general, respiratory reflexes such as response to hypoxia diminish. Overall, the brain exerts less control over breathing; electrical stimulation of respiration-linked brain areas does not influence the lungs, as it does during non-REM sleep and in waking. The fluctuations of heart rate and arterial pressure tend to coincide with PGO waves and rapid eye movements, twitches,

## Calculate Correlation

In [18]:
def _has_all_rates(example):
    """Return True when none of the four Answer_*_Rate fields of ``example`` is None."""
    return all(example[f'Answer_{choice}_Rate'] is not None for choice in 'ABCD')


# Keep only the questions for which every answer has a recorded rate.
data = data.filter(_has_all_rates)
# data = data.filter(lambda x: x['Has_Content_Distractors'] == 2)

print("After filtering, dataset size:", len(data))

After filtering, dataset size: 777


In [19]:
def calc_correlation(type='pearson', dataset=None):
    """Print the correlation between each answer's rate and its number of assigned documents.

    For every answer A-D the '<answer>_Rate' values are correlated with the lengths of
    the '<answer>_Docs' lists. One line is printed with, per answer, the correlation
    and its p-value (both rounded to 4 decimals), tab-separated in A-D order.

    Args:
        type: 'pearson' or 'spearman'.
        dataset: Object indexable by column name (e.g. ``dataset['Answer_A_Rate']``).
            Defaults to the module-level ``data``.

    Raises:
        ValueError: if ``type`` is neither 'pearson' nor 'spearman'.
    """
    correlation_functions = {'pearson': pearsonr, 'spearman': spearmanr}
    if type not in correlation_functions:
        raise ValueError(f"Unknown correlation type {type!r}; expected 'pearson' or 'spearman'")
    correlate = correlation_functions[type]
    if dataset is None:
        dataset = data

    print(type.capitalize(), "correlation between distractor rates and document lengths(A-D, p-values in between):")
    correlations_with_docs_len = {}

    for choice_name in [f"Answer_{choice}" for choice in ['A', 'B', 'C', 'D']]:
        rates = dataset[f'{choice_name}_Rate']
        doc_lengths = [len(sentence_list) for sentence_list in dataset[f'{choice_name}_Docs']]

        # A single missing (None/NaN) rate turns the whole statistic into nan, so the
        # affected rows are skipped for this answer only.
        # NOTE(review): a constant rate or length column still yields nan — confirm
        # which of the two caused the nan previously reported for answer D.
        valid_pairs = [
            (rate, length) for rate, length in zip(rates, doc_lengths)
            if rate is not None and not np.isnan(rate)
        ]
        if len(valid_pairs) < 2:
            # Too few observations for a correlation; report nan instead of raising.
            correlation, p = float('nan'), float('nan')
        else:
            valid_rates, valid_lengths = zip(*valid_pairs)
            correlation, p = correlate(valid_rates, valid_lengths)

        correlations_with_docs_len[choice_name] = (round(correlation, 4), round(p, 4))

    correlations_string = "\t".join(
        [f"{str(correlation)} {str(p)}"
         for (correlation, p) in correlations_with_docs_len.values()]
    )
    print(correlations_string)

In [20]:
# Report the Pearson correlation first, then the Spearman correlation.
for correlation_type in ('pearson', 'spearman'):
    calc_correlation(correlation_type)

Pearson correlation between distractor rates and document lengths(A-D, p-values in between):
0.1311 0.0002	0.1456 0.0	0.0757 0.0349	nan nan
Spearman correlation between distractor rates and document lengths(A-D, p-values in between):
0.1283 0.0003	0.1269 0.0004	0.0971 0.0067	nan nan
