In [1]:
import os
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np
from scipy.stats import pearsonr, spearmanr

# Read the HPC account id once, closing the file afterwards, and strip surrounding
# whitespace so that a trailing newline in the token file does not end up inside the paths.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_file:
    hpc_account_id = account_file.read().strip()

# NOTE(review): HF_HOME is assigned after the Hugging Face libraries were imported above;
# those libraries may resolve their default cache location at import time, in which case
# this assignment would need to move before the imports to take effect - confirm.
os.environ['HF_HOME'] = '/scratch/' + hpc_account_id
cache_dir = '/scratch/' + hpc_account_id + '/cache'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment configuration: which dataset to load and which retrieval variant to analyse.
DATASET = "LeoZotos/immu_full"
WIKI = "Simple"  # 'En' or 'Simple'
SOURCE_TEXT = "Only_Options"  # 'Only_Options' or 'Question_With_Options'
NUM_DOCS_RETRIEVED = 20  # 20 40 or 60

# Name of the dataset column that holds the retrieved documents for the settings above.
RETRIEVED_DOCS_COL_NAME = f"Relevant_Docs_{WIKI}_{SOURCE_TEXT}_{NUM_DOCS_RETRIEVED}"

In [3]:
# Read the Hugging Face access token from the local tokens folder (whitespace stripped).
hf_api_key = ""
with open("../tokens/HF_TOKEN.txt", "r") as token_file:
    hf_api_key = token_file.read().strip()

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Load the sentence-embedding model. cache_folder points the download at the same scratch
# cache directory that load_dataset uses, instead of the library's default cache location.
sentence_bert_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    cache_folder=cache_dir,
)

In [5]:
# Load the 'train' split of the configured dataset, authenticating with the token read above.
data = load_dataset(
    DATASET,
    split='train',
    token=hf_api_key,
    cache_dir=cache_dir,
)

In [6]:
def print_similarities(similarities, sentences1, sentences2):
    """Print each sentence of ``sentences1`` followed by its score against every sentence of ``sentences2``.

    ``similarities`` is indexed as ``similarities[i][j]`` for the i-th entry of
    ``sentences1`` and the j-th entry of ``sentences2``. Each score line shows the
    second sentence left-aligned in a 30-character field and the score with four decimals.
    """
    for row_index, anchor in enumerate(sentences1):
        print(anchor)
        for col_index, candidate in enumerate(sentences2):
            score = similarities[row_index][col_index]
            print(f" - {candidate: <30}: {score:.4f}")


def classify_docs_per_distractor(row, sentence_bert_model, docs_col_name=None):
    """Assign every retrieved document of ``row`` to the answer option it is most similar to.

    Args:
        row: Mapping with the keys 'Answer_A' .. 'Answer_D' and the retrieved-documents
            column. Options whose value is empty ("" or None) are ignored.
        sentence_bert_model: Object providing ``encode(list_of_texts)`` and
            ``similarity(embeddings_a, embeddings_b)``; the similarity result is indexed
            as [choice, document].
        docs_col_name: Name of the column holding the list of retrieved documents.
            Defaults to the module-level RETRIEVED_DOCS_COL_NAME.

    Returns:
        dict mapping '<option key>_Docs' (e.g. 'Answer_A_Docs') to the list of documents
        assigned to that option. Only options that have text get a key. Ties go to the
        first option in A-D order.
    """
    if docs_col_name is None:
        docs_col_name = RETRIEVED_DOCS_COL_NAME

    # Only options that actually carry text take part in the comparison.
    choice_keys = [key for key in ('Answer_A', 'Answer_B', 'Answer_C', 'Answer_D') if row[key]]
    doc_keys = [key + '_Docs' for key in choice_keys]
    docs_per_choice = {doc_key: [] for doc_key in doc_keys}

    # Nothing to classify: avoid calling the model on empty input.
    docs = row[docs_col_name] or []
    if not choice_keys or not docs:
        return docs_per_choice

    embeddings_choices = sentence_bert_model.encode([row[key] for key in choice_keys])
    embeddings_docs = sentence_bert_model.encode(docs)
    similarities = sentence_bert_model.similarity(embeddings_choices, embeddings_docs)

    # For each document (column) pick the option (row) with the highest similarity.
    best_choice_per_doc = np.asarray(similarities).argmax(axis=0)
    for doc, choice_index in zip(docs, best_choice_per_doc):
        docs_per_choice[doc_keys[choice_index]].append(doc)

    return docs_per_choice

In [7]:
# Classify the retrieved documents of every row and store the result as one list column
# per answer option.
column_names = [f"Answer_{choice}_Docs" for choice in ['A', 'B', 'C', 'D']]
docs_by_choice = {name: [] for name in column_names}

for row in tqdm(data):
    docs_per_choice_for_row = classify_docs_per_distractor(row, sentence_bert_model)
    for name in column_names:
        # Options without a key in the classification result get an empty document list.
        docs_by_choice[name].append(docs_per_choice_for_row.get(name, []))

# Drop whichever target columns already exist so the cell can be re-run, including after
# an interrupted run that added only some of them.
stale_columns = [name for name in column_names if name in data.column_names]
if stale_columns:
    data = data.remove_columns(stale_columns)

for name, column_data in docs_by_choice.items():
    data = data.add_column(name, column_data)

  0%|          | 0/843 [00:00<?, ?it/s]

100%|██████████| 843/843 [00:12<00:00, 65.46it/s]


In [8]:
# Manual sanity check: show one question together with the documents assigned to each option.
id = 22
example = data[id]

labelled_docs = []
for letter in ['A', 'B', 'C', 'D']:
    labelled_docs.append(f"\n {letter}:")
    labelled_docs.append(example[f'Answer_{letter}_Docs'])

print(example['Question_With_Options'], ":", *labelled_docs)

M cells are crucial for the initiation of mucosal immunity. Where can these M cells be found?
A) In Peyer's patches
B) In GALT and in between intestinal epithelial cells
C) In parts of the entire ileum that have contact with mesenteric lymph nodes
D) In lymph nodes
 : 
 A: [] 
 B: ['Duodenal cancer The duodenum is the first part of the small intestine. It is located between the stomach and the jejunum. After foods combine with stomach acid, they go down into the duodenum where they mix with bile from the gall bladder and digestive juices from the pancreas.', 'Gastrointestinal disease Gastrointestinal diseases are about diseases that affect the gastrointestinal tract. They include the oesophagus, stomach, small intestine, large intestine, rectum, the liver, gallbladder, and pancreas.', 'Organ system Digestive system: digestion and processing food with salivary glands, esophagus, stomach, liver, gallbladder, pancreas, intestines, rectum and anus.', 'Intestine The intestines, divided into

## Calculate Correlation

In [9]:
# Keep only rows in which all four answer rates are present ...
rate_columns = [f'Answer_{choice}_Rate' for choice in ['A', 'B', 'C', 'D']]
data = data.filter(lambda example: all(example[column] is not None for column in rate_columns))
# ... and whose Has_Content_Distractors value equals 2.
data = data.filter(lambda example: example['Has_Content_Distractors'] == 2)

print("After filtering, dataset size:", len(data))

After filtering, dataset size: 338


In [None]:
def calc_correlation(type='pearson', dataset=None):
    """Print the correlation between each option's rate and its number of assigned documents.

    For every option A-D, the values of the '<option>_Rate' column are correlated with the
    number of documents in the matching '<option>_Docs' column (the "document lengths" in
    the printed header are these per-row document counts).

    Args:
        type: 'pearson' or 'spearman'.
        dataset: Object indexable by column name that returns the column values.
            Defaults to the module-level ``data``.

    Raises:
        ValueError: If ``type`` is not one of the supported correlation names.

    Output:
        A header line, then one tab-separated line with "<correlation> <p-value>" per
        option in A-D order, both rounded to four decimals.
    """
    correlation_functions = {'pearson': pearsonr, 'spearman': spearmanr}
    # Fail early with a clear message instead of a TypeError from rounding None later on.
    if type not in correlation_functions:
        raise ValueError(
            f"Unsupported correlation type {type!r}; expected one of {sorted(correlation_functions)}"
        )
    correlate = correlation_functions[type]

    if dataset is None:
        dataset = data

    print(type.capitalize(), "correlation between distractor rates and document lengths(A-D, p-values in between):")
    correlations_with_docs_len = {}

    for choice_name in [f"Answer_{choice}" for choice in ['A', 'B', 'C', 'D']]:
        rates = dataset[f'{choice_name}_Rate']
        doc_lengths = [len(sentence_list) for sentence_list in dataset[f'{choice_name}_Docs']]
        correlation, p = correlate(rates, doc_lengths)
        correlations_with_docs_len[choice_name] = (round(correlation, 4), round(p, 4))

    correlations_string = "\t".join(
        f"{str(correlation)} {str(p)}"
        for (correlation, p) in correlations_with_docs_len.values()
    )
    print(correlations_string)

In [19]:
# Report both correlation measures, Pearson first and then Spearman.
for correlation_type in ('pearson', 'spearman'):
    calc_correlation(correlation_type)

Pearson correlation between distractor rates and document lengths:
-0.0161 0.768	-0.0154 0.7775	-0.0062 0.9091	-0.0594 0.2761
Spearman correlation between distractor rates and document lengths:
-0.024 0.6597	-0.0185 0.7349	-0.0168 0.7589	-0.0019 0.9726
