In [1]:
import sys
import os
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from scipy.stats import pearsonr

os.environ['HF_HOME'] = '/scratch/' + str(open('../tokens/HPC_ACCOUNT_ID.txt', 'r').read())
cache_dir = '/scratch/' + str(open('../tokens/HPC_ACCOUNT_ID.txt', 'r').read()) + '/cache'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATASET = "LeoZotos/bio_full"

In [3]:
hf_api_key = ""
with open("../tokens/HF_TOKEN.txt", "r") as f:
    hf_api_key = f.read().strip()

In [4]:
data = load_dataset(DATASET, split='train', token=hf_api_key, cache_dir=cache_dir)

Generating train split: 100%|██████████| 778/778 [00:00<00:00, 12007.80 examples/s]


In [5]:
data = data.filter(lambda x: x['Answer_A_Rate'] is not None and x['Answer_B_Rate'] is not None and x['Answer_C_Rate'] is not None and x['Answer_D_Rate'] is not None)
# data = data.filter(lambda x: x['Has_Content_Distractors'] == 2)

print("After filtering, dataset size:", len(data))

Filter: 100%|██████████| 778/778 [00:00<00:00, 1418.62 examples/s]

After filtering, dataset size: 777





In [6]:
correlations_with_docs_len = {}

for choice_name in [f"Answer_{choice}" for choice in ['A', 'B', 'C', 'D']]:
    rates = data[f'{choice_name}_Rate']
    doc_lengths = [len(sentence_list) for sentence_list in data[f'{choice_name}_Docs']]
    
    correlation, p = pearsonr(rates, doc_lengths)
    
    correlations_with_docs_len[choice_name] = (correlation, p)
    print(f"Correlation between {choice_name} rate and document length: {correlation:.3f}, p-value: {p:.3f}")

Correlation between Answer_A rate and document length: 0.077, p-value: 0.032
Correlation between Answer_B rate and document length: 0.111, p-value: 0.002
Correlation between Answer_C rate and document length: 0.062, p-value: 0.084
Correlation between Answer_D rate and document length: nan, p-value: nan
