In [2]:
from src import Icsr
from src.utils import get_matches

import datasets
import random
from copy import deepcopy

In [3]:
# Load the raw FAERS-PubMed dataset and pass its train split to get_matches.
dataset = datasets.load_dataset("FAERS-PubMed/raw_dataset")
train_split = dataset['train']
matches = get_matches(train_split)

# Sanity check: show how many matches were returned.
print(len(matches))

Using custom data configuration FAERS-PubMed--raw_dataset-0b83cc0b498dbbb2
Found cached dataset json (/Users/kldooste/.cache/huggingface/datasets/FAERS-PubMed___json/FAERS-PubMed--raw_dataset-0b83cc0b498dbbb2/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 63.66it/s]


65648


In [4]:
# For every article (match), parse all of its reports into ICSRs.
# `icsrs` holds one list per match; each entry is a (report_index, icsr) pair,
# where report_index is the position of the report within `m.reports`.
icsrs = []

for m in matches:
    new_icsrs = []
    for index, r in enumerate(m.reports):
        parsed = Icsr.from_report(r)
        # Keep only reports for which parsing produced a truthy result.
        if parsed:
            new_icsrs.append((index, parsed))
    icsrs.append(new_icsrs)

# Flat list of every (report_index, icsr) pair across all articles.
all_icsrs = []
for ls in icsrs:
    all_icsrs.extend(ls)

In [5]:
# For every article, sample one privileged report and put all the others in a list.
# Both result lists stay aligned with `icsrs`: an article without any parsed
# ICSR contributes `None` to `sampled_icsrs` and an empty list to `other_icsrs`.
random.seed(42)

sampled_icsrs = []
other_icsrs = []

for ls in icsrs:
    # random.choice is only called for non-empty lists, so the sequence of
    # random draws depends solely on the articles that have at least one ICSR.
    sampled = random.choice(ls) if ls else None
    # Drop the sampled entry by identity instead of deep-copying the list and
    # calling list.remove(): this avoids copying every ICSR and does not rely
    # on Icsr implementing value-based equality. The remaining entries keep
    # their original order.
    other = [entry for entry in ls if entry is not sampled]
    sampled_icsrs.append(sampled)
    other_icsrs.append(other)


In [6]:
# Summary counts. An entry of `sampled_icsrs` is truthy when the article had at
# least one parsed ICSR; an entry of `other_icsrs` is truthy (non-empty) when
# the article had more than one.
n_matches = len(matches)
n_with_icsr = sum(1 for entry in sampled_icsrs if entry)
n_with_several = sum(1 for rest in other_icsrs if rest)

print(f'number of total matches: \t\t{n_matches:,}')
print(f'number of articles with >=1 icsr: \t{n_with_icsr:,}')
print(f'number of articles with >1 icsr: \t{n_with_several:,}')

number of total matches: 		65,648
number of articles with >=1 icsr: 	51,212
number of articles with >1 icsr: 	27,377


In [7]:
# Validate each privileged ICSR against
#   1. a randomly chosen other ICSR from the same article, and
#   2. a randomly chosen ICSR drawn from all articles (baseline).
# Note: the baseline is drawn from every parsed ICSR, so it can occasionally
# pick an ICSR that belongs to the same article.
random.seed(42)

similar_scores = []
random_scores = []
for sampled, others in zip(sampled_icsrs, other_icsrs):
    # Articles with fewer than two ICSRs have nothing to compare against.
    if not others:
        continue

    sampled_icsr = sampled[1]

    # Same-article comparison (drawn first to keep the random sequence stable).
    sibling_icsr = random.choice(others)[1]
    similar_scores.append(sampled_icsr.score(sibling_icsr))

    # Baseline comparison against an ICSR picked from the flat list.
    baseline_icsr = random.choice(all_icsrs)[1]
    random_scores.append(sampled_icsr.score(baseline_icsr))
        

In [9]:
# aggregate scores across precision, recall and f1

def agg_scores(list, index):
    """Return the arithmetic mean of one component across all score entries.

    Args:
        list: iterable of indexable score entries (e.g. tuples). The name
            shadows the built-in ``list`` inside this function; it is kept
            because it is part of the existing signature.
        index: position of the component to average within each entry.

    Returns:
        The sum of the selected components divided by the number of entries.

    Raises:
        ZeroDivisionError: if ``list`` contains no entries.
    """
    column = [entry[index] for entry in list]
    total = sum(column)
    return total / len(column)

# Print the mean of each score component (indices 0, 1, 2), first for the
# random baseline and then for the same-article comparison, with an empty
# line separating the two groups.
for group_number, score_list in enumerate((random_scores, similar_scores)):
    if group_number:
        print('')
    for component in range(3):
        print(agg_scores(score_list, component))

0.2432435244969848
0.24388269245800143
0.2428201908688846

0.728383086816412
0.7286750223845676
0.7204383709761228
