In [1]:
from src import Icsr
from src.utils import get_matches

import datasets
import random
from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw BioDEX dataset from the Hugging Face hub (or its local cache)
# and turn the training split into `matches` via the project helper.
# Each match exposes a `.reports` collection that later cells iterate over.
dataset = datasets.load_dataset("BioDEX/raw_dataset")
train_split = dataset['train']
matches = get_matches(train_split)

# Quick sanity check on how many matches were built.
print(len(matches))

Using custom data configuration BioDEX--raw_dataset-0b83cc0b498dbbb2
Found cached dataset json (/Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-0b83cc0b498dbbb2/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 65.84it/s]


65648


### Simplest IAA
privileged ICSR vs. random ICSR

In [3]:
# Parse all reports of every article.
# `icsrs` gets one list per match (same order as `matches`); every entry is a
# (position of the report inside match.reports, parsed Icsr) pair.
# Reports whose parse result is falsy are left out.
icsrs = []

for match in matches:
    parsed = [
        (position, Icsr.from_report(report))
        for position, report in enumerate(match.reports)
    ]
    icsrs.append([pair for pair in parsed if pair[1]])

# Flat list of every kept (position, Icsr) pair across all articles.
all_icsrs = []
for article_icsrs in icsrs:
    all_icsrs.extend(article_icsrs)

In [4]:
# For every article, sample one privileged report and put all the others in a list.
# `sampled_icsrs` and `other_icsrs` stay aligned with `icsrs`: an article
# without any parsed ICSR contributes `None` and `[]` respectively.
random.seed(42)

sampled_icsrs = []
other_icsrs = []

for ls in icsrs:
    # Only draw when there is something to draw from, so the sequence of
    # random draws is the same as one `random.choice` per non-empty article.
    sampled = random.choice(ls) if ls else None

    # Keep the remaining entries by identity. Every entry of `ls` is a
    # distinct tuple object, so this drops exactly the sampled one while
    # preserving order. The previous deepcopy + `list.remove` copied every
    # parsed ICSR and only worked if an ICSR compares equal to its own deep
    # copy; this needs neither.
    other = [entry for entry in ls if entry is not sampled]

    sampled_icsrs.append(sampled)
    other_icsrs.append(other)


In [5]:
# Summary counts. An entry of `sampled_icsrs` is truthy when the article had at
# least one parsed ICSR; an entry of `other_icsrs` is non-empty when it had at
# least two.
print(f'number of total matches: \t\t{len(matches):,}')

articles_with_icsr = sum(1 for entry in sampled_icsrs if entry)
print(f'number of articles with >=1 icsr: \t{articles_with_icsr:,}')

articles_with_several = sum(1 for rest in other_icsrs if rest)
print(f'number of articles with >1 icsr: \t{articles_with_several:,}')

number of total matches: 		65,648
number of articles with >=1 icsr: 	51,212
number of articles with >1 icsr: 	27,377


In [6]:
# Score the privileged ICSR of each article twice:
#   1. against a randomly drawn other ICSR from the same article, and
#   2. against an ICSR drawn at random from the pool of all ICSRs (baseline).
# Articles that have no second ICSR are skipped.
# NOTE(review): the baseline draw is taken from `all_icsrs` without excluding
# the current article, so it can occasionally land on the same article or on
# the privileged ICSR itself.
random.seed(42)

similar_scores = []
random_scores = []
for privileged, alternatives in zip(sampled_icsrs, other_icsrs):
    if not alternatives:
        continue

    # Entries are (report position, Icsr) pairs; index 1 is the Icsr.
    same_article_icsr = random.choice(alternatives)[1]
    privileged_icsr = privileged[1]
    similar_scores.append(privileged_icsr.score(same_article_icsr))

    baseline_icsr = random.choice(all_icsrs)[1]
    random_scores.append(privileged_icsr.score(baseline_icsr))
        

In [8]:
# aggregate scores across precision, recall and f1

def agg_scores(list, index):
    """Average one component over a collection of score tuples.

    Args:
        list: iterable of indexable score entries (e.g. tuples).
        index: position of the component to average within each entry.

    Returns:
        The arithmetic mean of ``entry[index]`` over all entries.

    Raises:
        ZeroDivisionError: if ``list`` is empty.
    """
    column = [entry[index] for entry in list]
    count = len(column)
    return sum(column) / count

# Print the mean of each of the three score components, first for the random
# baseline and then for the same-article comparison, with a blank line after
# each group. Presumably the components are (precision, recall, f1), as the
# cell comment says - confirm against Icsr.score.
for score_list in (random_scores, similar_scores):
    for component in range(3):
        print(agg_scores(score_list, component))
    print('')

print(f'Calculated over {len(similar_scores):,} applicable examples.')

0.2432435244969848
0.24388269245800143
0.2428201908688846

0.728383086816412
0.7286750223845676
0.7204383709761228

Calculated over 27,377 applicable examples.


### Different IAA
privileged ICSR vs. random ICSR that is not from the same company

In [25]:
# for every article, parse all the reports
# also keep track of the company that submitted the icsr
def get_company(report):
    """Extract a company label from ``report.companynumb``.

    Returns the second ``'-'``-separated field of ``report.companynumb``,
    stripped of surrounding whitespace and title-cased. Returns ``None`` when
    ``companynumb`` is falsy or contains no ``'-'``.

    Presumably that second field carries the submitting company's name -
    confirm against the data.
    """
    number = report.companynumb
    if not number or '-' not in number:
        return None

    fields = number.split('-')
    return fields[1].strip().title()

# Rebuild `icsrs`, this time as (report position, company, Icsr) triples.
# A report is kept only when both the company label and the parsed Icsr are
# truthy, so reports without a recognisable company are dropped here.
icsrs = []

for match in matches:
    triples = [
        (position, get_company(report), Icsr.from_report(report))
        for position, report in enumerate(match.reports)
    ]
    icsrs.append([triple for triple in triples if triple[1] and triple[2]])

# Flat list of every kept triple across all articles.
all_icsrs = []
for article_icsrs in icsrs:
    all_icsrs.extend(article_icsrs)

In [27]:
# For every article, sample one privileged report and put all the others in a list.
# `sampled_icsrs` and `other_icsrs` stay aligned with `icsrs`: an article
# without any kept ICSR contributes `None` and `[]` respectively.
random.seed(42)

sampled_icsrs = []
other_icsrs = []

for ls in icsrs:
    # Only draw when there is something to draw from, so the sequence of
    # random draws is the same as one `random.choice` per non-empty article.
    sampled = random.choice(ls) if ls else None

    # Keep the remaining entries by identity. Every entry of `ls` is a
    # distinct tuple object, so this drops exactly the sampled one while
    # preserving order. The previous deepcopy + `list.remove` copied every
    # parsed ICSR and only worked if an ICSR compares equal to its own deep
    # copy; this needs neither.
    other = [entry for entry in ls if entry is not sampled]

    sampled_icsrs.append(sampled)
    other_icsrs.append(other)


In [31]:
# Summary counts. An entry of `sampled_icsrs` is truthy when the article had at
# least one kept ICSR; an entry of `other_icsrs` is non-empty when it had at
# least two.
print(f'number of total matches: \t\t{len(matches):,}')

articles_with_icsr = sum(1 for entry in sampled_icsrs if entry)
print(f'number of articles with >=1 icsr: \t{articles_with_icsr:,}')

articles_with_several = sum(1 for rest in other_icsrs if rest)
print(f'number of articles with >1 icsr: \t{articles_with_several:,}')

number of total matches: 		65,648
number of articles with >=1 icsr: 	48,708
number of articles with >1 icsr: 	25,675


In [38]:
# Score the privileged ICSR of each article against
#   1. a randomly drawn other ICSR from the same article, and
#   2. an ICSR drawn at random from the pool of all ICSRs (baseline),
# but only when the drawn same-article ICSR comes from a different company.
# Entries are (report position, company, Icsr) triples.
# NOTE(review): only the single drawn report is checked. An article is skipped
# when that draw shares the privileged company, even if another report from a
# different company exists for the same article.
random.seed(42)

similar_scores = []
random_scores = []
for privileged, alternatives in zip(sampled_icsrs, other_icsrs):
    if not alternatives:
        continue

    candidate = random.choice(alternatives)

    # only compare for different companies
    if privileged[1] != candidate[1]:
        privileged_icsr = privileged[2]
        similar_scores.append(privileged_icsr.score(candidate[2]))

        baseline_icsr = random.choice(all_icsrs)[2]
        random_scores.append(privileged_icsr.score(baseline_icsr))

print(f'Made {len(similar_scores)} comparison same article different company')
        

Made 19254 comparison same article different company


In [39]:
# Print the mean of each of the three score components, first for the random
# baseline and then for the same-article / different-company comparison, with
# a blank line after each group.
for score_list in (random_scores, similar_scores):
    for component in range(3):
        print(agg_scores(score_list, component))
    print('')

print(f'Calculated over {len(similar_scores):,} applicable examples.')

0.24530324070215034
0.24606646000919297
0.24492883153940762

0.7316943442414694
0.7321459690671169
0.7233746876594158

Calculated over 19,254 applicable examples.
