### Similarity and other stats

In [2]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple

In [3]:
# Read the tab-separated attribute file as two columns (name, text), keep every
# value as a plain object, and drop rows that have a missing value in either column.
df = pd.read_csv(
    '../preprocessed/attribute.seqs.tsv',
    sep='\t',
    names=['name', 'text'],
    dtype=object,
    encoding='utf-8',
).dropna()

In [4]:
# One row per attribute name, with the number of non-null text entries it has.
attributes = (
    df.groupby(by=["name"])
    .count()
    .reset_index()
    .rename(columns={"text": "num_entries"})
)
# n entries can form n * (n - 1) / 2 unordered pairs.
attributes['max_pairs'] = (
    (attributes['num_entries'] * (attributes['num_entries'] - 1)) / 2
).astype(int)
attributes.head(3)

Unnamed: 0,name,num_entries,max_pairs
0,bio_material_sam,9295,43193865
1,biological_material_preprocessing_sam,17,136
2,collection_label_sam,1477,1090026


In [5]:
# Plain Python list of the attribute names found in the input file.
classes = attributes['name'].tolist()

In [6]:
def get_overlap_score(x, y):
    """Return ``2 * sum(x * y) / (sum(x) + sum(y))``.

    When ``sum(x) + sum(y)`` is zero the ratio is undefined and the integer 0
    is returned instead. For 0/1 indicator arrays the ratio is the Dice
    coefficient of the two selections.
    """
    denominator = np.sum(x) + np.sum(y)
    if denominator != 0:
        return 2 * np.sum(x * y) / denominator
    return 0

In [7]:
# Holder for the four score matrices of one attribute; the field order matches
# the order of the templates in `locations`.
Xyz = namedtuple('Xyz', 'jac rou bow llm')

# One path template per Xyz field; the '%(a)s' placeholder takes the attribute name.
# NOTE(review): the first two templates are the same jaccard file, so the `rou`
# field is loaded from the jaccard scores as well. Presumably the second entry
# should point at a rouge score file -- confirm the intended filename.
locations = [
    '../scores/%(a)s.jaccard.scores.npy',
    '../scores/%(a)s.jaccard.scores.npy',
    '../results/fasttext-similarity-matrix/fasttext-%(a)s.similarity-score-matrix.npy',
    '../results/sbert-similarity-matrics/sbert-%(a)s.similarity-score-matrix.npy',
]

def get_matrices(name):
    """Load the score matrices stored for attribute ``name``.

    Every template in ``locations`` is filled in with ``name``. If any of the
    resulting paths does not exist, nothing is loaded and ``None`` is returned.
    Otherwise the files are read with ``np.load`` and returned as an ``Xyz``
    in template order.
    """
    paths = [template % {'a': name} for template in locations]
    if not all(os.path.exists(path) for path in paths):
        return None
    return Xyz(*(np.load(path) for path in paths))

#### Overlap between predicted similarity scores, Jaccard and ROUGE-1

In [8]:
def _strict_lower_hits(scores, cutoff):
    """Return a 0/1 int array marking entries strictly below the diagonal that exceed ``cutoff``."""
    scores = np.asarray(scores)
    # Mask the positions instead of zeroing the values: comparing a zeroed
    # upper triangle against a negative cutoff would count it as hits.
    below_diagonal = np.tri(*scores.shape[-2:], k=-1, dtype=bool)
    return ((scores > cutoff) & below_diagonal).astype(int)


def get_overlaps(names, threshold: float = 0.75, rj_threshold: float = 0.75):
    """Compare which entry pairs each scoring method marks as similar.

    For every attribute in ``names`` the matrices returned by ``get_matrices``
    are thresholded into 0/1 indicators, looking only at entries strictly
    below the diagonal so that each unordered pair is counted once and
    self-pairs are ignored:

    * ``xl``: ``llm`` scores above ``threshold``
    * ``xb``: ``bow`` scores above ``threshold``
    * ``bk``: ``jac + rou`` above ``rj_threshold``

    Attributes for which ``get_matrices`` returns ``None`` are reported with a
    ``skipping <name>`` line and left out of the result.

    Parameters
    ----------
    names : iterable of str
        Attribute names to process.
    threshold : float
        Cutoff applied to the ``bow`` and ``llm`` matrices.
    rj_threshold : float
        Cutoff applied to the sum of the ``jac`` and ``rou`` matrices.

    Returns
    -------
    pandas.DataFrame
        One row per processed attribute with columns ``name``, ``llm_sim``,
        ``bow_sim``, ``bg_sim`` (hit counts for ``xl``, ``xb``, ``bk``) and
        ``bow_over_llm``, ``llm_over_bg``, ``bow_over_bg`` (the
        ``get_overlap_score`` of the corresponding indicator pair). Rows
        containing NaN are dropped.
    """
    _scores = []

    for name in names:
        mm = get_matrices(name)
        if mm is None:
            print(f'skipping {name}')
            continue

        # All three indicators use the same strict-lower-triangle restriction,
        # so the counts and overlaps are comparable whether the stored
        # matrices are triangular or full symmetric.
        bk = _strict_lower_hits(mm.jac + mm.rou, rj_threshold)
        xb = _strict_lower_hits(mm.bow, threshold)
        xl = _strict_lower_hits(mm.llm, threshold)

        _scores.append([name,
                        np.sum(xl),
                        np.sum(xb),
                        np.sum(bk),
                        get_overlap_score(xb, xl),
                        get_overlap_score(bk, xl),
                        get_overlap_score(bk, xb),
                       ])

    return pd.DataFrame(_scores, columns=['name', 'llm_sim', 'bow_sim', 'bg_sim', 'bow_over_llm', 'llm_over_bg', 'bow_over_bg']).dropna()

#### Threshold = 0.85 (model similarity scores; the Jaccard + ROUGE-1 threshold stays at its 0.75 default)

In [9]:
%%time
# Only `threshold` is raised to 0.85 here; `rj_threshold` keeps its default of 0.75.
relation_scores = get_overlaps(attributes['name'].tolist(), threshold=0.85)

skipping bio_material_sam
skipping description_sam
skipping isolate_run
skipping misc_param_sam
skipping sample_comment_sam
CPU times: user 1.65 s, sys: 984 ms, total: 2.63 s
Wall time: 2.63 s


In [10]:
# Default inner join on the attribute name: attributes without a row in
# relation_scores (the skipped ones) do not appear in the result.
stats = pd.merge(attributes, relation_scores, on='name')

In [11]:
# Share of all possible entry pairs that the `llm` scores mark as similar.
stats['sim_ratio'] = stats['llm_sim'] / stats['max_pairs']
# Sort once and keep the result. The original cell also ran an identical
# sort_values whose return value was discarded and never displayed.
stats_sorted = stats.sort_values(by='llm_sim', ascending=False)
# Styler.hide() with no arguments hides the index in the rendered table.
stats_sorted.style.hide()

name,num_entries,max_pairs,llm_sim,bow_sim,bg_sim,bow_over_llm,llm_over_bg,bow_over_bg,sim_ratio
subsrc_note_sam,1685,1418770,889962,1052016,1050739,0.916551,0.917141,0.999381,0.627277
time_of_collection_sam,3350,5609575,583124,260230,3661,0.102209,0.008432,0.026011,0.103952
descriptor_sam,1373,941878,383391,605394,813595,0.771782,0.64029,0.853275,0.40705
specimen_sam,3483,6063903,288310,50,135,6.9e-05,0.000118,0.4,0.047545
description_exp,4133,8538778,181690,16914,8188,0.052366,0.041753,0.592542,0.021278
date_of_birth_sam,2073,2147628,164227,1,3,1.2e-05,2.4e-05,0.5,0.076469
sampling_date_run,489,119316,76247,66,12,0.0,0.0,0.0,0.639034
details_sam,1313,861328,35956,18819,15696,0.646134,0.58294,0.864783,0.041745
derived_from_sam,471,110685,25399,35453,33348,0.258036,0.267146,0.967777,0.229471
collection_label_sam,1477,1090026,20026,25,3,0.001596,0.0001,0.142857,0.018372


In [12]:
# Write the sorted table without the index column; floats are rounded to two decimals.
stats_sorted.to_csv(
    'stats.csv',
    index=False,
    float_format="%.2f",
)