# Similarities between combinations -- Lyssna tests

## 1. Data import

In [1]:
import os
import pandas as pd

In [2]:
# Locate the input data relative to the kernel's current working directory.
# Note: despite its name, `script_path` is the working directory, not
# necessarily the folder this notebook lives in.
script_path = os.getcwd()
datapath = os.path.join(script_path, "data")

In [3]:
# Load the tab-separated results file; the first row holds the column names.
most_likely = pd.read_csv(
    os.path.join(datapath, 'most_likely.tsv'),
    sep='\t',
    header=0,
)
most_likely

Unnamed: 0.1,Unnamed: 0,round,test,search term,scope,domain,test type,selection,participants,selection time
0,0,1,1,influenza,very broad,Infectious Disease,M,c1,2.0,34.0
1,2,1,1,long covid,broad,Infectious Disease,M,c3,2.0,107.0
2,4,1,1,malaria therapeutics,very specific,Infectious Disease,M,c2,2.0,31.0
3,6,1,2,asthma,very broad,Allergy,M,c4,3.0,56.0
4,8,1,2,naegleria fowleri infection,very specific,Infectious Disease,M,c11,3.0,23.0
...,...,...,...,...,...,...,...,...,...,...
367,734,4,5,plasmacytoid dendritic cells,somewhat specific,Cell types,M,c1,0.0,
368,736,4,5,tuberculin skin test,somewhat specific,Experimental techniques,M,c15,0.0,
369,738,4,6,hiv,very broad,Infectious Disease,M,c14,1.0,56.0
370,740,4,6,influenza,very broad,Infectious Disease,M,c3,0.0,


In [4]:
# Distinct search terms, in order of first appearance.
pd.unique(most_likely['search term'])

array(['influenza', 'long covid', 'malaria therapeutics', 'asthma',
       'naegleria fowleri infection', 'zika microcephaly',
       'allergen skin prick test', 'allergy treatment',
       'sublingual immunotherapy', 'AIDS', 'immunotherapeutics',
       't-cell function', "addison's disease", 'cancer',
       'myocardial infarction', 'dendritic cells', 'mast cells',
       'rational cancer drug design', 'metabolomics', 'pinealocyte',
       'plasmacytoid dendritic cells', 'gwas', 'mycobacterium',
       'tuberculin skin test', 'hiv'], dtype=object)

## 2. Data wrangling

In [5]:
# Remove unneeded columns.
# The result is rebound instead of mutating the frame in place, and columns
# that are already absent are ignored, so re-running this cell on the same
# kernel does not raise a KeyError.
most_likely = most_likely.drop(
    columns=['Unnamed: 0', 'scope', 'participants', 'selection time'],
    errors='ignore',
)
most_likely

Unnamed: 0,round,test,search term,domain,test type,selection
0,1,1,influenza,Infectious Disease,M,c1
1,1,1,long covid,Infectious Disease,M,c3
2,1,1,malaria therapeutics,Infectious Disease,M,c2
3,1,2,asthma,Allergy,M,c4
4,1,2,naegleria fowleri infection,Infectious Disease,M,c11
...,...,...,...,...,...,...
367,4,5,plasmacytoid dendritic cells,Cell types,M,c1
368,4,5,tuberculin skin test,Experimental techniques,M,c15
369,4,6,hiv,Infectious Disease,M,c14
370,4,6,influenza,Infectious Disease,M,c3


## 3. Similarity computation using the Jaccard index

In [6]:
# For each combination (the 'selection' label), collect the set of distinct
# search terms that appear in its rows.
sets = (
    most_likely
    .groupby('selection')['search term']
    .apply(set)
)

# Jaccard similarity between sets
def jaccard_similarity(set1: set, set2: set) -> float:
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]: 0.0 when the sets share no element, 1.0 when they
        are equal. Two empty sets are treated as identical and yield 1.0.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: the ratio would be 0/0. By convention two empty
        # sets are considered identical instead of raising ZeroDivisionError.
        return 1.0
    intersection = len(set1.intersection(set2))
    return intersection / union

# Score every unordered pair of combinations exactly once. Walking only the
# labels that come after the current one yields each pair in index order
# without having to check whether the reversed pair was already stored.
labels = list(sets.index)
similarities = {
    (first, second): jaccard_similarity(sets[first], sets[second])
    for position, first in enumerate(labels)
    for second in labels[position + 1:]
}

# Report one line per pair.
print("\nJaccard similarities:")
for pair, similarity in similarities.items():
    combi1, combi2 = pair
    print(f"Similarity between set_{combi1} and set_{combi2}: {similarity}")


Jaccard similarities:
Similarity between set_c1 and set_c10: 0.92
Similarity between set_c1 and set_c11: 0.76
Similarity between set_c1 and set_c12: 0.72
Similarity between set_c1 and set_c13: 0.84
Similarity between set_c1 and set_c14: 0.92
Similarity between set_c1 and set_c15: 0.84
Similarity between set_c1 and set_c2: 0.88
Similarity between set_c1 and set_c3: 0.84
Similarity between set_c1 and set_c4: 0.92
Similarity between set_c1 and set_c5: 0.88
Similarity between set_c1 and set_c6: 0.52
Similarity between set_c1 and set_c7: 0.72
Similarity between set_c1 and set_c8: 0.84
Similarity between set_c1 and set_c9: 0.84
Similarity between set_c10 and set_c11: 0.75
Similarity between set_c10 and set_c12: 0.7083333333333334
Similarity between set_c10 and set_c13: 0.8333333333333334
Similarity between set_c10 and set_c14: 0.9166666666666666
Similarity between set_c10 and set_c15: 0.76
Similarity between set_c10 and set_c2: 0.8
Similarity between set_c10 and set_c3: 0.8333333333333334
S