## Similarity Matrix

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from rdkit.Chem import rdFingerprintGenerator

# Base name of the compound table (without extension); later cells reuse it for cache file names.
filename = "compounds_ms2structures"
path = os.path.join("data", "datasets", f"{filename}.csv")

# Load the compound table and preview its first rows.
compounds = pd.read_csv(path)
compounds.head()

Unnamed: 0,inchikey,smiles,mass,cf_class,cf_subclass,cf_superclass,formula,npc_class_results,npc_pathway_results,npc_superclass_results
0,AAAQFGUYHFJNHI,CCNC(=O)C[C@H]1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N...,423.146204,Benzodiazepines,"1,4-benzodiazepines",Organoheterocyclic compounds,C22H22ClN5O2,,Alkaloids,
1,AABFWJDLCCDJJN,COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54,324.126264,Harmala alkaloids,,Alkaloids and derivatives,C22H16N2O,Carboline alkaloids,Alkaloids,Tryptophan alkaloids
2,AABILZKQMVKFHP,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,427.220624,,,Alkaloids and derivatives,C21H33NO8,Pyrrolizidine alkaloids,Alkaloids,Ornithine alkaloids
3,AABUHSBGEIUSRJ,CC(=O)NC1=CC=C(C=C1)NC(=O)C=CC2=CC=CC=C2,280.120724,Cinnamic acids and derivatives,Cinnamic acid amides,Phenylpropanoids and polyketides,C17H16N2O2,Cinnamic acid amides,Shikimates and Phenylpropanoids,Phenylpropanoids (C6-C3)
4,AABUKWVVUWBZCS,C1=CC=C(C=C1)C2=C(C(=O)OC3=C2C=CC(=C3)O)C4=CC=...,314.094724,Neoflavonoids,Neoflavones,Phenylpropanoids and polyketides,C21H14O3,Neoflavonoids,Shikimates and Phenylpropanoids,Flavonoids


In [2]:
# Column overview of the compound table: dtypes and non-null counts per column.
compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37811 entries, 0 to 37810
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   inchikey                37811 non-null  object 
 1   smiles                  37811 non-null  object 
 2   mass                    37811 non-null  float64
 3   cf_class                37685 non-null  object 
 4   cf_subclass             32652 non-null  object 
 5   cf_superclass           37810 non-null  object 
 6   formula                 37804 non-null  object 
 7   npc_class_results       24322 non-null  object 
 8   npc_pathway_results     33661 non-null  object 
 9   npc_superclass_results  25913 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.9+ MB


In [3]:
# Spot check: show the SMILES string stored for the compound at index 12.
compounds.smiles[12]

'CCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCCCCCC)O'

In [5]:
# code taken from https://github.com/florian-huber/molecular_fingerprint_comparisons

import numba
from numba import prange
import numpy as np
from fingerprint_computation import FingerprintGenerator, compute_fingerprints_from_smiles

@numba.njit
def ruzicka_similarity(A, B):
    """
    Calculate the Ruzicka similarity between two count vectors.

    The score is the sum of the element-wise minima divided by the sum of the
    element-wise maxima of the two vectors.

    Parameters:
    A (array-like): First count vector.
    B (array-like): Second count vector (same length as A).

    Returns:
    float: Ruzicka similarity. Returns 0.0 when the sum of maxima is zero
        (e.g. both vectors contain only zeros), where the ratio is undefined.
    """

    min_sum = np.sum(np.minimum(A, B))
    max_sum = np.sum(np.maximum(A, B))

    # Guard the 0/0 case explicitly: without it a pair of all-zero vectors
    # would trigger a division by zero inside compiled (and parallel) code.
    if max_sum == 0:
        return 0.0

    return min_sum / max_sum


@numba.jit(nopython=True, fastmath=True, parallel=True)
def ruzicka_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the all-vs-all Ruzicka similarity between two sets of vectors.

    Parameters
    ----------
    references
        2D numpy array of reference vectors; row ``references[i, :]`` is vector i.
    queries
        2D numpy array of query vectors; row ``queries[j, :]`` is vector j.
        Must have the same number of columns as ``references``.

    Returns
    -------
    scores
        2D array of shape ``(len(references), len(queries))`` where
        ``scores[i, j]`` is the Ruzicka similarity between ``references[i, :]``
        and ``queries[j, :]``.
    """
    assert references.shape[1] == queries.shape[1], "Vector sizes do not match!"

    n_references, n_queries = references.shape[0], queries.shape[0]
    scores = np.zeros((n_references, n_queries))
    # The outer loop is the parallel one; each iteration fills one row of the result.
    for ref_idx in prange(n_references):
        reference = references[ref_idx, :]
        for query_idx in range(n_queries):
            scores[ref_idx, query_idx] = ruzicka_similarity(reference, queries[query_idx, :])
    return scores


def compute_similarity_matrix(compounds, sim_matrix_file, morgan_radius=9, fpSize=4096):
    """Compute, store and memory-map the all-vs-all Ruzicka similarity matrix.

    Morgan count fingerprints are generated from ``compounds.smiles``, compared
    all-vs-all with ``ruzicka_similarity_matrix``, written to ``sim_matrix_file``
    as float32 and re-opened read-only as a memory map.

    Parameters:
        compounds: Object exposing a ``smiles`` attribute with the SMILES strings.
        sim_matrix_file (str): Path of the ``.npy`` file to write and re-load.
        morgan_radius (int): Radius passed to the Morgan fingerprint generator.
        fpSize (int): Fingerprint size passed to the Morgan fingerprint generator.

    Returns:
        The stored similarity matrix, loaded with ``mmap_mode='r'``.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=morgan_radius, fpSize=fpSize)
    count_fingerprints = compute_fingerprints_from_smiles(
        compounds.smiles,
        generator,
        count=True,
        sparse=False,
        progress_bar=True,
    )
    # Compare the fingerprint set against itself.
    scores = ruzicka_similarity_matrix(count_fingerprints, count_fingerprints)
    # Stored as float32; the original author notes this file is big (~5GB).
    np.save(sim_matrix_file, scores.astype(np.float32))
    return np.load(sim_matrix_file, mmap_mode='r')

In [6]:
morgan_radius=9
fpSize=2048

fingerprints_file = os.path.join("data", "group_similarity", f"{filename}_fingerprints_morgan{morgan_radius}_{fpSize}bits.npy")
try:
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')
    print(f"File {fingerprints_file} found. Loading fingerprints.")
except FileNotFoundError:
    print(f"File {fingerprints_file} not found. Running the fingerprint generation.")
    # fingerprint generation
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=morgan_radius, fpSize=fpSize)
    fingerprints = compute_fingerprints_from_smiles(compounds.smiles, fpgen, count=True, sparse=False, progress_bar=True)
    np.save(fingerprints_file, fingerprints.astype(np.float32))
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')

# sim matrix generation
sim_matrix_file = os.path.join("data", "group_similarity", f"{filename}_ruzicka_similarities_morgan{morgan_radius}_{fpSize}bits.npy")
try:
    sim_matrix = np.load(sim_matrix_file, mmap_mode ='r')
    print(f"File {sim_matrix_file} found. Loading similarity matrix.")
except FileNotFoundError:
    print(f"File {sim_matrix_file} not found. Running the fingerprint & Similarity computation.")
    %time
    sim_matrix = compute_similarity_matrix(compounds, sim_matrix_file)

print(fingerprints.shape, sim_matrix.shape)

File data\group_similarity\compounds_ms2structures_fingerprints_morgan9_2048bits.npy found. Loading fingerprints.
File data\group_similarity\compounds_ms2structures_ruzicka_similarities_morgan9_2048bits.npy found. Loading similarity matrix.
(37811, 2048) (37811, 37811)


## Queries with similar groups

#### randomly select 30 analogues, if there are any

In [7]:
# Number of rows in the similarity matrix (one row per compound).
len(sim_matrix)

37811

In [10]:
# Preview the first ten rows of the memory-mapped similarity matrix.
sim_matrix[:10]

memmap([[1.        , 0.14537445, 0.06024097, ..., 0.14432989, 0.09448819,
         0.08896797],
        [0.14537445, 1.        , 0.03361345, ..., 0.18604651, 0.07438017,
         0.04727273],
        [0.06024097, 0.03361345, 1.        , ..., 0.04      , 0.10460251,
         0.13618676],
        ...,
        [0.05625   , 0.02731092, 0.11286682, ..., 0.025     , 0.06736842,
         0.11924686],
        [0.06586827, 0.0738255 , 0.01863354, ..., 0.07964602, 0.05952381,
         0.02487562],
        [0.03233831, 0.02319588, 0.06933333, ..., 0.01126761, 0.04010025,
         0.07263923]], dtype=float32)

In [None]:
import pandas as pd
import random

def select_analogue_groups(similarity_matrix, group_size=30, sim_range=(0.8, 0.9999), overlap=False, seed=42, print_mean_similarity=False):
    """
    Select one group of similar compounds for every row of a similarity matrix.

    Rows are visited in order. For each row (query), all columns whose similarity
    lies inside ``sim_range`` (both bounds inclusive) are candidates. If at least
    ``group_size`` candidates remain, ``group_size`` of them are drawn at random
    and stored as the analogue group of that query; otherwise the row is skipped.

    Parameters:
        similarity_matrix (np.ndarray): 2D array; row i holds the similarities of compound i to all compounds.
        group_size (int): Number of analogues to draw per query.
        sim_range (tuple): (lower, upper) inclusive bounds of the accepted similarity.
        overlap (bool): If False, the query and its drawn analogues are excluded from the
            candidate lists of all later queries. A compound that was already drawn as an
            analogue can still become a query itself. If True, nothing is excluded.
        seed (int): Seed for Python's global ``random`` module (set via ``random.seed``).
        print_mean_similarity (bool): If True, prints the mean similarity of the candidates of each accepted query.

    Returns:
        pd.DataFrame: Indexed by ``query_id`` with one column ``analogue_ids`` holding the
        list of selected analogue indices (plain Python ints) for each query.
    """
    random.seed(seed)
    lower, upper = sim_range
    # A set gives constant-time membership tests; a list made the filtering quadratic.
    used_indices = set()
    query_ids = []
    analogue_groups = []
    for i in tqdm(range(len(similarity_matrix)), desc="Selecting analogue groups"):
        # Read the (possibly memory-mapped) row once instead of once per comparison.
        row = similarity_matrix[i]
        in_range = np.flatnonzero((row >= lower) & (row <= upper))
        # Drop already used ids. Plain ints keep the CSV representation of the lists
        # independent of the NumPy scalar repr (e.g. "np.int64(5)").
        similar_indices = [int(idx) for idx in in_range if int(idx) not in used_indices]
        # check if group size is large enough
        if len(similar_indices) < group_size:
            continue

        if print_mean_similarity:
            mean_similarity = np.mean(row[similar_indices])
            print(f"Index {i}: Found {len(similar_indices)} similar compounds with mean similarity {mean_similarity:.3f}. Picking {group_size} random matches.")

        random_matches = random.sample(similar_indices, group_size)
        query_ids.append(i)
        analogue_groups.append(random_matches)
        if not overlap:
            used_indices.add(i)
            used_indices.update(random_matches)

    # Build the frame once instead of growing it row by row.
    analogue_df = pd.DataFrame({"query_id": query_ids, "analogue_ids": analogue_groups})
    return analogue_df.set_index("query_id")


def select_analogue_groups_num_queries(similarity_matrix, num_queries = 10, group_size=30, sim_range=(0.8, 0.9999), no_overlap=True, seed=42, print_mean_similarity=False):
    """
    Select analogue groups for randomly drawn query compounds.

    Up to ``num_queries`` distinct query rows are drawn at random. For each query,
    all columns whose similarity lies inside ``sim_range`` (both bounds inclusive)
    are candidates. If at least ``group_size`` candidates remain, ``group_size`` of
    them are drawn at random and stored; otherwise the query yields no group. The
    result can therefore contain fewer than ``num_queries`` rows.

    Parameters:
        similarity_matrix (np.ndarray): 2D array; row i holds the similarities of compound i to all compounds.
        num_queries (int): Number of random queries to try. Drawing stops early once every row has been tried.
        group_size (int): Number of analogues to draw per query.
        sim_range (tuple): (lower, upper) inclusive bounds of the accepted similarity.
        no_overlap (bool): If True, an accepted query and its drawn analogues are excluded
            from the candidate lists of all later queries.
        seed (int): Seed for Python's global ``random`` module (set via ``random.seed``).
        print_mean_similarity (bool): If True, prints the mean similarity of the candidates of each accepted query.

    Returns:
        pd.DataFrame: Indexed by ``query_id`` with one column ``analogue_ids`` holding the
        list of selected analogue indices (plain Python ints) for each query.
    """
    random.seed(seed)
    n_compounds = len(similarity_matrix)
    lower, upper = sim_range
    # Sets give constant-time membership tests for both bookkeeping collections.
    used_indices = set()
    used_queries = set()
    query_ids = []
    analogue_groups = []
    query_index = random.randint(0, n_compounds - 1)
    for _ in tqdm(range(num_queries), desc="Selecting analogue groups"):
        # Every row has been tried already: stop instead of looping forever
        # in the redraw below when num_queries exceeds the number of rows.
        if len(used_queries) >= n_compounds:
            break
        # Redraw until a query index is found that has not been tried yet.
        while query_index in used_queries:
            query_index = random.randint(0, n_compounds - 1)
        # Read the (possibly memory-mapped) row once.
        row = similarity_matrix[query_index]
        in_range = np.flatnonzero((row >= lower) & (row <= upper))
        # Drop already used ids. Plain ints keep the CSV representation of the lists
        # independent of the NumPy scalar repr (e.g. "np.int64(5)").
        similar_indices = [int(idx) for idx in in_range if int(idx) not in used_indices]
        # check if group size is large enough
        if len(similar_indices) >= group_size:
            if print_mean_similarity:
                mean_similarity = np.mean(row[similar_indices])
                print(f"Index {query_index}: Found {len(similar_indices)} similar compounds with mean similarity {mean_similarity:.3f}. Picking {group_size} random matches.")

            random_matches = random.sample(similar_indices, group_size)
            query_ids.append(query_index)
            analogue_groups.append(random_matches)
            if no_overlap:
                used_indices.add(query_index)
                used_indices.update(random_matches)
        used_queries.add(query_index)

    # Build the frame once instead of growing it row by row.
    analogue_df = pd.DataFrame({"query_id": query_ids, "analogue_ids": analogue_groups})
    return analogue_df.set_index("query_id")


def str_to_list_of_ints(series):
    """Parse the string form of a list of integers back into a list of ints.

    Used to restore list-valued cells after a CSV round trip, e.g.
    ``"[949, 28712, 10719]"`` -> ``[949, 28712, 10719]``.

    Besides plain digits, entries written with a NumPy scalar repr such as
    ``np.int64(949)`` are accepted. Without this, such entries were silently
    dropped and the whole list came back empty. Entries that are not
    non-negative integers are skipped.

    Parameters:
        series (str): String representation of the list (one CSV cell).

    Returns:
        list[int]: The parsed integers in their original order.
    """
    import re  # local import keeps this helper self-contained

    # Optional "np.<type>(" wrapper, the digits, optional closing parenthesis.
    entry_pattern = re.compile(r"(?:np\.\w+\()?\s*(\d+)\s*\)?")
    values = []
    for entry in series.strip("[]").split(","):
        match = entry_pattern.fullmatch(entry.strip())
        if match:
            values.append(int(match.group(1)))
    return values

def get_analogue_groups_file(group_size, sim_range, overlap, seed, morgan_radius, fpSize):
    """Load cached analogue groups from CSV, or select and cache them.

    The cache file name is built from the notebook-level ``filename`` and all
    arguments. ``morgan_radius`` and ``fpSize`` are only used for that name;
    the selection itself runs on the notebook-level ``sim_matrix``.

    Returns:
        pd.DataFrame: Analogue groups indexed by query id, with the
        ``analogue_ids`` column holding lists of ints.
    """
    cache_name = f"{filename}_analogue_group_size{group_size}_range{sim_range}_overlap{int(overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits.csv"
    df_file = os.path.join("data", "group_similarity", cache_name)
    try:
        groups = pd.read_csv(df_file, index_col=0)
    except FileNotFoundError:
        print(f"File {df_file} not found. Running the analogue group selection.")
        # No cache yet: run the selection over the full similarity matrix and store it.
        groups = select_analogue_groups(
            sim_matrix,
            group_size=group_size,
            sim_range=sim_range,
            overlap=overlap,
            seed=seed,
        )
        groups.to_csv(df_file)
        print(f"Analogue groups saved to {df_file}.")
    else:
        # CSV stores each id list as text; turn it back into a list of ints.
        groups["analogue_ids"] = groups["analogue_ids"].apply(str_to_list_of_ints)
        print(f"File {df_file} found. Loading analogue groups.")
    return groups

def get_analogue_groups_num_queries_file(num_queries, group_size, sim_range, no_overlap, seed, morgan_radius, fpSize):
    """Load cached random-query analogue groups from CSV, or select and cache them.

    The cache file name is built from the notebook-level ``filename`` and all
    arguments. ``morgan_radius`` and ``fpSize`` are only used for that name;
    the selection itself runs on the notebook-level ``sim_matrix``.

    NOTE(review): the file label is written as ``overlap{int(no_overlap)}``, so
    ``no_overlap=True`` produces "overlap1" — confirm this labelling is intended.

    Returns:
        pd.DataFrame: Analogue groups indexed by query id, with the
        ``analogue_ids`` column holding lists of ints.
    """
    cache_name = f"{filename}_analogue_groups_num_queries{num_queries}_size{group_size}_range{sim_range}_overlap{int(no_overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits.csv"
    df_file = os.path.join("data", "group_similarity", cache_name)
    try:
        groups = pd.read_csv(df_file, index_col=0)
    except FileNotFoundError:
        print(f"File {df_file} not found. Running the analogue group selection.")
        # No cache yet: draw random queries from the similarity matrix and store the result.
        groups = select_analogue_groups_num_queries(
            sim_matrix,
            num_queries=num_queries,
            group_size=group_size,
            sim_range=sim_range,
            no_overlap=no_overlap,
            seed=seed,
        )
        groups.to_csv(df_file)
        print(f"Analogue groups saved to {df_file}.")
    else:
        # CSV stores each id list as text; turn it back into a list of ints.
        groups["analogue_ids"] = groups["analogue_ids"].apply(str_to_list_of_ints)
        print(f"File {df_file} found. Loading analogue groups.")
    return groups

#### benchmark

In [16]:
from rdkit import DataStructs
from rdkit.Chem.Draw import SimilarityMaps
from ms_chemical_space_explorer.similarity_maps import generate_mol, get_similarity_map_weights, FingerprintFunction

def off_diagonal_mean(matrix):
    """Return the mean of all entries of a square matrix that are not on its main diagonal."""
    size = matrix.shape[0]
    # Start from an all-True selection and switch the diagonal off.
    keep = np.ones((size, size), dtype=bool)
    np.fill_diagonal(keep, False)
    return matrix[keep].mean()

def get_benchmark_df(analogue_df, group_size, sim_range, overlap, seed, morgan_radius, fpSize):
    """Build the cache path (without extension) for the full-selection benchmark and delegate to get_benchmark_df_file.

    All arguments except ``analogue_df`` go into the file name; ``morgan_radius``
    and ``fpSize`` are also forwarded to the benchmark computation.
    """
    file_stem = f"{filename}_benchmark_group_size{group_size}_range{sim_range}_overlap{int(overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits"
    df_file = os.path.join("data", "group_similarity", file_stem)
    return get_benchmark_df_file(df_file, analogue_df, morgan_radius, fpSize)

def get_benchmark_df_num_queries(analogue_df, num_queries, group_size, sim_range, overlap, seed, morgan_radius, fpSize):
    """Build the cache path (without extension) for the random-query benchmark and delegate to get_benchmark_df_file.

    All arguments except ``analogue_df`` go into the file name; ``morgan_radius``
    and ``fpSize`` are also forwarded to the benchmark computation.
    """
    file_stem = f"{filename}_benchmark_num_queries{num_queries}_size{group_size}_range{sim_range}_overlap{int(overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits"
    df_file = os.path.join("data", "group_similarity", file_stem)
    return get_benchmark_df_file(df_file, analogue_df, morgan_radius, fpSize)
    
def _get_cached_mol(mol_cache, compound_id):
    """Return the molecule for ``compound_id``, generating it at most once per cache dict."""
    if compound_id not in mol_cache:
        mol_cache[compound_id] = generate_mol(compounds.smiles[compound_id])
    return mol_cache[compound_id]


def get_benchmark_df_file(df_file, analogue_df, morgan_radius, fpSize):
    """Load the benchmark table from ``df_file + ".csv"`` or compute and cache it.

    If the CSV exists it is loaded and returned; ``analogue_df`` is ignored in that
    case. Only a missing file triggers the computation. Any other load error
    (e.g. a malformed CSV) propagates instead of being reported as "not found".

    When computing, four columns are added per query (row of ``analogue_df``):
        stacked_query_sim: mean over analogues of ``1 - 0.5 * mean(|stacked - query weights|)``.
        fp_query_sim: mean of ``sim_matrix[query][analogue]`` over the analogues.
        stacked_group_sim: mean over analogues of the mean standardized stacked atomic weight.
        fp_group_sim: off-diagonal mean of the all-vs-all Ruzicka similarity of the analogue fingerprints.

    Relies on the notebook-level ``compounds``, ``sim_matrix`` and ``fingerprints``.

    Parameters:
        df_file (str): Cache path without the ".csv" extension.
        analogue_df (pd.DataFrame): Indexed by query id, with an ``analogue_ids`` column of
            id lists. It is not modified; the results are written to a copy.
        morgan_radius (int): Stored on the fingerprint function used for the similarity maps.
        fpSize (int): Stored on the fingerprint function as ``nbits``.

    Returns:
        pd.DataFrame: The benchmark table (``analogue_ids`` plus the four metric columns).
    """
    try:
        cached_df = pd.read_csv(df_file + ".csv", index_col=0)
    except FileNotFoundError:
        print(f"File {df_file}.csv not found. Running the benchmark dataframe generation.")
    else:
        cached_df["analogue_ids"] = cached_df["analogue_ids"].apply(str_to_list_of_ints)
        print(f"File {df_file}.csv found. Loading benchmark dataframe.")
        return cached_df

    # Work on a copy so the caller's frame does not silently gain result columns.
    analogue_df = analogue_df.copy()

    # generate benchmark dataframe
    fp_function = FingerprintFunction()
    fp_function.fp_type = "count"
    fp_function.morgan_radius = morgan_radius
    fp_function.nbits = fpSize

    for query_id, row in tqdm(analogue_df.iterrows(), desc="Processing queries", total=len(analogue_df)):
        analogue_ids = [int(x) for x in row["analogue_ids"]]
        query = generate_mol(compounds.smiles[query_id])

        mol_cache = {}  # molecules of this group, generated once and reused
        fp_query_sims = []
        stacked_mean_weights = []
        query_scaled_inverted_differences = []
        for ref_id in tqdm(analogue_ids, desc=f"Processing analogues for query {query_id}", leave=False):
            # gather ruzicka similarity of analogue vs query
            fp_query_sims.append(sim_matrix[query_id][ref_id])

            # generate the reference molecule
            ref_analogue = _get_cached_mol(mol_cache, ref_id)
            n_atoms = ref_analogue.GetNumAtoms()

            # one accumulated weight per atom of the reference molecule
            stacked_atomic_weights = [0] * n_atoms

            # compute the group similarity map weights against every other analogue
            for probe_id in analogue_ids:
                if ref_id == probe_id:
                    continue

                probe_analogue = _get_cached_mol(mol_cache, probe_id)

                # atomic similarity weights for the reference molecule
                # (marked as the bottleneck by the original author)
                ref_sim_weights = SimilarityMaps.GetAtomicWeightsForFingerprint(probe_analogue, ref_analogue, fp_function.get_function, metric=DataStructs.TanimotoSimilarity)

                # stack the weights
                stacked_atomic_weights = [stacked_atomic_weights[i] + ref_sim_weights[i] for i in range(n_atoms)]
            # Standardize the stacked weights
            stacked_atomic_weights, _ = SimilarityMaps.GetStandardizedWeights(stacked_atomic_weights)
            stacked_mean_weights.append(sum(stacked_atomic_weights) / len(stacked_atomic_weights))

            # compute the similarity map weights against the query molecule
            # NOTE(review): the weights are compared index by index with the stacked weights of
            # ref_analogue, so this call must return one weight per atom of ref_analogue —
            # confirm the argument order expected by get_similarity_map_weights.
            query_atomic_weights = get_similarity_map_weights(ref_analogue, query, fp_function)

            # calculate absolute weight difference for each atom against the query
            query_abs_differences = [abs(stacked_atomic_weights[i] - query_atomic_weights[i]) for i in range(len(stacked_atomic_weights))]
            query_mean_difference = sum(query_abs_differences) / len(query_abs_differences)

            # scale and invert the mean absolute difference to get a similarity score
            # (the factor 0.5 presumably assumes both weight sets lie in [-1, 1] — TODO confirm)
            query_scaled_inverted_differences.append(1 - (query_mean_difference * 0.5))

        # --- gather all similarity metrics ---
        # scaled and inverted mean of the means of the absolute differences between atomic stacked weights and atomic query weights
        stacked_query_sim = sum(query_scaled_inverted_differences) / len(query_scaled_inverted_differences)
        analogue_df.loc[query_id, "stacked_query_sim"] = stacked_query_sim

        # mean ruzicka similarity of analogues vs query
        analogue_df.loc[query_id, "fp_query_sim"] = np.mean(fp_query_sims)

        # mean of the means of stacked atomic weights of all-vs-all analogue similarity maps
        analogue_df.loc[query_id, "stacked_group_sim"] = np.mean(stacked_mean_weights)

        # mean of all-vs-all analogue ruzicka similarities
        fp_group_sim_matrix = ruzicka_similarity_matrix(fingerprints[analogue_ids], fingerprints[analogue_ids])
        fp_group_sim = off_diagonal_mean(fp_group_sim_matrix)
        analogue_df.loc[query_id, "fp_group_sim"] = fp_group_sim

    analogue_df.to_csv(df_file + ".csv")
    print(f"Benchmark dataframe saved to {df_file}.csv.")
    return analogue_df

In [None]:
# Selection settings for the random-query analogue groups.
num_queries = 5
group_size = 10
sim_range = (0.1, 0.2)
no_overlap = True
seed = 42

# Load the groups from cache or select them; morgan_radius and fpSize come from the similarity-matrix cell.
analogue_df = get_analogue_groups_num_queries_file(
    num_queries=num_queries,
    group_size=group_size,
    sim_range=sim_range,
    no_overlap=no_overlap,
    seed=seed,
    morgan_radius=morgan_radius,
    fpSize=fpSize,
)
len(analogue_df)

File data\group_similarity\compounds_ms2structures_analogue_groups_num_queries5_size10_range(0.1, 0.2)_overlap0_seed42_morgan9_2048bits.csv found. Loading analogue groups.


5

In [None]:
# Compute (or load from cache) the benchmark metrics for the analogue groups selected above.
# NOTE(review): `no_overlap` is passed into the parameter named `overlap`, which only feeds the
# cache file label ("overlap{int(overlap)}"); the label therefore reflects no_overlap — confirm.
analogue_df = get_benchmark_df_num_queries(analogue_df, num_queries, group_size, sim_range, no_overlap, seed, morgan_radius, fpSize)
analogue_df

File data\group_similarity\compounds_ms2structures_benchmark_num_queries5_size10_range(0.1, 0.2)_overlap0_seed42_morgan9_2048bits.csv not found. Running the benchmark dataframe generation.


Processing queries:   0%|          | 0/5 [00:00<?, ?it/s]

Processing analogues for query 7296:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35741:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 1739:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27696:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 22541:   0%|          | 0/10 [00:00<?, ?it/s]

Benchmark dataframe saved to data\group_similarity\compounds_ms2structures_benchmark_num_queries5_size10_range(0.1, 0.2)_overlap0_seed42_morgan9_2048bits.csv.


Unnamed: 0_level_0,analogue_ids,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7296,"[949, 28712, 10719, 9501, 8662, 5451, 28539, 4...",0.916738,0.143449,-0.475518,0.124242
35741,"[5171, 35009, 25048, 1892, 1772, 5545, 13056, ...",0.918728,0.147083,-0.483647,0.12102
1739,"[35236, 12609, 34267, 26375, 13915, 28290, 369...",0.894462,0.122567,-0.376407,0.132602
27696,"[24828, 20393, 11470, 15950, 24562, 7540, 6856...",0.923891,0.107677,-0.53909,0.101309
22541,"[25049, 11185, 33483, 1838, 30321, 19181, 2228...",0.898628,0.133528,-0.522693,0.123729


In [None]:
# Load the benchmark table cached under the "benchmark_group_size..." file name.
# NOTE(review): `analogue_df` passed in here is the result of the previous benchmark cell
# (random-query selection). It is only used if the cache file is missing, in which case those
# groups would be benchmarked and saved under the "group_size" name — confirm this is intended.
# NOTE(review): `no_overlap` is passed into the parameter named `overlap` (file label only).
analogue_df = get_benchmark_df(analogue_df, group_size, sim_range, no_overlap, seed, morgan_radius, fpSize)
analogue_df

File data\group_similarity\compounds_ms2structures_benchmark_group_size10_range(0.1, 0.2)_overlap0_seed42_morgan9_2048bits.csv found. Loading benchmark dataframe.


Unnamed: 0_level_0,analogue_ids,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7296,"[949, 28712, 10719, 9501, 8662, 5451, 28539, 4...",0.916318,0.143449,-0.481109,0.124242
35741,"[5171, 35009, 25048, 1892, 1772, 5545, 13056, ...",0.918868,0.147083,-0.487147,0.121020
1739,"[35236, 12609, 34267, 26375, 13915, 28290, 369...",0.895109,0.122567,-0.377150,0.132602
27696,"[24828, 20393, 11470, 15950, 24562, 7540, 6856...",0.924902,0.107677,-0.544009,0.101309
22541,"[25049, 11185, 33483, 1838, 30321, 19181, 2228...",0.897688,0.133528,-0.528409,0.123729
...,...,...,...,...,...
896,"[28530, 20903, 4618, 11504, 5703, 23534, 5925,...",0.899586,0.109331,-0.514133,0.117840
19127,"[26003, 35842, 14049, 21341, 24705, 24163, 124...",0.859299,0.116730,-0.514481,0.120495
25138,"[9251, 27398, 23276, 33814, 6311, 3300, 12851,...",0.916937,0.131669,-0.315510,0.092830
22274,"[36702, 23699, 12639, 119, 13357, 33913, 14057...",0.935929,0.126454,-0.400113,0.145392


In [173]:
# Summary statistics of the numeric benchmark columns.
analogue_df.describe()

Unnamed: 0,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
count,100.0,100.0,100.0,100.0
mean,0.90126,0.127612,-0.441571,0.123923
std,0.039615,0.012183,0.105939,0.030249
min,0.759342,0.105874,-0.568626,0.088095
25%,0.885988,0.118745,-0.512669,0.106245
50%,0.911265,0.126157,-0.474373,0.11669
75%,0.928313,0.134953,-0.407153,0.127596
max,0.950685,0.163228,-0.056489,0.241263


In [None]:
# Summary statistics of the numeric benchmark columns.
# NOTE(review): the output stored with this cell (count 118, fp_query_sim mean ≈ 0.88) does not
# match sim_range=(0.1, 0.2) set above; presumably it stems from a run with a different
# similarity range — re-run after Restart & Run All to refresh.
analogue_df.describe()

Unnamed: 0,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
count,118.0,118.0,118.0,118.0
mean,0.971136,0.877939,0.452163,0.856637
std,0.009882,0.017742,0.048879,0.020949
min,0.937513,0.842886,0.381714,0.792859
25%,0.965328,0.864058,0.407028,0.844243
50%,0.973211,0.879951,0.454163,0.858812
75%,0.978411,0.892015,0.480395,0.872634
max,0.98507,0.909774,0.565702,0.904048
