## Similarity Matrix

In [150]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from rdkit.Chem import rdFingerprintGenerator

# Base name of the compound table; it is reused further down to name cached files.
filename = "compounds_ms2structures"
path = os.path.join("data", "datasets", f"{filename}.csv")
compounds = pd.read_csv(path)
compounds.head()

Unnamed: 0,inchikey,smiles,mass,cf_class,cf_subclass,cf_superclass,formula,npc_class_results,npc_pathway_results,npc_superclass_results
0,AAAQFGUYHFJNHI,CCNC(=O)C[C@H]1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N...,423.146204,Benzodiazepines,"1,4-benzodiazepines",Organoheterocyclic compounds,C22H22ClN5O2,,Alkaloids,
1,AABFWJDLCCDJJN,COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54,324.126264,Harmala alkaloids,,Alkaloids and derivatives,C22H16N2O,Carboline alkaloids,Alkaloids,Tryptophan alkaloids
2,AABILZKQMVKFHP,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,427.220624,,,Alkaloids and derivatives,C21H33NO8,Pyrrolizidine alkaloids,Alkaloids,Ornithine alkaloids
3,AABUHSBGEIUSRJ,CC(=O)NC1=CC=C(C=C1)NC(=O)C=CC2=CC=CC=C2,280.120724,Cinnamic acids and derivatives,Cinnamic acid amides,Phenylpropanoids and polyketides,C17H16N2O2,Cinnamic acid amides,Shikimates and Phenylpropanoids,Phenylpropanoids (C6-C3)
4,AABUKWVVUWBZCS,C1=CC=C(C=C1)C2=C(C(=O)OC3=C2C=CC(=C3)O)C4=CC=...,314.094724,Neoflavonoids,Neoflavones,Phenylpropanoids and polyketides,C21H14O3,Neoflavonoids,Shikimates and Phenylpropanoids,Flavonoids


In [151]:
compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37811 entries, 0 to 37810
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   inchikey                37811 non-null  object 
 1   smiles                  37811 non-null  object 
 2   mass                    37811 non-null  float64
 3   cf_class                37685 non-null  object 
 4   cf_subclass             32652 non-null  object 
 5   cf_superclass           37810 non-null  object 
 6   formula                 37804 non-null  object 
 7   npc_class_results       24322 non-null  object 
 8   npc_pathway_results     33661 non-null  object 
 9   npc_superclass_results  25913 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.9+ MB


In [152]:
# Peek at the SMILES string stored for the compound at index 12.
compounds.smiles[12]

'CCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCCCCCC)O'

In [153]:
# Hard-coded mapping from a compound index to a list of 15 other compound indices.
# NOTE(review): presumably query index -> analogue indices from an earlier run; confirm.
id_dict = {
    12: [36118, 4964, 1846, 15575, 13371, 31051, 6191, 21350,
         3566, 16668, 26415, 33912, 25711, 9389, 35444],
    809: [2119, 5949, 20237, 20602, 23223, 25548, 1578, 25294,
          7248, 27739, 25745, 28949, 32941, 35992, 9181],
    1578: [34429, 13876, 2274, 28197, 37231, 3216, 28047, 6317,
           5671, 4893, 33154, 2936, 11227, 35699, 36397],
    3113: [26266, 14563, 25409, 34396, 19814, 14897, 37106, 3193,
           18801, 19710, 3971, 16334, 29081, 28459, 12810],
    7630: [34874, 30652, 29799, 18830, 34437, 11144, 3761, 1790,
           30883, 27423, 7320, 25558, 36876, 14336, 23933],
}

In [154]:
# code taken from https://github.com/florian-huber/molecular_fingerprint_comparisons

import numba
from numba import prange
import numpy as np
from fingerprint_computation import FingerprintGenerator, compute_fingerprints_from_smiles

@numba.njit
def ruzicka_similarity(A, B):
    """
    Ruzicka similarity of two count vectors.

    The score is the sum of the element-wise minima of ``A`` and ``B``
    divided by the sum of their element-wise maxima.

    Parameters:
    A (array-like): First count vector.
    B (array-like): Second count vector.

    Returns:
    float: Ruzicka similarity.
    """
    shared_counts = np.minimum(A, B).sum()
    total_counts = np.maximum(A, B).sum()
    return shared_counts / total_counts


@numba.jit(nopython=True, fastmath=True, parallel=True)
def ruzicka_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the Ruzicka similarity of every reference vector against every query vector.

    Parameters
    ----------
    references
        2D numpy array of reference vectors; row ``i`` (``references[i, :]``) is vector_i.
    queries
        2D numpy array of query vectors; row ``j`` (``queries[j, :]``) is vector_j.
        Must have the same number of columns as ``references``.

    Returns
    -------
    scores
        2D array of shape ``(len(references), len(queries))`` where ``scores[i, j]``
        is the similarity between ``references[i, :]`` and ``queries[j, :]``.
    """
    assert references.shape[1] == queries.shape[1], "Vector sizes do not match!"

    n_references = references.shape[0]
    n_queries = queries.shape[0]
    scores = np.zeros((n_references, n_queries))
    # The outer loop is the parallel one (prange); each iteration fills one row of the result.
    for ref_idx in prange(n_references):
        for query_idx in range(n_queries):
            scores[ref_idx, query_idx] = ruzicka_similarity(
                references[ref_idx, :], queries[query_idx, :]
            )
    return scores


def compute_similarity_matrix(compounds, sim_matrix_file, morgan_radius=9, fpSize=4096):
    """Compute the all-vs-all Ruzicka similarity matrix, save it and return it memory-mapped.

    Parameters:
        compounds: Object exposing a ``smiles`` attribute that is passed to
            ``compute_fingerprints_from_smiles``.
        sim_matrix_file: Target path handed to ``np.save`` and read back with ``np.load``.
        morgan_radius (int): Radius for the Morgan fingerprint generator.
        fpSize (int): Fingerprint size for the Morgan fingerprint generator.

    Returns:
        The saved float32 matrix, re-opened read-only with ``mmap_mode="r"``.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=morgan_radius, fpSize=fpSize)
    count_fingerprints = compute_fingerprints_from_smiles(
        compounds.smiles, generator, count=True, sparse=False, progress_bar=True
    )
    similarities = ruzicka_similarity_matrix(count_fingerprints, count_fingerprints)
    # Stored as float32; the original author notes the file is big (~5GB).
    np.save(sim_matrix_file, similarities.astype(np.float32))
    return np.load(sim_matrix_file, mmap_mode="r")

In [155]:
morgan_radius=9
fpSize=2048

fingerprints_file = os.path.join("data", "group_similarity", f"{filename}_fingerprints_morgan{morgan_radius}_{fpSize}bits.npy")
try:
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')
    print(f"File {fingerprints_file} found. Loading fingerprints.")
except FileNotFoundError:
    print(f"File {fingerprints_file} not found. Running the fingerprint generation.")
    # fingerprint generation
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=morgan_radius, fpSize=fpSize)
    fingerprints = compute_fingerprints_from_smiles(compounds.smiles, fpgen, count=True, sparse=False, progress_bar=True)
    np.save(fingerprints_file, fingerprints.astype(np.float32))
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')

# sim matrix generation
sim_matrix_file = os.path.join("data", "group_similarity", f"{filename}_ruzicka_similarities_morgan{morgan_radius}_{fpSize}bits.npy")
try:
    sim_matrix = np.load(sim_matrix_file, mmap_mode ='r')
    print(f"File {sim_matrix_file} found. Loading similarity matrix.")
except FileNotFoundError:
    print(f"File {sim_matrix_file} not found. Running the fingerprint & Similarity computation.")
    %time
    sim_matrix = compute_similarity_matrix(compounds, sim_matrix_file)

print(fingerprints.shape, sim_matrix.shape)

File data\group_similarity\compounds_ms2structures_fingerprints_morgan9_2048bits.npy found. Loading fingerprints.
File data\group_similarity\compounds_ms2structures_ruzicka_similarities_morgan9_2048bits.npy found. Loading similarity matrix.
(37811, 2048) (37811, 37811)


## Queries with similar groups

#### randomly select 30 analogues, if there are any

In [156]:
# Number of rows in the similarity matrix.
len(sim_matrix)

37811

In [157]:
row_index = 0
sim_range = (0.2, 0.9999)
# Indices of all entries in the chosen row whose similarity lies inside sim_range (inclusive).
row_similarities = sim_matrix[row_index]
in_range = (row_similarities >= sim_range[0]) & (row_similarities <= sim_range[1])
similar_indices = np.flatnonzero(in_range)
len(similar_indices)

92

In [158]:
# Map every in-range index to its similarity value in row `row_index`.
similarity_dict = dict(zip(similar_indices, sim_matrix[row_index][similar_indices]))

In [159]:
# Show the first ten rows of the similarity matrix.
sim_matrix[:10]

memmap([[1.        , 0.14537445, 0.06024097, ..., 0.14432989, 0.09448819,
         0.08896797],
        [0.14537445, 1.        , 0.03361345, ..., 0.18604651, 0.07438017,
         0.04727273],
        [0.06024097, 0.03361345, 1.        , ..., 0.04      , 0.10460251,
         0.13618676],
        ...,
        [0.05625   , 0.02731092, 0.11286682, ..., 0.025     , 0.06736842,
         0.11924686],
        [0.06586827, 0.0738255 , 0.01863354, ..., 0.07964602, 0.05952381,
         0.02487562],
        [0.03233831, 0.02319588, 0.06933333, ..., 0.01126761, 0.04010025,
         0.07263923]], dtype=float32)

#### check if similarities are computed correctly

In [None]:
# How many entries of row 10000 have a similarity of at least 0.7?
int(np.count_nonzero(sim_matrix[10000] >= 0.7))

1

In [None]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# Independent cross-check with RDKit: build count fingerprints with the same settings
# that were used for the cached similarity matrix, then compare row 0 against all compounds.
# Using morgan_radius / fpSize instead of a hard-coded radius and the library-default size
# keeps this check valid when the notebook settings change.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=morgan_radius, fpSize=fpSize)
all_fps = compounds.smiles.apply(lambda x: fpgen.GetCountFingerprint(Chem.MolFromSmiles(x))).tolist()
row_sims = Chem.DataStructs.BulkTanimotoSimilarity(all_fps[0], all_fps)
row_sims[:10]

[1.0,
 0.14537444933920704,
 0.060240963855421686,
 0.1291866028708134,
 0.13145539906103287,
 0.08205128205128205,
 0.12448132780082988,
 0.05625,
 0.0658682634730539,
 0.03233830845771144]

In [None]:
# Row 0 of the cached matrix should agree with the RDKit similarities computed above.
np.allclose(sim_matrix[0], row_sims)

True

In [None]:
# Repeat the RDKit cross-check for the first 100 compounds; one boolean per row.
all_close = []
n_rows_to_check = len(compounds[:100])
for i in tqdm(range(n_rows_to_check)):
    mol = Chem.MolFromSmiles(compounds.smiles[i])
    if mol is None:
        # Unparseable SMILES count as a failed check.
        print(f"Invalid SMILES at index {i}: {compounds.smiles[i]}")
        all_close.append(False)
        continue
    fp = fpgen.GetCountFingerprint(mol)
    row_sims = Chem.DataStructs.BulkTanimotoSimilarity(fp, all_fps)
    all_close.append(np.allclose(sim_matrix[i], row_sims))

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Number of checked rows that matched the RDKit reference.
np.array(all_close).sum()

100

#### benchmark

In [160]:
import pandas as pd
import random

def select_analogue_groups(similarity_matrix, group_size=30, sim_range=(0.8, 0.9999), overlap=False, seed=42, print_mean_similarity=False):
    """
    Select a random group of analogues for every compound that has enough of them.

    Rows of the similarity matrix are visited in order. For row ``i`` all column indices
    whose similarity lies inside ``sim_range`` (both bounds inclusive) are candidates.
    If at least ``group_size`` candidates remain after filtering, ``group_size`` of them
    are drawn at random and ``i`` becomes a query.

    Parameters:
        similarity_matrix (np.ndarray): 2D array; ``similarity_matrix[i][j]`` is the similarity
            between compound ``i`` and compound ``j``.
        group_size (int): Number of analogues to draw for each query.
        sim_range (tuple): ``(lower, upper)`` bounds of the accepted similarity range.
        overlap (bool): If False, every query and its drawn analogues are remembered and
            removed from the candidate lists of all later rows. If True, nothing is excluded.
            Note: only candidate lists are filtered; a compound already drawn as an analogue
            can still become a query itself later on.
        seed (int): Seed of the random number generator used for drawing.
        print_mean_similarity (bool): If True, print the candidate count and mean candidate
            similarity for every query.

    Returns:
        analogue_df (pd.DataFrame): Indexed by ``query_id`` with one column ``analogue_ids``
            holding a list of plain Python ints per query.
    """
    # A dedicated generator yields the same draws as random.seed(seed) + random.sample
    # without resetting the global random state as a side effect.
    rng = random.Random(seed)
    # Set instead of list: membership tests are O(1) and this runs once per candidate.
    used_indices = set()
    query_ids = []
    analogue_groups = []
    for i in tqdm(range(len(similarity_matrix)), desc="Selecting analogue groups"):
        row = similarity_matrix[i]
        # .tolist() turns NumPy integers into Python ints, so the lists survive the
        # CSV round-trip via str() regardless of how NumPy prints its scalars.
        candidates = np.where((row >= sim_range[0]) & (row <= sim_range[1]))[0].tolist()
        # remove already used ids from the candidates
        candidates = [idx for idx in candidates if idx not in used_indices]
        # check if group size is large enough
        if len(candidates) < group_size:
            continue

        if print_mean_similarity:
            mean_similarity = np.mean(row[candidates])
            print(f"Index {i}: Found {len(candidates)} similar compounds with mean similarity {mean_similarity:.3f}. Picking {group_size} random matches.")

        random_matches = rng.sample(candidates, group_size)
        query_ids.append(i)
        analogue_groups.append(random_matches)
        if not overlap:
            used_indices.add(i)
            used_indices.update(random_matches)

    # Build the frame once instead of growing it row by row.
    analogue_df = pd.DataFrame({"query_id": query_ids, "analogue_ids": analogue_groups})
    return analogue_df.set_index("query_id")


def str_to_list_of_ints(series):
    """Parse a string such as ``"[1, 2, 3]"`` into a list of ints.

    Surrounding square brackets are stripped, the rest is split on commas and
    every piece that consists only of digits (ignoring surrounding whitespace)
    is converted; all other pieces are silently dropped.
    """
    parsed = []
    for token in series.strip("[]").split(","):
        if token.strip().isdigit():
            parsed.append(int(token))
    return parsed

def get_analogue_groups_file(group_size, sim_range, overlap, seed, morgan_radius, fpSize):
    """Load the cached analogue groups for this parameter set, or compute and cache them.

    The CSV path is derived from the module-level ``filename`` and all arguments. If the
    file cannot be found, ``select_analogue_groups`` is run on the module-level
    ``sim_matrix`` and the result is written to that path.

    Returns:
        DataFrame of analogue groups; when loaded from CSV the ``analogue_ids`` column is
        converted back from strings to lists of ints.
    """
    csv_name = f"{filename}_analogue_group_size{group_size}_range{sim_range}_overlap{int(overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits.csv"
    df_file = os.path.join("data", "group_similarity", csv_name)
    try:
        groups = pd.read_csv(df_file, index_col=0)
    except FileNotFoundError:
        print(f"File {df_file} not found. Running the analogue group selection.")
        # analogue group selection
        groups = select_analogue_groups(
            sim_matrix,
            group_size=group_size,
            sim_range=sim_range,
            overlap=overlap,
            seed=seed,
        )
        groups.to_csv(df_file)
        print(f"Analogue groups saved to {df_file}.")
    else:
        # Lists are stored as their string form in the CSV; turn them back into lists.
        groups["analogue_ids"] = groups["analogue_ids"].apply(str_to_list_of_ints)
        print(f"File {df_file} found. Loading analogue groups.")
    return groups

In [None]:
from rdkit.Chem.Draw import SimilarityMaps
from ms_chemical_space_explorer.similarity_maps import generate_mol, get_similarity_map_weights, FingerprintFunction

def off_diagonal_mean(matrix):
    """Return the mean of all entries of ``matrix`` that are not on the main diagonal."""
    size = matrix.shape[0]
    # Start from an all-True mask and switch the diagonal off.
    keep = np.ones((size, size), dtype=bool)
    np.fill_diagonal(keep, False)
    return matrix[keep].mean()

def get_benchmark_df(analogue_df, compounds, group_size, sim_range, overlap, seed, morgan_radius, fpSize):
    """Load the cached benchmark table for this parameter set, or compute and cache it.

    For every query in ``analogue_df`` four scores are added as columns:

    * ``stacked_query_sim``: mean over analogues of ``1 - 0.5 * mean(|stacked weight - query weight|)``.
    * ``fp_query_sim``: mean of the ``sim_matrix`` entries between the query and its analogues.
    * ``stacked_group_sim``: mean over analogues of their mean standardized stacked atomic weight.
    * ``fp_group_sim``: off-diagonal mean of the all-vs-all Ruzicka similarities of the analogues.

    Relies on the module-level ``filename``, ``sim_matrix`` and ``fingerprints``.

    Parameters:
        analogue_df (pd.DataFrame): Indexed by query id, with an ``analogue_ids`` column of id lists.
            Only used when no cached file exists; it is not modified.
        compounds: Table whose ``smiles`` column is indexed by compound id.
        group_size, sim_range, overlap, seed: Only used to build the cache file name.
        morgan_radius (int): Morgan radius for the similarity-map fingerprint function (also in the file name).
        fpSize (int): Number of bits for the similarity-map fingerprint function (also in the file name).

    Returns:
        pd.DataFrame: The benchmark table, either loaded from CSV or freshly computed and saved.
    """
    df_file = os.path.join("data", "group_similarity", f"{filename}_benchmark_group_size{group_size}_range{sim_range}_overlap{int(overlap)}_seed{seed}_morgan{morgan_radius}_{fpSize}bits")
    try:
        cached_df = pd.read_csv(df_file + ".csv", index_col=0)
    except FileNotFoundError:
        # Only a missing file triggers regeneration; any other error (e.g. a corrupt cache)
        # is raised instead of being reported as "not found".
        print(f"File {df_file}.csv not found. Running the benchmark dataframe generation.")
    else:
        cached_df["analogue_ids"] = cached_df["analogue_ids"].apply(str_to_list_of_ints)
        print(f"File {df_file}.csv found. Loading benchmark dataframe.")
        return cached_df

    # generate benchmark dataframe on a copy so the caller's frame stays untouched
    benchmark_df = analogue_df.copy()

    fp_function = FingerprintFunction()
    fp_function.fp_type = "count"
    fp_function.morgan_radius = morgan_radius
    fp_function.nbits = fpSize

    for query_id, row in tqdm(analogue_df.iterrows(), desc="Processing queries", total=len(analogue_df)):
        analogue_ids = [int(x) for x in row["analogue_ids"]]
        query = generate_mol(compounds.smiles[query_id])

        # molecules generated for this query's group, keyed by compound id
        analogues = {}
        fp_query_sims = []
        stacked_mean_weights = []
        query_scaled_inverted_differences = []
        for ref_id in tqdm(analogue_ids, desc=f"Processing analogues for query {query_id}", leave=False):
            # gather ruzicka similarity of analogue vs query
            fp_query_sims.append(sim_matrix[query_id][ref_id])

            # generate the reference molecule (once per group)
            if ref_id not in analogues:
                analogues[ref_id] = generate_mol(compounds.smiles[ref_id])
            ref_analogue = analogues[ref_id]
            n_atoms = ref_analogue.GetNumAtoms()

            # create array for stacked weights
            stacked_atomic_weights = [0] * n_atoms

            # compute the group similarity map weights against every other analogue
            for probe_id in analogue_ids:
                if ref_id == probe_id:
                    continue

                if probe_id not in analogues:
                    analogues[probe_id] = generate_mol(compounds.smiles[probe_id])
                probe_analogue = analogues[probe_id]

                # extract similarity weights for the reference molecule
                # (the original author marks this call as the bottleneck)
                ref_sim_weights = SimilarityMaps.GetAtomicWeightsForFingerprint(probe_analogue, ref_analogue, fp_function.get_function)

                # stack the weights
                stacked_atomic_weights = [stacked_atomic_weights[i] + ref_sim_weights[i] for i in range(n_atoms)]
            # Standardize the stacked weights
            stacked_atomic_weights, _ = SimilarityMaps.GetStandardizedWeights(stacked_atomic_weights)
            stacked_mean_weights.append(sum(stacked_atomic_weights) / len(stacked_atomic_weights))

            # compute the similarity map weights against the query molecule
            query_atomic_weights = get_similarity_map_weights(ref_analogue, query, fp_function)

            # calculate absolute weight difference for each atom against the query
            query_abs_differences = [abs(stacked_atomic_weights[i] - query_atomic_weights[i]) for i in range(len(stacked_atomic_weights))]
            query_mean_difference = sum(query_abs_differences) / len(query_abs_differences)

            # scale and invert the mean absolute difference to get a similarity score
            # NOTE(review): the 0.5 presumably assumes both weight sets lie in [-1, 1]; confirm.
            query_scaled_inverted_differences.append(1 - (query_mean_difference * 0.5))

        # --- gather all similarity metrics ---
        # scaled and inverted mean of the means of the absolute differences between atomic stacked weights and atomic query weights
        stacked_query_sim = sum(query_scaled_inverted_differences) / len(query_scaled_inverted_differences)
        benchmark_df.loc[query_id, "stacked_query_sim"] = stacked_query_sim

        # mean ruzicka similarity of analogue vs query
        benchmark_df.loc[query_id, "fp_query_sim"] = np.mean(fp_query_sims)

        # mean of the means of stacked atomic weights of all-vs-all analogue similarity maps
        benchmark_df.loc[query_id, "stacked_group_sim"] = np.mean(stacked_mean_weights)

        # mean of all-vs-all analogue ruzicka similarities
        fp_group_sim_matrix = ruzicka_similarity_matrix(fingerprints[analogue_ids], fingerprints[analogue_ids])
        benchmark_df.loc[query_id, "fp_group_sim"] = off_diagonal_mean(fp_group_sim_matrix)

    benchmark_df.to_csv(df_file + ".csv")
    print(f"Benchmark dataframe saved to {df_file}.csv.")
    return benchmark_df

In [None]:
# Settings for the analogue-group selection; they also determine the cache file names.
group_size = 8
sim_range = (0.5, 0.51)
overlap = False
seed = 42

analogue_df = get_analogue_groups_file(
    group_size=group_size,
    sim_range=sim_range,
    overlap=overlap,
    seed=seed,
    morgan_radius=morgan_radius,
    fpSize=fpSize,
)
len(analogue_df)

File data\group_similarity\compounds_ms2structures_analogue_group_size8_range(0.5, 0.51)_overlap0_seed42_morgan9_2048bits.csv not found. Running the analogue group selection.


Selecting analogue groups:   0%|          | 0/37811 [00:00<?, ?it/s]

Analogue groups saved to data\group_similarity\compounds_ms2structures_analogue_group_size8_range(0.5, 0.51)_overlap0_seed42_morgan9_2048bits.csv.


Unnamed: 0_level_0,analogue_ids
query_id,Unnamed: 1_level_1
129,"[35187, 5453, 4909, 11692, 10720, 35941, 17521..."
182,"[33137, 3922, 24681, 18256, 2731, 27126, 26173..."
213,"[6221, 24010, 24991, 4226, 35006, 36666, 20365..."
253,"[25772, 31872, 14249, 570, 9317, 14495, 10572,..."
263,"[18669, 11214, 17449, 18997, 6681, 20763, 3065..."
...,...
32897,"[28363, 32519, 2526, 33051, 28725, 16869, 4698..."
35211,"[36288, 19156, 15813, 37725, 23564, 31179, 375..."
35699,"[8413, 26312, 22450, 26735, 29954, 11664, 1566..."
35900,"[16428, 32394, 14617, 37542, 8357, 10231, 3675..."


In [167]:
# Load the cached benchmark table for these settings or compute it from the analogue groups.
analogue_df = get_benchmark_df(
    analogue_df=analogue_df,
    compounds=compounds,
    group_size=group_size,
    sim_range=sim_range,
    overlap=overlap,
    seed=seed,
    morgan_radius=morgan_radius,
    fpSize=fpSize,
)
analogue_df

File data\group_similarity\compounds_ms2structures_benchmark_group_size8_range(0.5, 0.51)_overlap0_seed42_morgan9_2048bits.csv not found. Running the benchmark dataframe generation.


Processing queries:   0%|          | 0/210 [00:00<?, ?it/s]

Processing analogues for query 129:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 182:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 213:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 253:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 263:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 273:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 290:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 312:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 355:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 367:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 425:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 435:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 570:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 586:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 607:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 626:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 691:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 710:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 714:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 719:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 721:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 788:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 810:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 817:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 829:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 836:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 856:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 913:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 968:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1017:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1040:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1051:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1055:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1105:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1109:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1166:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1180:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1181:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1190:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1254:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1269:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1284:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1295:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1304:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1383:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1403:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1417:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1426:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1461:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1544:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1588:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1595:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1649:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1671:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1686:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1778:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1790:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1800:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1868:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1921:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 1979:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2005:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2053:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2055:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2073:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2113:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2128:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2165:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2171:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2214:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2274:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2292:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2311:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2326:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2379:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2397:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2398:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2417:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2504:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2529:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2544:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2604:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2629:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2722:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2822:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2938:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2970:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2977:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 2985:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3009:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3020:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3039:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3044:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3068:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3102:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3113:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3264:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3338:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3513:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3609:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3664:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3682:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3769:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3829:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3867:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3873:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3935:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3966:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 3968:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4038:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4110:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4226:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4227:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4383:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4401:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4502:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4511:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4524:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4573:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4744:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4838:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4878:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4911:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 4921:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5039:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5057:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5291:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5460:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5578:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5636:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5640:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5739:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 5922:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6303:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6360:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6420:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6469:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6628:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6654:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6721:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 6738:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7122:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7174:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7408:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7432:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7740:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7843:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7855:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 7947:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 8033:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 8086:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 8261:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 8487:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 8507:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 9181:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 9462:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 9978:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 10215:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 10389:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 10598:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 10800:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11164:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11371:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11495:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11500:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11788:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 11844:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12002:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12110:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12236:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12546:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12668:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12824:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12826:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 12848:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 13134:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 13592:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 13644:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 13939:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 15318:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 15551:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 16554:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 16931:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 17351:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 17743:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 18830:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 19002:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 19152:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 19870:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 20686:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 21519:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 21598:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 21793:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 22693:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 27407:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 27588:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 27653:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 27667:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 28384:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 30273:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 30493:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 30889:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 30979:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 31023:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 32529:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 32897:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 35211:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 35699:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 35900:   0%|          | 0/8 [00:00<?, ?it/s]

Processing analogues for query 37589:   0%|          | 0/8 [00:00<?, ?it/s]

Benchmark dataframe saved to data\group_similarity\compounds_ms2structures_benchmark_group_size8_range(0.5, 0.51)_overlap0_seed42_morgan9_2048bits.csv.


Unnamed: 0_level_0,analogue_ids,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
129,"[35187, 5453, 4909, 11692, 10720, 35941, 17521...",0.885183,0.504370,0.350718,0.496337
182,"[33137, 3922, 24681, 18256, 2731, 27126, 26173...",0.884319,0.503652,0.244808,0.457702
213,"[6221, 24010, 24991, 4226, 35006, 36666, 20365...",0.910941,0.504126,0.225915,0.439978
253,"[25772, 31872, 14249, 570, 9317, 14495, 10572,...",0.934316,0.506322,0.273485,0.454707
263,"[18669, 11214, 17449, 18997, 6681, 20763, 3065...",0.942236,0.505779,0.233329,0.423703
...,...,...,...,...,...
32897,"[28363, 32519, 2526, 33051, 28725, 16869, 4698...",0.813012,0.505376,0.467100,0.496403
35211,"[36288, 19156, 15813, 37725, 23564, 31179, 375...",0.931206,0.504794,0.313162,0.528711
35699,"[8413, 26312, 22450, 26735, 29954, 11664, 1566...",0.844130,0.505104,0.271795,0.472649
35900,"[16428, 32394, 14617, 37542, 8357, 10231, 3675...",0.873754,0.503825,0.331287,0.485041


In [168]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# similarity columns of analogue_df.
# NOTE(review): presumably analogue_df is the benchmark frame produced by the
# preceding cell — confirm.
analogue_df.describe()

Unnamed: 0,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
count,210.0,210.0,210.0,210.0
mean,0.903018,0.504637,0.240266,0.463734
std,0.039237,0.001093,0.083252,0.051868
min,0.711558,0.501537,-0.095535,0.347549
25%,0.883658,0.503858,0.186916,0.433595
50%,0.909787,0.504747,0.242064,0.457948
75%,0.926746,0.5054,0.296945,0.487403
max,0.99062,0.507075,0.4671,0.748284


In [None]:
# NOTE(review): this cell repeats the previous cell's call verbatim and has no
# execution count. Its saved output (count 118) disagrees with the previous
# cell's output (count 210), so it appears to be stale output from a different
# analogue_df. Re-run under Restart & Run All, or build that other frame
# explicitly under its own name before describing it.
analogue_df.describe()

Unnamed: 0,stacked_query_sim,fp_query_sim,stacked_group_sim,fp_group_sim
count,118.0,118.0,118.0,118.0
mean,0.971136,0.877939,0.452163,0.856637
std,0.009882,0.017742,0.048879,0.020949
min,0.937513,0.842886,0.381714,0.792859
25%,0.965328,0.864058,0.407028,0.844243
50%,0.973211,0.879951,0.454163,0.858812
75%,0.978411,0.892015,0.480395,0.872634
max,0.98507,0.909774,0.565702,0.904048
