In [None]:
# Notebook configuration. These constants are interpolated into the cache file
# names built in the init cell, so changing any of them selects (or creates)
# a different set of cached artefacts.

# dataset
# Base name (without extension) of the CSV file under data/datasets.
FILENAME = "compounds_ms2structures"

# similarity calculation
# Radius and vector length passed to the RDKit Morgan fingerprint generator.
# NOTE(review): a radius of 9 is far larger than the commonly used 2-3 - confirm this is intended.
MORGAN_RADIUS=9
FINGERPRINT_BITS=2048

# analogue selection
NUM_QUERIES = 1000                  # number of analogue groups to collect
GROUP_SIZE=10                       # analogues sampled per query
SIMILARITY_RANGE=(0.7, 0.9999)      # inclusive (lower, upper) similarity bounds for an analogue
NO_OVERLAP=False                    # True: a compound may be used in at most one group
RANDOM_SELECT = True   # Set to False to iterate through the dataset sequentially. Might be better for high similarity selections.
SEED=42                             # seed for Python's `random` module during selection

## Init

In [220]:
import os
import random
import pandas as pd
import numpy as np
import numba
from numba import prange
from tqdm.notebook import tqdm

from rdkit import DataStructs
from rdkit.Chem.Draw import SimilarityMaps
from rdkit.Chem import rdFingerprintGenerator, SmilesMolSupplier

from fingerprint_computation import FingerprintGenerator, compute_fingerprints_from_smiles
from ms_chemical_space_explorer.similarity_maps import generate_mol, get_similarity_map_weights, FingerprintFunction

# Input dataset and cache locations. All paths are relative to the working
# directory and derived from the constants in the configuration cell.
path = os.path.join( "data", "datasets", FILENAME + ".csv")
# Cached fingerprints, one row per compound.
fingerprints_file = os.path.join("data", "group_similarity", f"{FILENAME}_fingerprints_morgan{MORGAN_RADIUS}_{FINGERPRINT_BITS}bits.npy")
# Cached all-vs-all Ruzicka similarity matrix.
sim_matrix_file = os.path.join("data", "group_similarity", f"{FILENAME}_ruzicka_similarities_morgan{MORGAN_RADIUS}_{FINGERPRINT_BITS}bits.npy")
# Benchmark CSV; every selection parameter is encoded in the file name so
# different configurations never overwrite each other.
df_file = os.path.join("data", "group_similarity", "benchmarks", f"{FILENAME}_benchmark_num_queries{NUM_QUERIES}_random{int(RANDOM_SELECT)}_size{GROUP_SIZE}_range{SIMILARITY_RANGE}_no_overlap{int(NO_OVERLAP)}_seed{SEED}_morgan{MORGAN_RADIUS}_{FINGERPRINT_BITS}bits.csv")

# Disabled alternative loader based on RDKit's SmilesMolSupplier, kept for reference.
# if FILENAME == "compounds_ms2structures":
#     supplier = SmilesMolSupplier(path, delimiter=',', titleLine=True, smilesColumn=1, nameColumn=0)
# elif FILENAME == "biostructures_combined":
#     supplier = SmilesMolSupplier(path, titleLine=True, nameColumn=0)

# Load the compound table; later cells read its `smiles` column by row position.
compounds = pd.read_csv(path)
compounds.head()

Unnamed: 0,inchikey,smiles,mass,cf_class,cf_subclass,cf_superclass,formula,npc_class_results,npc_pathway_results,npc_superclass_results
0,AAAQFGUYHFJNHI,CCNC(=O)C[C@H]1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N...,423.146204,Benzodiazepines,"1,4-benzodiazepines",Organoheterocyclic compounds,C22H22ClN5O2,,Alkaloids,
1,AABFWJDLCCDJJN,COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54,324.126264,Harmala alkaloids,,Alkaloids and derivatives,C22H16N2O,Carboline alkaloids,Alkaloids,Tryptophan alkaloids
2,AABILZKQMVKFHP,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,427.220624,,,Alkaloids and derivatives,C21H33NO8,Pyrrolizidine alkaloids,Alkaloids,Ornithine alkaloids
3,AABUHSBGEIUSRJ,CC(=O)NC1=CC=C(C=C1)NC(=O)C=CC2=CC=CC=C2,280.120724,Cinnamic acids and derivatives,Cinnamic acid amides,Phenylpropanoids and polyketides,C17H16N2O2,Cinnamic acid amides,Shikimates and Phenylpropanoids,Phenylpropanoids (C6-C3)
4,AABUKWVVUWBZCS,C1=CC=C(C=C1)C2=C(C(=O)OC3=C2C=CC(=C3)O)C4=CC=...,314.094724,Neoflavonoids,Neoflavones,Phenylpropanoids and polyketides,C21H14O3,Neoflavonoids,Shikimates and Phenylpropanoids,Flavonoids


In [221]:
# Spot check: show the SMILES string stored for the compound with index label 12.
compounds.smiles[12]

'CCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](COP(=O)([O-])OCC[N+](C)(C)C)[C@@H](/C=C/CCCCCCCCCCCCC)O'

## Fingerprints & Similarities

In [222]:
# code taken from https://github.com/florian-huber/molecular_fingerprint_comparisons
@numba.njit
def ruzicka_similarity(A, B):
    """
    Calculate the Ruzicka similarity between two count vectors.

    The score is sum(min(A, B)) / sum(max(A, B)), i.e. the Tanimoto/Jaccard
    coefficient generalised to count vectors.

    Parameters:
    A (array-like): First count vector.
    B (array-like): Second count vector of the same length.

    Returns:
    float: Ruzicka similarity. 0.0 is returned when the element-wise maxima
    sum to zero (e.g. both vectors are all-zero), where the ratio is undefined.
    """

    min_sum = np.sum(np.minimum(A, B))
    max_sum = np.sum(np.maximum(A, B))

    # Guard the 0/0 case explicitly instead of dividing by zero: two vectors
    # without any features are treated as having no measurable similarity.
    if max_sum == 0:
        return 0.0

    return min_sum / max_sum


@numba.jit(nopython=True, fastmath=True, parallel=True)
def ruzicka_similarity_matrix(references: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Compute the all-vs-all Ruzicka similarity between two sets of vectors.

    Parameters
    ----------
    references
        2D numpy array holding one reference vector per row
        (vector_i is references[i, :]).
    queries
        2D numpy array holding one query vector per row
        (vector_j is queries[j, :]). Must have as many columns as references.

    Returns
    -------
    scores
        Array of shape (len(references), len(queries)) where scores[i, j] is
        the Ruzicka similarity of references[i, :] and queries[j, :].
    """
    assert references.shape[1] == queries.shape[1], "Vector sizes do not match!"

    num_references = references.shape[0]
    num_queries = queries.shape[0]
    scores = np.zeros((num_references, num_queries))
    # The outer loop over reference rows is the parallel one (prange).
    for ref_idx in prange(num_references):
        reference_vector = references[ref_idx, :]
        for query_idx in range(num_queries):
            scores[ref_idx, query_idx] = ruzicka_similarity(reference_vector, queries[query_idx, :])
    return scores


def compute_similarity_matrix(fingerprints, sim_matrix_file):
    """Compute the all-vs-all Ruzicka similarity matrix and cache it on disk.

    Parameters
    ----------
    fingerprints
        2D array with one fingerprint vector per row.
    sim_matrix_file
        Path of the .npy file the matrix is written to. Missing parent
        directories are created.

    Returns
    -------
    Read-only memory map of the saved float32 matrix.
    """
    # Create the target directory *before* the expensive computation so a
    # missing folder cannot discard the result at save time.
    output_dir = os.path.dirname(sim_matrix_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    similarities_morgan_count = ruzicka_similarity_matrix(fingerprints, fingerprints)
    np.save(sim_matrix_file, similarities_morgan_count.astype(np.float32)) # big one ~5GB
    # Drop the in-memory matrix; callers work with the memory-mapped file instead.
    del similarities_morgan_count
    return np.load(sim_matrix_file, mmap_mode ='r')

In [223]:
try:
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')
    print(f"File {fingerprints_file} found. Loading fingerprints.")
except FileNotFoundError:
    print(f"File {fingerprints_file} not found. Running the fingerprint generation.")
    # fingerprint generation
    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=MORGAN_RADIUS, fpSize=FINGERPRINT_BITS)
    fingerprints = compute_fingerprints_from_smiles(compounds.smiles, fpgen, count=True, sparse=False, progress_bar=True)
    np.save(fingerprints_file, fingerprints.astype(np.float32))
    fingerprints = np.load(fingerprints_file, mmap_mode ='r')

# sim matrix generation
try:
    sim_matrix = np.load(sim_matrix_file, mmap_mode ='r')
    print(f"File {sim_matrix_file} found. Loading similarity matrix.")
except FileNotFoundError:
    print(f"File {sim_matrix_file} not found. Running the fingerprint & Similarity computation.")
    %time
    sim_matrix = compute_similarity_matrix(fingerprints, sim_matrix_file)

print(fingerprints.shape, sim_matrix.shape)

File data\group_similarity\compounds_ms2structures_fingerprints_morgan9_2048bits.npy found. Loading fingerprints.
File data\group_similarity\compounds_ms2structures_ruzicka_similarities_morgan9_2048bits.npy found. Loading similarity matrix.
(37811, 2048) (37811, 37811)


In [224]:
# Preview the first 10 rows of the similarity matrix.
sim_matrix[:10]

memmap([[1.        , 0.14537445, 0.06024097, ..., 0.14432989, 0.09448819,
         0.08896797],
        [0.14537445, 1.        , 0.03361345, ..., 0.18604651, 0.07438017,
         0.04727273],
        [0.06024097, 0.03361345, 1.        , ..., 0.04      , 0.10460251,
         0.13618676],
        ...,
        [0.05625   , 0.02731092, 0.11286682, ..., 0.025     , 0.06736842,
         0.11924686],
        [0.06586827, 0.0738255 , 0.01863354, ..., 0.07964602, 0.05952381,
         0.02487562],
        [0.03233831, 0.02319588, 0.06933333, ..., 0.01126761, 0.04010025,
         0.07263923]], dtype=float32)

## Analogue Selection

In [None]:
def select_analogue_groups(similarity_matrix, num_queries=10, random_select=False, group_size=30, sim_range=(0.8, 0.9999), no_overlap=True, seed=42, print_mean_similarity=False):
    """Pick query compounds together with a random group of analogues for each.

    Queries are visited either sequentially or in random order. A query is kept
    when at least `group_size` other compounds have a similarity to it inside
    `sim_range`; `group_size` of them are then sampled at random.

    Parameters
    ----------
    similarity_matrix
        Square all-vs-all similarity matrix; row i holds the similarities of
        compound i to every compound.
    num_queries
        Number of analogue groups to collect before stopping.
    random_select
        True: draw queries at random without repetition. False: visit compounds
        in index order.
    group_size
        Number of analogues sampled per query.
    sim_range
        Inclusive (lower, upper) similarity bounds for an analogue.
    no_overlap
        True: a compound that is already a query or an analogue of an accepted
        group is not used as analogue again.
    seed
        Seed for Python's global `random` generator (re-seeded on every call).
    print_mean_similarity
        True: print the mean similarity of the candidates of each accepted query.

    Returns
    -------
    pandas.DataFrame indexed by `query_id` with one column `analogue_ids`
    holding a list of plain Python ints per query. Fewer than `num_queries`
    rows are returned (with a printed notice) if not enough groups exist.
    """
    random.seed(seed)
    num_compounds = len(similarity_matrix)
    lower_bound, upper_bound = sim_range

    # Sets keep the membership tests O(1); only membership is ever needed.
    used_indices = set()
    used_queries = set()
    query_ids = []
    analogue_groups = []

    # The first random query is drawn before the loop; the order of all random
    # calls matches earlier versions so existing seeds give the same groups.
    if random_select and num_compounds > 0:
        query_index = random.randint(0, num_compounds - 1)
    else:
        query_index = 0

    for i in tqdm(range(num_compounds), desc="Selecting analogue groups"):
        if random_select:
            # rejection sampling: redraw until an unvisited query comes up
            while query_index in used_queries:
                query_index = random.randint(0, num_compounds - 1)
        else:
            query_index = i

        # Read the row once (it may live in a memory-mapped file).
        query_row = np.asarray(similarity_matrix[query_index])
        in_range = np.nonzero((query_row >= lower_bound) & (query_row <= upper_bound))[0]
        # `.tolist()` yields plain ints: numpy integers would be written to CSV
        # as e.g. "np.int64(5)" under NumPy 2 and could not be parsed back.
        # A query is never its own analogue; already used ids are skipped as well.
        similar_indices = [idx for idx in in_range.tolist() if idx != query_index and idx not in used_indices]

        # check if group size is large enough
        if len(similar_indices) >= group_size:
            if print_mean_similarity:
                mean_similarity = np.mean(query_row[similar_indices])
                print(f"Index {query_index}: Found {len(similar_indices)} similar compounds with mean similarity {mean_similarity:.3f}. Picking {group_size} random matches.")

            random_matches = random.sample(similar_indices, group_size)
            query_ids.append(query_index)
            analogue_groups.append(random_matches)
            if no_overlap:
                used_indices.add(query_index)
                used_indices.update(random_matches)

        used_queries.add(query_index)

        if len(query_ids) >= num_queries:
            break

    # Build the frame once instead of growing it row by row.
    analogue_df = pd.DataFrame({"query_id": query_ids, "analogue_ids": analogue_groups}).set_index("query_id")
    if len(analogue_df) < num_queries:
        print(f"Found only {len(analogue_df)} analogue groups for given parameters:\nnum_queries={num_queries}, random_select={random_select}, group_size={group_size}, sim_range={sim_range}, no_overlap={no_overlap}.")
    return analogue_df


def str_to_list_of_ints(series):
    """Parse the text form of an integer list, e.g. "[1, 2, 3]", into a list of ints.

    Despite its name, `series` is a single string (one CSV cell). Surrounding
    square brackets are stripped and the rest is split on commas. Tokens that
    do not consist solely of digits after trimming whitespace (empty tokens,
    negative numbers, arbitrary text) are skipped silently.
    """
    parsed = []
    for token in series.strip("[]").split(","):
        if not token.strip().isdigit():
            continue
        parsed.append(int(token))
    return parsed


def get_analogue_groups_df(df_file, num_queries, random_select, group_size, sim_range, no_overlap, seed, morgan_radius, fpSize):
    """Load the analogue groups cached in `df_file`, or select and cache them.

    If `df_file` exists it is read and its `analogue_ids` column is parsed back
    into lists of ints. Otherwise the groups are selected from the module-level
    `sim_matrix` with `select_analogue_groups` and written to `df_file`
    (missing parent directories are created).

    Parameters
    ----------
    df_file
        Path of the CSV cache.
    num_queries, random_select, group_size, sim_range, no_overlap, seed
        Forwarded unchanged to `select_analogue_groups`.
    morgan_radius, fpSize
        Not used inside this function; accepted for call-site compatibility.

    Returns
    -------
    pandas.DataFrame indexed by query id with an `analogue_ids` column.
    """
    try:
        analogue_df = pd.read_csv(df_file, index_col=0)
    except FileNotFoundError:
        print(f"File not found: {df_file}\nRunning the analogue group selection.")
        # Create the output folder up front so the save cannot fail after the
        # (potentially long) selection has finished.
        output_dir = os.path.dirname(df_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # analogue group selection
        analogue_df = select_analogue_groups(sim_matrix,
                                             num_queries=num_queries,
                                             random_select=random_select,
                                             group_size=group_size,
                                             sim_range=sim_range,
                                             no_overlap=no_overlap,
                                             seed=seed,
                                            )
        analogue_df.to_csv(df_file)
        print(f"{len(analogue_df)} analogue groups saved to {df_file}.")
    else:
        # CSV stores the id lists as text; turn them back into lists of ints
        analogue_df["analogue_ids"] = analogue_df["analogue_ids"].apply(str_to_list_of_ints)
        print(f"Found {df_file}\nLoaded {len(analogue_df)} analogue groups.")
    return analogue_df

In [226]:
# Load the cached analogue groups for the configured parameters, or select them now.
analogue_df = get_analogue_groups_df(df_file, NUM_QUERIES, RANDOM_SELECT, GROUP_SIZE, SIMILARITY_RANGE, NO_OVERLAP, SEED, MORGAN_RADIUS, FINGERPRINT_BITS)
# Number of analogue groups obtained.
len(analogue_df)

File not found: data\group_similarity\benchmarks\compounds_ms2structures_benchmark_num_queries1000_random1_size10_range(0.7, 0.9999)_no_overlap0_seed42_morgan9_2048bits.csv
Running the analogue group selection.


Selecting analogue groups:   0%|          | 0/37811 [00:00<?, ?it/s]

1000 analogue groups saved to data\group_similarity\benchmarks\compounds_ms2structures_benchmark_num_queries1000_random1_size10_range(0.7, 0.9999)_no_overlap0_seed42_morgan9_2048bits.csv.


1000

## Benchmark

In [227]:
def off_diagonal_mean(matrix):
    """Return the mean of all entries of a square matrix except its main diagonal.

    The number of rows determines the size of the selection mask, so `matrix`
    is expected to be an n x n numpy array.
    """
    size = matrix.shape[0]
    # Start from "select everything", then switch the diagonal off.
    keep = np.ones((size, size), dtype=bool)
    np.fill_diagonal(keep, False)
    off_diagonal_values = matrix[keep]
    return off_diagonal_values.mean()


def get_benchmark_df_file(df_file, analogue_df, morgan_radius, fpSize):
    """Load the benchmark stored in `df_file`, or compute it and save it there.

    A cached file counts as a finished benchmark when it contains the columns
    `query_sim`, `group_sim`, `sgs` and `sgs_score`. Otherwise these columns are
    computed for every query row and the frame is written to `df_file`.

    Reads the module-level `compounds`, `sim_matrix` and `fingerprints`.

    Parameters
    ----------
    df_file
        Path of the benchmark CSV (read as cache, written as result).
    analogue_df
        Frame indexed by query id with an `analogue_ids` column. Used when
        `df_file` cannot supply the analogue groups; it is then extended in place.
    morgan_radius, fpSize
        Assigned to the `morgan_radius` and `nbits` attributes of the
        FingerprintFunction used for the similarity maps.

    Returns
    -------
    pandas.DataFrame with the analogue groups and, per query:
    query_sim  - mean similarity (from `sim_matrix`) between the query and its analogues
    group_sim  - mean off-diagonal Ruzicka similarity among the analogues
    sgs        - mean of the analogues' mean stacked atomic weights, mapped from [-1, 1] to [0, 1]
    sgs_score  - mean over the analogues of 1 - 0.5 * (mean absolute difference between
                 the stacked weights and the weights obtained against the query)
    """
    benchmark_columns = ("query_sim", "group_sim", "sgs", "sgs_score")

    # --- cache lookup ---
    # Only a missing file is an expected condition; any other read error
    # propagates instead of silently triggering a long recomputation.
    try:
        cached_df = pd.read_csv(df_file, index_col=0)
    except FileNotFoundError:
        cached_df = None

    if cached_df is not None and "analogue_ids" in cached_df.columns:
        cached_df["analogue_ids"] = cached_df["analogue_ids"].apply(str_to_list_of_ints)
        print(f"Found {df_file}\nTrying to access benchmark columns.\nExisting columns: {cached_df.columns.tolist()}")
        if all(column in cached_df.columns for column in benchmark_columns):
            print(f"Benchmark found! Loaded benchmark for {len(cached_df)} queries.")
            return cached_df
        # The file holds the analogue groups but no benchmark yet: continue with it.
        analogue_df = cached_df

    print(f"Benchmark not found: {df_file}\nRunning the benchmark dataframe generation.")
    # generate benchmark dataframe
    # NOTE(review): FingerprintFunction, generate_mol and get_similarity_map_weights are
    # project helpers; the attribute meanings below are assumed from their names - confirm.
    fp_function = FingerprintFunction(fingerprint="MORGAN")
    fp_function.fp_type = "count"
    fp_function.morgan_radius = morgan_radius
    fp_function.nbits = fpSize

    for query_id, row in tqdm(analogue_df.iterrows(), desc="Processing queries", total=len(analogue_df)):
        analogue_ids = [int(x) for x in row["analogue_ids"]]
        query = generate_mol(compounds.smiles[query_id])

        # Build every analogue molecule once per group; each one is needed as
        # reference and as probe in the loops below.
        analogues = {mol_id: generate_mol(compounds.smiles[mol_id]) for mol_id in analogue_ids}

        query_sims = []
        stacked_mean_weights = []
        query_scaled_inverted_differences = []
        for ref_id in tqdm(analogue_ids, desc=f"Processing analogues for query {query_id}", leave=False):
            # gather ruzicka similarity of analogue vs query
            query_sims.append(sim_matrix[query_id][ref_id])

            ref_analogue = analogues[ref_id]
            num_ref_atoms = ref_analogue.GetNumAtoms()

            # per-atom accumulator for the weights of the reference molecule
            stacked_atomic_weights = [0] * num_ref_atoms

            # compute the group similarity map weights: one weight per atom of
            # the reference molecule for every other analogue of the group
            for probe_id in analogue_ids:
                if ref_id == probe_id:
                    continue

                # this call is the bottleneck of the benchmark
                ref_sim_weights = SimilarityMaps.GetAtomicWeightsForFingerprint(analogues[probe_id], ref_analogue, fp_function.get_function, metric=DataStructs.TanimotoSimilarity)

                # stack the weights
                stacked_atomic_weights = [stacked_atomic_weights[i] + ref_sim_weights[i] for i in range(num_ref_atoms)]

            # Standardize the stacked weights
            stacked_atomic_weights, _ = SimilarityMaps.GetStandardizedWeights(stacked_atomic_weights)
            stacked_mean_weights.append(sum(stacked_atomic_weights) / len(stacked_atomic_weights))

            # compute the similarity map weights against the query molecule
            query_atomic_weights = get_similarity_map_weights(ref_analogue, query, fp_function)

            # calculate absolute weight difference for each atom against the query
            query_abs_differences = [abs(stacked_atomic_weights[i] - query_atomic_weights[i]) for i in range(len(stacked_atomic_weights))]
            query_mean_difference = sum(query_abs_differences) / len(query_abs_differences)

            # scale and invert the mean absolute difference to get a similarity score
            query_scaled_inverted_differences.append(1 - (query_mean_difference * 0.5))

        # --- gather all similarity metrics ---
        # mean ruzicka similarity of analogue vs query
        analogue_df.loc[query_id, "query_sim"] = np.mean(query_sims)

        # mean of all-vs-all analogue ruzicka similarities
        group_fingerprints = fingerprints[analogue_ids]
        fp_group_sim_matrix = ruzicka_similarity_matrix(group_fingerprints, group_fingerprints)
        analogue_df.loc[query_id, "group_sim"] = off_diagonal_mean(fp_group_sim_matrix)

        # mean of the means of stacked atomic weights of all-vs-all analogue similarity maps
        unscaled_sgs = np.mean(stacked_mean_weights)                # range [-1, 1]
        analogue_df.loc[query_id, "sgs"] = (unscaled_sgs + 1) / 2   # range [0, 1]

        # mean of the scaled and inverted per-analogue differences computed above
        sgs_score = sum(query_scaled_inverted_differences) / len(query_scaled_inverted_differences)
        analogue_df.loc[query_id, "sgs_score"] = sgs_score

    # make sure the target folder exists so the finished benchmark is not lost on save
    output_dir = os.path.dirname(df_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    analogue_df.to_csv(df_file)
    print(f"Saved {len(analogue_df)} queries to {df_file}.")
    return analogue_df

In [228]:
# Load the cached benchmark, or compute the benchmark columns for every query (long-running).
analogue_df = get_benchmark_df_file(df_file, analogue_df, MORGAN_RADIUS, FINGERPRINT_BITS)
analogue_df

Found data\group_similarity\benchmarks\compounds_ms2structures_benchmark_num_queries1000_random1_size10_range(0.7, 0.9999)_no_overlap0_seed42_morgan9_2048bits.csv
Trying to access benchmark columns.
Existing columns: ['analogue_ids']
Benchmark not found: data\group_similarity\benchmarks\compounds_ms2structures_benchmark_num_queries1000_random1_size10_range(0.7, 0.9999)_no_overlap0_seed42_morgan9_2048bits.csv
Running the benchmark dataframe generation.


Processing queries:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing analogues for query 35713:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 6698:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23723:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23219:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7161:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 2604:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 22471:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 11702:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16693:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 12572:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13774:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 15155:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25558:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 29799:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 1335:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16931:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 26502:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 12552:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3367:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 37303:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16408:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28981:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7597:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 17919:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23492:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28949:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 8963:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7547:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30426:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 26092:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 9388:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 4862:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24435:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 6628:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30256:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 6793:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 8837:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 32519:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 26621:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23651:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 33905:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 20926:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25175:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36035:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 19647:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34356:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34437:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 607:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27252:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 1426:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25066:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 6612:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35818:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34366:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 443:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21357:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27413:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36568:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35593:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34883:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 29798:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 17958:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 5976:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13962:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13464:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 5401:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 15639:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3009:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13607:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7959:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27961:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23047:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 19327:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3252:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24315:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35095:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 9806:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21724:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 23617:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 26027:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 12344:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36585:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7085:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27582:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3012:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 31583:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21013:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 29110:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 29898:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7062:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 17930:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 10145:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21696:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25288:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16509:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24453:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13876:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 20032:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 33053:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24273:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34063:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 10603:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 11486:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7668:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 14442:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 9558:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35287:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 10938:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25548:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36961:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 26983:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21643:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3935:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 15649:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16773:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 10932:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28219:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 12:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 37387:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 15880:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 17778:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16213:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25745:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 14829:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35137:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 16389:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27497:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24154:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7355:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 13143:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 4029:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28292:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 12176:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30470:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 35063:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 11849:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30122:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 4720:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27002:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24610:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36749:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30007:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 32729:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 27493:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 8975:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 5949:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 20447:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 1947:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28207:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 20477:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28059:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 32568:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 14803:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25555:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 31120:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 19491:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 34461:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 28266:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 22228:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21293:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 17595:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 31300:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30335:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30157:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 21879:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 22133:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 3058:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 33154:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 33904:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 19527:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 9173:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 24221:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 22139:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 9439:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 29135:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 25957:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 4263:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 2185:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 33309:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 30238:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 6836:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 7257:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 4192:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 15708:   0%|          | 0/10 [00:00<?, ?it/s]

Processing analogues for query 36834:   0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Summary statistics of the numeric benchmark columns.
analogue_df.describe()

Unnamed: 0,query_sim,group_sim,sgs,sgs_score
count,183.0,183.0,183.0,183.0
mean,0.780751,0.724517,0.7358,0.956671
std,0.037106,0.053803,0.034078,0.018254
min,0.722269,0.616642,0.628068,0.904229
25%,0.750063,0.688034,0.715326,0.944066
50%,0.775435,0.713129,0.733594,0.956413
75%,0.803243,0.758462,0.759569,0.968248
max,0.915372,0.884708,0.810903,0.997214


In [None]:
# NOTE(review): this cell repeats the previous one and shows the same summary - consider removing it.
analogue_df.describe()

Unnamed: 0,query_sim,group_sim,sgs,sgs_score
count,183.0,183.0,183.0,183.0
mean,0.780751,0.724517,0.7358,0.956671
std,0.037106,0.053803,0.034078,0.018254
min,0.722269,0.616642,0.628068,0.904229
25%,0.750063,0.688034,0.715326,0.944066
50%,0.775435,0.713129,0.733594,0.956413
75%,0.803243,0.758462,0.759569,0.968248
max,0.915372,0.884708,0.810903,0.997214
