In [2]:
from DiverseSelector.metric import (bit_tanimoto,
                                    ComputeDistanceMatrix,
                                    distance_to_similarity,
                                    entropy,
                                    euc_bit,
                                    logdet,
                                    modified_tanimoto,
                                    pairwise_similarity_bit,
                                    shannon_entropy,
                                    tanimoto,
                                    total_diversity_volume,
                                    )

from DiverseSelector.feature import (DescriptorGenerator,
                                     feature_reader,
                                     FingerprintGenerator,
                                     get_features,
                                     )

  from rdkit.Chem import MCS


In [3]:
descriptor_1 = get_features(mol_file= "/Users/hasankhaleel/VScode_repos/DiverseSelector/DiverseSelector/test/data/drugbank_sample_set.sdf",
                                          feature_type="descriptor",
                                          desc_type="rdkit_frag",
                                          use_fragment=True,
                                          ipc_avg=True)

fingerprint_1 = get_features(mol_file= "/Users/hasankhaleel/VScode_repos/DiverseSelector/DiverseSelector/test/data/drugbank_sample_set.sdf",
                                          feature_type="fingerprint")

descriptor_2 = get_features(mol_file= "/Users/hasankhaleel/VScode_repos/DiverseSelector/DiverseSelector/test/data/drugbank_sample_set_2.sdf",
                                          feature_type="descriptor",
                                          desc_type="rdkit_frag",
                                          use_fragment=True,
                                          ipc_avg=True)

fingerprint_2 = get_features(mol_file= "/Users/hasankhaleel/VScode_repos/DiverseSelector/DiverseSelector/test/data/drugbank_sample_set_2.sdf",
                                          feature_type="fingerprint")

#### Euclidean distance
$$D_{a, b} =\Bigg[\displaystyle\sum_{j=1}^{n} (X_{ja} - X_{jb})^2\Bigg]^{0.5} $$

In [4]:
sci_dist_1 = ComputeDistanceMatrix(descriptor_1, "euclidean")
desc_1 = sci_dist_1.compute_distance()
sci_dist_2 = ComputeDistanceMatrix(descriptor_2, "euclidean")
desc_2 = sci_dist_2.compute_distance()
print(desc_1)

[[ 0.         63.82763257 16.31033535 ... 11.69751122  6.85761877
  18.96320907]
 [63.82763257  0.         78.90973652 ... 74.58822551 62.77429442
  81.93588319]
 [16.31033535 78.90973652  0.         ...  6.59567814 17.40944959
   5.21669579]
 ...
 [11.69751122 74.58822551  6.59567814 ...  0.         13.00320589
   8.43577609]
 [ 6.85761877 62.77429442 17.40944959 ... 13.00320589  0.
  19.78386583]
 [18.96320907 81.93588319  5.21669579 ...  8.43577609 19.78386583
   0.        ]]


#### Tanimoto similarity
$$ S_{a,b} =\frac{\Bigg[\displaystyle\sum_{j=1}^{n} X_{ja} X_{jb}\Bigg]}{\Bigg[\displaystyle\sum_{j=1}^{n} (X_{ja})^2 + \displaystyle\sum_{j=1}^{n} (X_{jb})^2 - \displaystyle\sum_{j=1}^{n} X_{ja} X_{jb}\Bigg]}$$

In [13]:
tanimoto_similarity_1 = pairwise_similarity_bit(desc_1, tanimoto)
tanimoto_similarity_2 = pairwise_similarity_bit(desc_2, tanimoto)
print(tanimoto_similarity_2)

[[1.         0.96078431 0.96078431 ... 0.96078431 0.96078431 0.96078431]
 [0.96078431 1.         0.96078431 ... 0.96078431 0.96078431 0.96078431]
 [0.96078431 0.96078431 1.         ... 0.96078431 0.96078431 0.96078431]
 ...
 [0.96078431 0.96078431 0.96078431 ... 1.         0.96078431 0.96078431]
 [0.96078431 0.96078431 0.96078431 ... 0.96078431 1.         0.96078431]
 [0.96078431 0.96078431 0.96078431 ... 0.96078431 0.96078431 1.        ]]


#### Modified Tanimoto similarity
$$ MT = \Big(\frac{2-p}{3}\Big)T_1 + \Big(\frac{1+p}{3}\Big)T_0 $$
$$ p = \frac{\displaystyle\sum_{l=1}^{n}X_{il} + \displaystyle\sum_{l=1}^{n}X_{jl}}{2n}$$
$$ T_1 = \frac{\displaystyle\sum_{l=1}^{n}X_{il}X_{jl}}{n - \displaystyle\sum_{l=1}^{n}(1 - X_{il})(1 - X_{jl})} $$
$$  T_0 = \frac{\displaystyle\sum_{l=1}^{n}(1 - X_{il})(1 - X_{jl})}{n - \displaystyle\sum_{l=1}^{n}X_{il}X_{jl}} $$

In [14]:
similarity_modified_tanimoto_1 = pairwise_similarity_bit(fingerprint_1, modified_tanimoto)
similarity_modified_tanimoto_2 = pairwise_similarity_bit(fingerprint_2, modified_tanimoto)
print(similarity_modified_tanimoto_2)

[[1.         0.33000128 0.33000128 ... 1.         0.32654107 0.31923619]
 [0.33000128 1.         0.32654107 ... 0.33000128 0.32295271 0.31539152]
 [0.33000128 0.32654107 1.         ... 0.33000128 0.32295271 0.31539152]
 ...
 [1.         0.33000128 0.33000128 ... 1.         0.32654107 0.31923619]
 [0.32654107 0.32295271 0.32295271 ... 0.32654107 1.         0.31141869]
 [0.31923619 0.31539152 0.31539152 ... 0.31923619 0.31141869 1.        ]]


#### Log-determinant fucntion
$$f_{logdet}(S) = log det(X[S]^TX[S]+I[S]) $$

In [7]:
logdet_diversity_1 = logdet(desc_1)
logdet_diversity_2 = logdet(desc_2)
print(logdet_diversity_1, logdet_diversity_2)

98.97936193428563 112.87396744628899


#### Entropy
$$ E = -\frac{\displaystyle\sum_{}^{}\frac{y_i}{N}ln\frac{y_i}{N}}{L\frac{ln2}{2}} $$

In [8]:
entropy_diversity_1 = entropy(desc_1)
entropy_diversity_2 = entropy(desc_2)
print(entropy_diversity_1, entropy_diversity_2)
# 0 is low divesrity, 1 is high divesrity

0.059364799312260255 0.05491955439594571


#### Shannon Entropy
$$ H(X) = \displaystyle\sum_{n}^{i = 1} -\frac{C_i(X)}{|X|}log\frac{C_i(X)}{|X|}

In [9]:
shannon_entropy_diversity_1 = shannon_entropy(desc_1)
shannon_entropy_diversity_2 = shannon_entropy(desc_2)
print(shannon_entropy_diversity_1, shannon_entropy_diversity_2)
# 0 is low divesrity, 1 is high divesrity

0.44676463198907096 0.42157704715282207


#### Total Diversity Volume
$$g(S) = \displaystyle\sum_{i<j}^{k}O(i,j)$$
$$O(i,j) = 
\begin{cases}
min(100,2r_0/r_{ij} - 1) & r_{ij}\leq2r_0 \\
0 & r_{ij} > r_0
\end{cases}
$$
$$
r_0 = d \sqrt{\frac{1}{k}}

In [10]:
total_diversity_1 = total_diversity_volume(desc_1)
total_diversity_2 = total_diversity_volume(desc_2)
print(total_diversity_1, total_diversity_2)
# low volume means less diversity

10839.83671769063 11811.036392347618
