In [None]:
!time pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.3.5.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.3.5.1

real	0m7.469s
user	0m4.543s
sys	0m0.821s


In [None]:
#Clustering code
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random
from pathlib import Path
import pandas as p


from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator

In [None]:
############# Clustering section
#The identification of potential cluster centroids
# hence we need to define functions to calculate the Tanimoto similarity and distance matrix.
def tanimoto_distance_matrix(fp_list):
    """Calculate distance matrix for fingerprint list"""
    dissimilarity_matrix = []
    # Notice how we are deliberately skipping the first and last items in the list
    # because we don't need to compare them against themselves
    for i in range(1, len(fp_list)):
        # Compare the current fingerprint against all the previous ones in the list
        similarities = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        # Since we need a distance matrix, calculate 1-x for every element in similarity matrix
        dissimilarity_matrix.extend([1 - x for x in similarities])
    return dissimilarity_matrix

#Define clustering 
def cluster_fingerprints(fingerprints, cutoff=0.2):
    """Cluster fingerprints
    Parameters:
        fingerprints
        cutoff: threshold for the clustering
    """
    # Calculate Tanimoto distance matrix
    distance_matrix = tanimoto_distance_matrix(fingerprints)
    # Now cluster the data with the implemented Butina algorithm:
    clusters = Butina.ClusterData(distance_matrix, len(fingerprints), cutoff, isDistData=True)
    clusters = sorted(clusters, key=len, reverse=True)
    return clusters

In [None]:
asx= p.read_csv("Merged_asexual_libraries_hits and nonhits_nd.csv")
asx

Unnamed: 0.1,Unnamed: 0,Smiles,CHEMBLID,Hit,Compound Key
0,0,O=C(OCc1ccccc1)N1CC[C@H]2CC(CO)O[C@@H]2C1,CHEMBL3470688,No,DDD01082494
1,1,Cc1cc(C(=O)N(C)C)nc(C2(C)CCCN2c2ccccc2)n1,CHEMBL3447812,No,DDD01058126
2,2,CN(C)c1nc2c(c(N3CCC(O)CC3)n1)CN(CC1CCNC1)CC2,CHEMBL3486462,No,DDD01257825
3,3,O=C1CCC(C(=O)Nc2cnn(-c3ccccc3Br)c2)N1,CHEMBL3467813,No,DDD01078886
4,4,O=C(CCNc1ccccc1)NC1CCN(C(=O)C2CCCCC2)CC1,CHEMBL3452692,No,DDD01063562
...,...,...,...,...,...
122566,122566,CC(/C=C/C(=O)NO)=C\[C@@H](C)C(=O)c1ccc(N(C)C)cc1,CHEMBL99,Yes,GNF-Pf-1011
122567,122567,O=[N+]([O-])c1cc([N+](=O)[O-])c2cccnc2c1Nc1ccc...,CHEMBL603032,No,GNF-Pf-1014
122568,122568,COc1ccc2c(c1)N(C(=O)CSc1nc3ccccc3o1)C(C)(C)c1s...,CHEMBL585033,Yes,GNF-Pf-4623
122569,122569,CCN1/C(=C/c2cccc[n+]2C)Sc2ccccc21,CHEMBL583805,Yes,GNF-Pf-1023


In [None]:
as_nhts= asx[asx["Hit"]== "No"]
print(len(as_nhts))

df_smiles = as_nhts[['Smiles',"Compound Key", "Hit"]]
print(len(df_smiles))
df_smiles

92178
92178


Unnamed: 0,Smiles,Compound Key,Hit
0,O=C(OCc1ccccc1)N1CC[C@H]2CC(CO)O[C@@H]2C1,DDD01082494,No
1,Cc1cc(C(=O)N(C)C)nc(C2(C)CCCN2c2ccccc2)n1,DDD01058126,No
2,CN(C)c1nc2c(c(N3CCC(O)CC3)n1)CN(CC1CCNC1)CC2,DDD01257825,No
3,O=C1CCC(C(=O)Nc2cnn(-c3ccccc3Br)c2)N1,DDD01078886,No
4,O=C(CCNc1ccccc1)NC1CCN(C(=O)C2CCCCC2)CC1,DDD01063562,No
...,...,...,...
122557,O=C(CNCCc1ccccc1)NC(c1ccccc1)c1ccccc1.O=CO.O=CO,GNF-Pf-3449,No
122558,Cc1ccc(COc2ccc3c(C#N)c4ccccn4c3c2)cc1,GNF-Pf-3454,No
122559,O=C(CN1CCN(S(=O)(=O)c2cccc([N+](=O)[O-])c2)CC1...,GNF-Pf-2736,No
122560,CC(C)N(CC(=O)Nc1cc(C(C)(C)C)nn1-c1ccc(Cl)c(Cl)...,GNF-Pf-2589,No


In [None]:
## Compound
compounds = []
df1 = df_smiles. iloc [:1300] 
df2 = df_smiles. iloc [1301:2600]
df3 = df_smiles. iloc [2601:3900]
df4 = df_smiles. iloc [3901:4300] #note error observed between compounds 4301-4310
df5 = df_smiles. iloc [4310:4600]

for _, compound_key, smiles in df1[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))

for _, compound_key, smiles  in df2[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
for _, compound_key, smiles in df3[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
for _, compound_key, smiles in df4[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))

for _, compound_key, smiles in df5[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))

df6 = df_smiles. iloc [4650:5600]
for _, compound_key, smiles in df6[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))

df7 = df_smiles. iloc [5700:6150]
for _, compound_key, smiles in df7[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df8 = df_smiles. iloc [6180:7970]   
for _, compound_key, smiles  in df8[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 

df9 = df_smiles. iloc [8000:10730]      
for _, compound_key, smiles  in df9[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 
    
df10 = df_smiles. iloc [10740:11410]
for _, compound_key, smiles in df10[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 

df11 = df_smiles. iloc [11420:14010]
for _, compound_key, smiles in df11[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df12 = df_smiles. iloc [14020:14580]
for _, compound_key, smiles in df12[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df13 = df_smiles. iloc [14590:15794]
for _, compound_key, smiles  in df13[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))

df14 = df_smiles. iloc [15795:18061] 
for _, compound_key, smiles  in df14[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df15 = df_smiles. iloc [18062:20531]    
for _, compound_key, smiles  in df15[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))  
    
df16 = df_smiles. iloc [20532:20964]
for _, compound_key, smiles in df16[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df17 = df_smiles. iloc [20966:22471]
for _, compound_key, smiles in df17[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))  

df18 = df_smiles. iloc [22473:25150]
for _, compound_key, smiles in df18[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df19 = df_smiles. iloc [25153:25549]
for _, compound_key, smiles  in df19[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 
    
df20= df_smiles. iloc [25551:28743]
for _, compound_key, smiles in df20[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 
    
df21= df_smiles. iloc [28755:30045]
for _, compound_key, smiles  in df21[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df22= df_smiles. iloc [30047:30857]
for _, compound_key, smiles  in df22[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df23= df_smiles. iloc [30859:31442]
for _, compound_key, smiles  in df23[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df24= df_smiles. iloc [31444:31831]
for _, compound_key, smiles in df24[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df25= df_smiles. iloc [31833:33515]
for _, compound_key, smiles  in df25[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 

df26= df_smiles. iloc [33517:33577]
for _, compound_key, smiles  in df26[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df27= df_smiles. iloc [33579:35623]
for _, compound_key, smiles in df27[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key)) 
    
df28= df_smiles. iloc [35625:36239]
for _, compound_key, smiles  in df28[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df29= df_smiles. iloc [36240:38411]
for _, compound_key, smiles in df29[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key )) 
    
df30= df_smiles. iloc [38413:38536]
for _, compound_key, smiles in df30[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df31= df_smiles. iloc [38538:39460]
for _, compound_key, smiles in df31[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df32= df_smiles. iloc [39462:39880]
for _, compound_key, smiles in df32[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df33= df_smiles. iloc [39882:44468]
for _, compound_key, smiles  in df33[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df34= df_smiles. iloc [44479:45910]
for _, compound_key, smiles  in df34[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df35= df_smiles. iloc [45912:50308]
for _, compound_key, smiles in df35[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df36= df_smiles. iloc [50310:50644]
for _, compound_key, smiles in df36[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df37= df_smiles. iloc [50646:50847]
for _, compound_key, smiles in df37[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df38= df_smiles. iloc [50849:51016]
for _, compound_key, smiles  in df38[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df39= df_smiles. iloc [51018:52255]
for _, compound_key, smiles  in df39[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df40= df_smiles. iloc [52257:52515]
for _, compound_key, smiles  in df40[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df41= df_smiles. iloc [52517:54570]
for _, compound_key, smiles in df41[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df42= df_smiles. iloc [54572:56375]
for _, compound_key, smiles in df42[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df43= df_smiles. iloc [56377:57950]
for _, compound_key, smiles  in df43[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df44= df_smiles. iloc [57952:60363]
for _, compound_key, smiles  in df44[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df45= df_smiles. iloc [60365:61892]
for _, compound_key, smiles in df45[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df46= df_smiles. iloc [61894:86486]
for _, compound_key, smiles in df46[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df47= df_smiles. iloc [86488:87067]
for _, compound_key, smiles in df47[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df48= df_smiles. iloc [87069:89383]
for _, compound_key, smiles  in df48[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))

df49= df_smiles. iloc [89385:90154]
for _, compound_key, smiles in df49[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df50= df_smiles. iloc [90156:90354]
for _, compound_key, smiles in df50[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df51= df_smiles. iloc [90356:90609]
for _, compound_key, smiles  in df51[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df52= df_smiles. iloc [90611:90746]
for _, compound_key, smiles in df52[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
df53= df_smiles. iloc [90748:92006]
for _, compound_key, smiles  in df53[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key))
    
df54= df_smiles. iloc [92008:]
for _, compound_key, smiles  in df54[["Compound Key", "Smiles"]].itertuples():
    compounds.append((Chem.MolFromSmiles(smiles), compound_key ))
    
len(compounds)

91817

In [None]:
from rdkit.Chem import rdFingerprintGenerator
# Create fingerprints for all molecules
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
# fingerprints = [rdkit_gen.GetFingerprint(mol) for mol, compound_key in compounds]
fingerprints = [rdkit_gen.GetFingerprint(mol) for mol, idx in compounds]

# How many compounds/fingerprints do we have?
print("Number of compounds converted:", len(fingerprints))
print("Fingerprint length per compound:", len(fingerprints[0]))
fingerprints

Number of compounds converted: 91817
Fingerprint length per compound: 2048


[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f6de07030>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52ebfb70>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52ebfdf0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52faeda0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52fae530>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52fae800>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52fae120>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f52632210>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f6c796a80>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f51ed3b20>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f51ed3da0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f51ed3f30>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f51ed39e0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1f51ed3ee0>,
 <rdkit.DataStructs.cDataStructs.E

In [None]:
#Define clustering 
def cluster_fingerprints(fingerprints, cutoff=0.2):
    """Cluster fingerprints
    Parameters:
        fingerprints
        cutoff: threshold for the clustering
    """
    # Calculate Tanimoto distance matrix
    distance_matrix = tanimoto_distance_matrix(fingerprints)
    # Now cluster the data with the implemented Butina algorithm:
    clusters = Butina.ClusterData(distance_matrix, len(fingerprints), cutoff, isDistData=True)
    clusters = sorted(clusters, key=len, reverse=True)
    return clusters

In [None]:
 #Now split
# dfah
print(len(fingerprints))
set1 = fingerprints[:15000] 
set2 = fingerprints[15001:30000] 
set3 = fingerprints[30001:45000] 
set4 = fingerprints[45001:60000]
set5 = fingerprints[60001:75000]  
set6 = fingerprints[75001:85000] 
set7 = fingerprints[85001:] 

91817


In [None]:
##################################SET 1
# Run the clustering procedure for the dataset
clusters1 = cluster_fingerprints(set1, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters1 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters1 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters1 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters1 if len(c) > 100)

print("total # clusters: ", len(clusters1))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  5312
# clusters with only 1 compound:  2760
# clusters with >5 compounds:  536
# clusters with >25 compounds:  44
# clusters with >100 compounds:  0


In [None]:
##################################SET 2
# Run the clustering procedure for the dataset
clusters2 = cluster_fingerprints(set2, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters2 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters2 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters2 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters2 if len(c) > 100)

print("total # clusters: ", len(clusters2))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  5298
# clusters with only 1 compound:  2825
# clusters with >5 compounds:  544
# clusters with >25 compounds:  47
# clusters with >100 compounds:  0


In [None]:
##################################SET 3
# Run the clustering procedure for the dataset
clusters3 = cluster_fingerprints(set3, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters3 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters3 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters3 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters3 if len(c) > 100)

print("total # clusters: ", len(clusters3))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  5299
# clusters with only 1 compound:  2761
# clusters with >5 compounds:  548
# clusters with >25 compounds:  36
# clusters with >100 compounds:  0


In [None]:
##################################SET 4
# Run the clustering procedure for the dataset
clusters4 = cluster_fingerprints(set4, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters4 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters4 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters4 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters4 if len(c) > 100)

print("total # clusters: ", len(clusters4))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  5243
# clusters with only 1 compound:  2765
# clusters with >5 compounds:  534
# clusters with >25 compounds:  47
# clusters with >100 compounds:  0


In [None]:
##################################SET 5
# Run the clustering procedure for the dataset
clusters5 = cluster_fingerprints(set5, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters5 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters5 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters5 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters5 if len(c) > 100)

print("total # clusters: ", len(clusters5))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  4266
# clusters with only 1 compound:  2462
# clusters with >5 compounds:  456
# clusters with >25 compounds:  70
# clusters with >100 compounds:  14


In [None]:
##################################SET 6
# Run the clustering procedure for the dataset
clusters6 = cluster_fingerprints(set6, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters6 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters6 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters6 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters6 if len(c) > 100)

print("total # clusters: ", len(clusters6))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  2036
# clusters with only 1 compound:  952
# clusters with >5 compounds:  328
# clusters with >25 compounds:  66
# clusters with >100 compounds:  9


In [None]:
##################################SET 7
# Run the clustering procedure for the dataset
clusters7 = cluster_fingerprints(set7, cutoff=0.4)

In [None]:
# Give a short report about the numbers of clusters and their sizes
num_clust1_g1 = sum(1 for c in clusters7 if len(c) == 1)
num_clust1_g5 = sum(1 for c in clusters7 if len(c) > 5)
num_clust1_g25 = sum(1 for c in clusters7 if len(c) > 25)
num_clust1_g100 = sum(1 for c in clusters7 if len(c) > 100)

print("total # clusters: ", len(clusters7))
print("# clusters with only 1 compound: ", num_clust1_g1)
print("# clusters with >5 compounds: ", num_clust1_g5)
print("# clusters with >25 compounds: ", num_clust1_g25)
print("# clusters with >100 compounds: ", num_clust1_g100)

total # clusters:  1689
# clusters with only 1 compound:  587
# clusters with >5 compounds:  230
# clusters with >25 compounds:  30
# clusters with >100 compounds:  3


In [None]:
# Get the cluster center of each cluster (first molecule in each cluster)
cluster_centers1 = [compounds[c[0]] for c in clusters1]
cluster_centers2 = [compounds[c[0]] for c in clusters2]
cluster_centers3 = [compounds[c[0]] for c in clusters3]
cluster_centers4 = [compounds[c[0]] for c in clusters4]
cluster_centers5 = [compounds[c[0]] for c in clusters5]
cluster_centers6 = [compounds[c[0]] for c in clusters6]
cluster_centers7 = [compounds[c[0]] for c in clusters7]
p =len(clusters1)+len(clusters2)+len(clusters3)+len(clusters4)+len(clusters5)+len(clusters6)+len(clusters7)
# How many cluster centers/clusters do we have?
print("Number of cluster centers 1:", len(cluster_centers1))
print("Number of cluster centers 2:", len(cluster_centers2))
print("Number of cluster centers 3:", len(cluster_centers3))
print("Number of cluster centers 4:", len(cluster_centers4))
print("Number of cluster centers 5:", len(cluster_centers5))
print("Number of cluster centers 6:", len(cluster_centers6))
print("Number of cluster centers 7:", len(cluster_centers7))
print("Number of cluster centers total:", p)


Number of cluster centers 1: 5312
Number of cluster centers 2: 5298
Number of cluster centers 3: 5299
Number of cluster centers 4: 5243
Number of cluster centers 5: 4266
Number of cluster centers 6: 2036
Number of cluster centers 7: 1689
Number of cluster centers total: 29143


In [None]:
# Recompute fingerprints for 10 first clusters
mol_fps_per_cluster = []
for cluster in clusters1:
    mol_fps_per_cluster.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])


[[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e73f946c0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e73f936c0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63210>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63580>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63c10>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63df0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a638a0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63ad0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63ee0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63f30>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a63030>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a636c0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a630d0>,
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f1e74a631c0>,
  <rdkit.DataStructs

In [None]:
print("Number of mol in mol_fps_per_cluster",len(mol_fps_per_cluster))
print("Number of clusters in cluster_centers1:", len(cluster_centers1))

Number of mol in mol_fps_per_cluster 5312
Number of clusters in cluster_centers1: 5312


In [None]:
mol = []
for cluster in clusters1:
    mol.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol",len(mol))
print("Number of clusters in cluster_centers1:", len(cluster_centers1)) 

mol2 = []
for cluster in clusters2:
    mol.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol2",len(mol2))
print("Number of clusters in cluster_centers2:", len(cluster_centers2)) 

mol3 = []
for cluster in clusters3:
    mol3.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol3",len(mol3))
print("Number of clusters in cluster_centers3:", len(cluster_centers3)) 

mol4 = []
for cluster in clusters4:
    mol4.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol4",len(mol4))
print("Number of clusters in cluster_centers4:", len(cluster_centers4)) 

mol5 = []
for cluster in clusters5:
    mol5.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol5",len(mol5))
print("Number of clusters in cluster_centers5:", len(cluster_centers5)) 

mol6 = []
for cluster in clusters6:
    mol6.append([rdkit_gen.GetFingerprint(compounds[i][0]) for i in cluster])

print("Number of mol in mol6",len(mol6))
print("Number of clusters in cluster_centers6:", len(cluster_centers6)) 

mol7 = []
for cluster in clusters7:
    mol7.append([compounds[i][0]) for i in cluster])

print("Number of mol in mol7",len(mol7))
print("Number of clusters in cluster_centers7:", len(cluster_centers7)) 

SyntaxError: ignored

In [None]:
mol = []
for cluster in clusters1:
    mol.append([compounds[i][0] for i in cluster])

print("Number of mol in mol",len(mol))
print("Number of clusters in cluster_centers1:", len(cluster_centers1)) 

mol2 = []
for cluster in clusters2:
    mol2.append([compounds[i][0] for i in cluster])

print("Number of mol in mol2",len(mol2))
print("Number of clusters in cluster_centers2:", len(cluster_centers2)) 

mol3 = []
for cluster in clusters3:
    mol3.append([compounds[i][0] for i in cluster])

print("Number of mol in mol3",len(mol3))
print("Number of clusters in cluster_centers3:", len(cluster_centers3)) 

mol4 = []
for cluster in clusters4:
    mol4.append([compounds[i][0] for i in cluster])

print("Number of mol in mol4",len(mol4))
print("Number of clusters in cluster_centers4:", len(cluster_centers4)) 

mol5 = []
for cluster in clusters5:
    mol5.append([compounds[i][0] for i in cluster])

print("Number of mol in mol5",len(mol5))
print("Number of clusters in cluster_centers5:", len(cluster_centers5)) 

mol6 = []
for cluster in clusters6:
    mol6.append([compounds[i][0] for i in cluster])

print("Number of mol in mol6",len(mol6))
print("Number of clusters in cluster_centers6:", len(cluster_centers6)) 

mol7 = []
for cluster in clusters7:
    mol7.append([compounds[i][0] for i in cluster])

print("Number of mol in mol7",len(mol7))
print("Number of clusters in cluster_centers7:", len(cluster_centers7)) 




Number of mol in mol 5312
Number of clusters in cluster_centers1: 5312
Number of mol in mol2 5298
Number of clusters in cluster_centers2: 5298
Number of mol in mol3 5299
Number of clusters in cluster_centers3: 5299
Number of mol in mol4 5243
Number of clusters in cluster_centers4: 5243
Number of mol in mol5 4266
Number of clusters in cluster_centers5: 4266
Number of mol in mol6 2036
Number of clusters in cluster_centers6: 2036
Number of mol in mol7 1689
Number of clusters in cluster_centers7: 1689


In [None]:
selected_mols= []
selected_mols.append(mol)
selected_mols.append(mol2)
selected_mols.append(mol3)
selected_mols.append(mol4)
selected_mols.append(mol5)
selected_mols.append(mol6)
selected_mols.append(mol7)

len(selected_mols)

7

In [None]:
ml= []

for mols in selected_mols:
  for mol in mols:
    ml.append(mol)

print(len(ml))

29143


In [None]:
smi = []
mol= []
for cluster in clusters3:
    mol =  [compounds[i][0] for i in cluster]
    smi = [*smi, mol] 
    #smi.append(Chem.MolToSmiles(mol))

print("Number of mol in mol3",len(smi))
print("Number of clusters in cluster_centers3:", len(cluster_centers3)) 
smi

Number of mol in mol3 5299
Number of clusters in cluster_centers3: 5299


[[<rdkit.Chem.rdchem.Mol at 0x7f1f52ca3f30>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52ee9580>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52e7c3f0>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52e6c800>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52e46e40>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52e26670>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52e43620>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52cb5350>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52ca1e90>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52d129e0>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52bf0cb0>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52beed50>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52aa6580>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52905990>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52906f80>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52906080>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52819940>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f52604030>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f5258d120>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f5258fb70>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f521e8cb0>,
  <rdkit.Chem.rdchem.Mol at 0x7f1f521e8e90>,
  <rdkit.C

In [None]:
# output list
output = []
  
# function used for removing nested 
# lists in python. 
def reemovNestings(l):
    for i in l:
        if type(i) == list:
            reemovNestings(i)
        else:
            output.append(i)
  
#test
reemovNestings(ml)


In [None]:
len(output)

91811

In [None]:
from operator import itemgetter

flist = list(map(itemgetter(0), ml))
print(flist)
print(len(flist))

[<rdkit.Chem.rdchem.Mol object at 0x7f1f5214f080>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52f37120>, <rdkit.Chem.rdchem.Mol object at 0x7f1f521172b0>, <rdkit.Chem.rdchem.Mol object at 0x7f1f520af5d0>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52c25170>, <rdkit.Chem.rdchem.Mol object at 0x7f1f527536c0>, <rdkit.Chem.rdchem.Mol object at 0x7f1f520b0670>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52b49c10>, <rdkit.Chem.rdchem.Mol object at 0x7f1f4f5bf580>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52766bc0>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52a3e210>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52c35530>, <rdkit.Chem.rdchem.Mol object at 0x7f1f529078a0>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52b67d50>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52c0a300>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52901c10>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52155b70>, <rdkit.Chem.rdchem.Mol object at 0x7f1f52150120>, <rdkit.Chem.rdchem.Mol object at 0x7f1f528ffe90>, <rdkit.Chem.rdchem.Mol object at 0x7f1f5212e3a0>,

In [None]:
smi= []
for mol in flist:
  smi.append(Chem.MolToSmiles(mol))

print(smi)
print(len(smi))


['Cc1cc(C)cc(C(=O)N2CCCC3(CCCC(=O)N3C)C2)c1', 'CC(C)c1nc(C2CCCN2CCN2CCCCC2)no1', 'CC(=O)N1CCO[C@@H]2CN(C(=O)c3ccccc3)CC[C@@H]2C1', 'O=C(c1ccccn1)N1C[C@@H]2CC[C@H](C1)CN2C(=O)CCN1CCCC1', 'c1ccc(CN2CCc3ncnc(N4CCCC4)c3CC2)nc1', 'O=C(c1ccc[nH]1)N1CCn2cc(CN3CCCC3)nc2C1', 'Cc1ccc(C(=O)N2C[C@@H]3CCCO[C@@H]3[C@H](N(C)Cc3ccco3)C2)o1', 'CCN(CC)Cc1ccc(C(=O)N2CCCC2Cn2cccn2)cc1', 'CC(=O)N1CCCC2(C1)CN(C(=O)C1CCCCC1)CCO2', 'Cc1noc(C)c1CN1CC2(CCCCN2C)CC1=O', 'c1cnc(N2CCOC3(CCN(CC4CCCCC4)CC3)C2)nc1', 'Cc1nn(C)c(C)c1C1CCCN1C(=O)c1ccc(C(N)=O)cn1', 'Cc1nn(C)c(N(C)C)c1CNCc1ccnc(N2CCCCC2)c1', 'O=C1N(Cc2ccc(F)c(F)c2)CCCC12CCN(C1CCCCC1)C2', 'O=C(NCC1CCOCC1)[C@@H]1CC[C@@H]2[C@@H](CCN2Cc2ccncc2)O1', 'CN1C(=O)C(c2ccccc2)CC12CCN(CCN1CCCC1=O)CC2', 'Cc1nc(CN(C)C(=O)C2CCC(=O)N(CCN(C)C)C2)cs1', 'CC(=O)N1CCC2(CC1)C(=O)N(CC1CCN(C)CC1)c1ccccc12', 'CCn1ncc2c1C(COC)CN(Cc1cnn(C)c1)C2', 'Cc1cccc(N2CCC3(CCCN(C(=O)c4cccnc4)C3)C2)n1', 'CCn1c(CN(C)C)nnc1C1CCN(Cc2ccccc2)CC1', 'CCCN(Cc1cccnc1)C(=O)C1CCCN(C)C1', 'COc1c(CNC2CCCc3cc

In [None]:
import pandas as pd

df= pd.DataFrame()
df["Smiles"]= smi
df

Unnamed: 0,Smiles
0,Cc1cc(C)cc(C(=O)N2CCCC3(CCCC(=O)N3C)C2)c1
1,CC(C)c1nc(C2CCCN2CCN2CCCCC2)no1
2,CC(=O)N1CCO[C@@H]2CN(C(=O)c3ccccc3)CC[C@@H]2C1
3,O=C(c1ccccn1)N1C[C@@H]2CC[C@H](C1)CN2C(=O)CCN1...
4,c1ccc(CN2CCc3ncnc(N4CCCC4)c3CC2)nc1
...,...
29138,O=C1CC[C@H]2[C@H](CCN2C(=O)C2CCOC2)N1Cc1ccncc1
29139,Cc1cc(Nc2ccc(F)cn2)cc(C2CCN(Cc3ncc[nH]3)C2)n1
29140,O=C(CCNc1ccccc1)NC1CCN(C(=O)C2CCCCC2)CC1
29141,CN(C)c1nc2c(c(N3CCC(O)CC3)n1)CN(CC1CCNC1)CC2


In [None]:
from google.colab import files

df.to_csv('Asx_nhtsUndr_Smiles.csv')
files.download('Asx_nhtsUndr_Smiles.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>