# Jupyter Notebook for the Data Analysis of Alkyne Semihydrogenation

This notebook is inspired by and based on the work and analyses presented in:

* S. K. Kariofillis, A. G. Doyle *et al.*, *J. Am. Chem. Soc.* **2022**, *144*, 1045-1055. (https://pubs.acs.org/doi/10.1021/jacs.1c12203)
* https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html (last accessed 10.08.22)
* talktorial from the Volkamer lab: https://projects.volkamerlab.org/teachopencadd/talktorials/T006_compound_maximum_common_substructures.html (last accessed 10.08.22)

In [None]:
#Import libraries

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

#working with arrays
import numpy as np
from numpy import percentile

#working with dataframes
import os,sys,shutil,glob,pickle
import pandas as pd
import xlrd

#working with molecules
from rdkit import Chem

#Matplotlib, seaborn and associated plotting modules
import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns


# A. Load Data and Delete NaN

In [None]:
#********************************************************************
#Workbook Sheet containing DFT calculated descriptors
#********************************************************************
# Internal alkynes: remove the row labelled 0 and every row containing a missing
# value, then use the "Alkyne Number" column as index.
df1 = pd.read_excel("DataSet_InternalAlkynes.xlsx",sheet_name="DataSheet")
df1.drop(0,axis=0, inplace=True)
df1.dropna(inplace=True)
df1.set_index("Alkyne Number", inplace=True)

# Terminal alkynes: same clean-up, plus removal of one hand-picked entry.
df2 = pd.read_excel("DataSet_TerminalAlkynes.xlsx",sheet_name="DataSheet")
df2.drop(0,axis=0, inplace=True)
df2.drop(684,axis=0,inplace=True) #borane weird valence structure
df2.dropna(inplace=True)
# Shift the terminal-alkyne numbers so that they follow the internal alkynes.
# NOTE(review): the offset is the number of internal alkynes left after dropna(),
# not the largest internal "Alkyne Number" - confirm the two ranges cannot overlap.
df2["Alkyne Number"] = df2["Alkyne Number"].astype(int) + len(df1)
df2.set_index("Alkyne Number", inplace=True)

# Stack both tables row-wise. DataFrame.append() was removed in pandas 2.0;
# pd.concat() with default arguments performs the same concatenation.
df = pd.concat([df1, df2])
df.drop(["NImag"],axis=1, inplace=True)


#drop different columns - SMILES and not averaged descriptors
Smiles = pd.DataFrame(df["SMILES"])
df.drop(["SMILES"],axis=1, inplace=True)
df.drop(columns=["x26","x27"], inplace = True)
df.drop(columns=["x30","x31"], inplace = True)
df.drop(columns=["x34","x35","x36","x37","x38","x39"], inplace = True)
df.drop(columns=["x46","x47","x48","x49","x50","x51"], inplace = True)
df.drop(columns=["x61","x62","x63","x64","x65","x66","x67","x68","x69","x70","x71","x72","x73","x74","x75","x76","x77","x78","x79","x80","x81"], inplace = True)

#to be checked for search for tested candidates
# Every SMILES is converted to an InChI string; the InChIs are what the later
# blocks compare against the substrate tables.
inchi = Smiles["SMILES"].map(Chem.MolFromSmiles).map(Chem.MolToInchi)
# NOTE(review): only `Smiles` loses the InChI duplicates here; `df` and `inchi`
# keep every row.
Smiles = Smiles[~inchi.duplicated()]#.set_index('SMILES')

#********************************************************************
#Workbook Sheet containing SMILES of tested substrates
#********************************************************************
# Read the tested substrates, discard the "Substrate" column and all incomplete
# rows, then replace each SMILES by its InChI string (the column name stays "SMILES").
subs = (
    pd.read_excel("DataSet_Substrates.xlsx", sheet_name="DataSheet")
    .drop(columns=["Substrate"])
    .dropna()
)
subs["SMILES"] = [Chem.MolToInchi(Chem.MolFromSmiles(smi)) for smi in subs["SMILES"]]
# second name for the same table; the matching step below uses this name
inchi_subs = subs

#********************************************************************
#Find candidates that were used as substrates
#********************************************************************
# InChIs of library alkynes that also occur in the tested-substrate table;
# the library index (alkyne number) is preserved.
Candidates=inchi.where(inchi.isin(inchi_subs["SMILES"])==True).dropna()
# row labels of the tested substrates that are present in the library
condition=inchi_subs.index[inchi_subs["SMILES"].isin(inchi)==True].tolist()
# NOTE(review): the mask below is indexed like `inchi` (alkyne numbers) whereas
# `inchi_subs` carries its own row labels, so the two are presumably not aligned.
# `condition` and `condition2` are not used again in the visible cells - confirm
# whether they are still needed.
condition2=inchi_subs.where(inchi.isin(inchi_subs["SMILES"])==True).dropna()
Candidates=pd.DataFrame(Candidates)
# keep the alkyne number as a regular column: merging on a column does not keep the index
Candidates["copy_index"] = Candidates.index
# attach the remaining substrate columns by matching the InChI strings stored in "SMILES"
Candidates = pd.merge(Candidates, inchi_subs, on=['SMILES'], how='inner')
# restore the alkyne number as index
Candidates.set_index("copy_index",inplace=True)

#********************************************************************
#Workbook Sheet containing SMILES of literature substrates
#********************************************************************
lit_subs = pd.read_excel("DataSet_LiteratureSubstrates.xlsx",sheet_name="DataSheet")
# use the values of the first data row as column names, then remove that row
lit_subs.rename(columns=lit_subs.iloc[0], inplace=True)
lit_subs.drop(0,axis=0, inplace=True)
# drop every column from position 7 onwards (the sliced frame is passed as the
# collection of column labels to remove)
lit_subs.drop(lit_subs.iloc[:,7:],axis=1, inplace=True)
lit_subs.drop(["Substrate"],axis=1, inplace=True)
lit_subs.dropna(inplace=True)
# replace each SMILES by its InChI string; the column name stays "SMILES"
lit_subs["SMILES"] = lit_subs["SMILES"].map(Chem.MolFromSmiles).map(Chem.MolToInchi)
# second name for the same table; the matching step below uses this name
lit_inchi_subs = lit_subs

#********************************************************************
#Find candidates that were used in literature
#********************************************************************
# InChIs of library alkynes that also occur in the literature table;
# the library index (alkyne number) is preserved.
lit_Candidates=inchi.where(inchi.isin(lit_inchi_subs["SMILES"])==True).dropna()
# NOTE(review): lit_condition / lit_condition2 are not used again in the visible
# cells; the mask for lit_condition2 is indexed like `inchi`, not like the
# literature table - confirm whether they are still needed.
lit_condition=lit_inchi_subs.index[lit_inchi_subs["SMILES"].isin(inchi)==True].tolist()
lit_condition2=lit_inchi_subs.where(inchi.isin(lit_inchi_subs["SMILES"])==True).dropna()
lit_Candidates=pd.DataFrame(lit_Candidates)
# keep the alkyne number as a regular column: merging on a column does not keep the index
lit_Candidates["copy_index"] = lit_Candidates.index
lit_Candidates = pd.merge(lit_Candidates, lit_inchi_subs, on=['SMILES'], how='inner')
lit_Candidates.set_index("copy_index",inplace=True)
lit_Candidates["Number of Occurences"]=lit_Candidates["Number of Occurences"].astype(float)
lit_Candidates["Average Yield"]=lit_Candidates["Average Yield"].astype(float)

#Only Candidates that occurred three times or more
# Boolean indexing removes the other rows; the previous .where() call kept them
# as all-NaN rows, so the table was never actually filtered.
lit_Candidates=lit_Candidates[lit_Candidates["Number of Occurences"]>2]

# B. Preprocessing of DFT Data 

In [None]:
#********************************************************************
#Standardize and eliminate collinear features
#********************************************************************
from sklearn.preprocessing import scale

# standardize
# NOTE: `df` is overwritten in place, so this cell is not idempotent - re-run the
# data-loading cell before running it a second time.
df=pd.DataFrame(scale(df),index=df.index, columns=df.columns)

# drop zero-variance features
# (Index.difference returns the remaining column labels in sorted order, so the
# columns of `df` are re-ordered here as well)
zero_std_cols = df.columns[df.std() == 0]
df=df[df.columns.difference(zero_std_cols)]

print (f"Dropping {len(zero_std_cols)} features {zero_std_cols}")

# drop highly correlated features
df_corr = df.corr().abs()
# keep only the strict upper triangle so every feature pair is inspected once;
# the builtin bool replaces the np.bool alias that was removed in NumPy 1.24
upper = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))
# a feature is dropped when it correlates above 0.95 with a feature that precedes it
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
df = df.drop(to_drop, axis=1)

print (f"Dropping {len(to_drop)} features {to_drop}")
print (f"Number of features left:  {len(df.columns)}")


In [None]:
#********************************************************************
#Pearson Correlation Matrix for Visualization
#********************************************************************
# NOTE: df_corr was computed before the highly correlated features were removed,
# so the heatmap still contains the features that were dropped afterwards.

# Generate a mask for the upper triangle
# (builtin bool replaces the np.bool alias that was removed in NumPy 1.24)
mask = np.zeros_like(df_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20, 15))
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(3)
ax.tick_params(length=12,width=3)
ax.tick_params(which='minor', length=6, width=3)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True, sep=100)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(df_corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, linewidths=.5)
# the colorbar belongs to the first collection drawn on the axes
cbar = ax.collections[0].colorbar
# set the colorbar tick label size to 15
cbar.ax.tick_params(labelsize=15)

fig.suptitle('Correlation matrix of features', fontsize=25)


fig.tight_layout()
#fig.savefig('Correlation_Matrix.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')

# C. Clustering with PCA & UMAP

In [None]:
#********************************************************************
#Create PCA and UMAP embeddings from features
#********************************************************************

from sklearn.decomposition import PCA
from umap import UMAP

# define the dimensionalities of the reduced representation to study
dims = [len(df.columns), 30, 20, 10, 5, 2]

# dictionary to store data at different levels of dimensionality reduction
# keys: "umap<dim>" and "pc<dim>"
dfs={}

# UMAP section
# neighbourhood size = square root of the number of feature columns, truncated;
# builtin int replaces the np.int alias that was removed in NumPy 1.24
# NOTE(review): df.shape[1] counts features, not samples - confirm this is intended.
n_neighbors = int(np.sqrt(df.shape[1]))
for dim in dims:
    key = f"umap{dim}"
    dfs[key] = pd.DataFrame(UMAP(n_components=dim, n_neighbors=n_neighbors, random_state=42).fit_transform(df),
                            index=df.index)
# PCA section
# fit all components once and slice off the leading `dim` columns per entry
pc = pd.DataFrame(PCA(n_components=None).fit_transform(df), index=df.index)
for dim in dims:
    key = f"pc{dim}"
    dfs[key] = pc.iloc[:, :dim]

In [None]:
#********************************************************************
#Visualization of the embeddings
#********************************************************************

fig, axs = plt.subplots(1, 2, figsize=(12, 6)) 
# name the columns of the two-dimensional embeddings; later cells select them by these names
dfs['pc2'].columns = ['PC1', 'PC2']
dfs['umap2'].columns = ['UMAP1', 'UMAP2']

#-----------------------------------------------------------------------------------------------------#
axs[0].scatter(x='PC1', y='PC2', data=dfs['pc2'], s=20, alpha=0.7, linewidth=0.25,
                  edgecolor='face')
for axis in ['top','bottom','left','right']:
    axs[0].spines[axis].set_linewidth(3)
axs[0].tick_params(length=12,width=3)
axs[0].set_ylim([-12,12])
axs[0].set_xlim([-10,30])
axs[0].xaxis.set_minor_locator(plt.MaxNLocator(4))
axs[0].xaxis.set_major_locator(plt.MaxNLocator(4))
axs[0].yaxis.set_major_locator(plt.MaxNLocator(4))
axs[0].set_yticks([-9,-3,3,9], minor=True)
axs[0].set_xticks([-5,5,15,25], minor=True)
axs[0].tick_params(which='minor', length=6, width=3,labelsize = 15)
axs[0].set_title("PC Projection", fontsize=18)
axs[0].set_xlabel('PC1', fontsize = 15)
# the y axis shows the second principal component (it was labelled 'PC1' before)
axs[0].set_ylabel('PC2', fontsize = 15)
#-----------------------------------------------------------------------------------------------------#
axs[1].scatter(x='UMAP1', y='UMAP2', data=dfs['umap2'], s=20, alpha=0.7, linewidth=0.25,
                  edgecolor='face')
for axis in ['top','bottom','left','right']:
    axs[1].spines[axis].set_linewidth(3)
axs[1].tick_params(length=12,width=3)
axs[1].set_ylim([-12.5,22.5])
axs[1].set_xlim([-10,25])
# major ticks are the default of set_xticks/set_yticks; the former major=True
# keyword is not part of the API and raises on recent matplotlib releases
axs[1].set_xticks([-10,-3,4,11,18,25])
axs[1].set_xticks([-6.5,0.5,7.5,14.5,21.5], minor=True)
axs[1].set_yticks([-12.5,-5.5,1.5,8.5,15.5,22.5])
axs[1].set_yticks([-9,-2,5,12,19], minor=True)
axs[1].tick_params(which='minor', length=6, width=3,labelsize = 15)
axs[1].set_title("UMAP Projection", fontsize=18)
axs[1].set_xlabel('UMAP1', fontsize = 15)
axs[1].set_ylabel('UMAP2', fontsize = 15)
#-----------------------------------------------------------------------------------------------------#

plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
#fig.savefig('PCAvsUMAP.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')

## Rationalize number of optimal clusters and reduced representations

Initial silhouette scores are computed for every reduced representation and every candidate number of clusters. The silhouette score should be as high as possible, and small fluctuations of the score between the different reduced representations help to identify a suitable number of UMAP dimensions.

In [None]:
#********************************************************************
#Silhouette Score Analysis for PCA and UMAP
#********************************************************************

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata
from sklearn.metrics import silhouette_score

#-----------------------------------------------------------------------------------------------------#
def silhouette_scores_hierarchical(data, n_cls_list):
    """Silhouette score of Ward hierarchical clusterings of ``data``.

    The Ward linkage is computed once and then cut into each requested number
    of clusters ('maxclust' criterion).

    Parameters
    ----------
    data : observations passed to ``linkage`` and ``silhouette_score``
    n_cls_list : iterable of cluster counts to evaluate

    Returns
    -------
    pandas.Series of float scores, indexed by the entries of ``n_cls_list``
    """
    ward_tree = linkage(data, method='ward')
    scores = [
        silhouette_score(data, fcluster(ward_tree, n_cls, criterion='maxclust'))
        for n_cls in n_cls_list
    ]
    return pd.Series(scores, index=n_cls_list, dtype=float)
#-----------------------------------------------------------------------------------------------------#

# Define the number of clusters to study
N_CLS_list = list(range(5, 21))

# populate silhouette scores for all number of clusters and all dimensionality reductions that are pre-calculated
silh_scores = pd.DataFrame(index=N_CLS_list)
for key, value in dfs.items():
    silh_scores[key] = silhouette_scores_hierarchical(value, N_CLS_list)

# plot the silhouette scores: PCA-based representations on the left panel,
# UMAP-based representations on the right panel.
# An explicit column mask replaces groupby(..., axis=1), which is deprecated in
# recent pandas releases; panel order and column order are unchanged.
is_umap = silh_scores.columns.str.startswith('umap')

fig, axes = plt.subplots(1,2, figsize=(12,6))

for (umap_panel, ax) in zip([False, True], axes.flatten()):
    silh_scores.loc[:, is_umap == umap_panel].plot(ax=ax)

    for axis in ['top','bottom','left','right']:
        ax.spines[axis].set_linewidth(3)
    ax.tick_params(length=12,width=3,labelsize = 15)
    ax.set_xlabel('Number of Clusters', fontsize = 15)
    ax.set_ylabel('Average Silhouette Score', fontsize = 15)
    ax.legend(loc=1,fontsize=12)
    
plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
#fig.savefig('SilhouetteScore.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')


Things to look for in the silhouette analysis:
    
    a) above average score for all clusters
    b) uniform silhouette thickness
    c) no wide fluctuations in size

This part is based on a scikit-learn Jupyter notebook (https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html)

In [None]:
#********************************************************************
#Silhouette Analysis for UMAP with 5 embeddings
#********************************************************************

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import AgglomerativeClustering
import matplotlib.cm as cm

# clustering is done on the 5-dimensional UMAP embedding; the 2-dimensional
# embedding is only used to draw the points
X=dfs["umap5"]
for n_clusters in range(5,20):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
    
    #-----------------------------------------------------------------------------------------------------#
    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from [-1,1], we opt for [-0.1, 1] for depiction purposes
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters)*10 leaves room for the gaps that separate the silhouette plots of individual clusters.
    ax1.set_ylim([0, len(X) + (n_clusters) * 10])

    # Ward-linkage agglomerative clustering with the default Euclidean distance.
    # The distance keyword is left at its default because its name changed
    # between scikit-learn releases (affinity -> metric).
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.3 * size_cluster_i, str(i), fontsize=12)

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    for axis in ['top','bottom','left','right']:
        ax1.spines[axis].set_linewidth(3)
    ax1.tick_params(length=12,width=3, labelsize=15)
    #ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("Silhouette Coefficient Values", fontsize=15)
    ax1.set_ylabel("Cluster label", fontsize=15)

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    #-----------------------------------------------------------------------------------------------------#
    
    #-----------------------------------------------------------------------------------------------------#
    # 2nd Plot showing the actual clusters formed
    # (named point_colors so the imported matplotlib.colors alias `colors` is not overwritten)
    point_colors = cm.nipy_spectral((cluster_labels.astype(float)) / n_clusters)
    ax2.scatter(dfs['umap2']["UMAP1"], dfs['umap2']["UMAP2"], marker='.', s=30, lw=0, alpha=0.7,
                c=point_colors, edgecolor='k')

    for axis in ['top','bottom','left','right']:
        ax2.spines[axis].set_linewidth(3)
    ax2.tick_params(length=12,width=3, labelsize=15)

    ax2.set_xlabel("UMAP1", fontsize=15)
    ax2.set_ylabel("UMAP2", fontsize=15)
    # major ticks are the default; the former major=True keyword is not part of
    # the API and raises on recent matplotlib releases
    ax2.set_xticks([-10,-3,4,11,18,25])
    ax2.set_xticks([-6.5,0.5,7.5,14.5,21.5], minor=True)
    ax2.set_yticks([-12.5,-5.5,1.5,8.5,15.5,22.5])
    ax2.set_yticks([-9,-2,5,12,19], minor=True)
    ax2.tick_params(which='minor', length=6, width=3)
    #-----------------------------------------------------------------------------------------------------#
    
    plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
    plt.show()
    
    #save the plot with optimal number of clusters after checking
    #if i==5: #number to be changed here => (number of clusters - 1)
    #    fig.savefig('SilhouetteAnalysis_6clusters.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')

In [None]:
#********************************************************************
#Generation of Dendrogram with Optimal Hyperparameters
#********************************************************************

# number of clusters the hierarchy is cut into
NCLS = 6

# Ward linkage on the 5-dimensional UMAP embedding and the resulting flat labels
z = linkage(dfs['umap5'], method="ward")
cls = fcluster(z, NCLS, criterion='maxclust')

# dendrogram truncated to the last NCLS merged groups
fig, ax = plt.subplots(figsize=(6, 6))
dendrogram_options = dict(
    truncate_mode='lastp',
    p=NCLS,
    show_contracted=True,
    leaf_rotation=90,
    color_threshold=0,
)
_ = dendrogram(z, **dendrogram_options)

# thick frame and ticks
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_linewidth(3)
ax.tick_params(length=12, width=3, labelsize=15)
ax.tick_params(which='minor', length=6, width=3)
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=3)

#fig.savefig('Dendrogram.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')

In [None]:
#********************************************************************
#Plotting of Tested Substrates on Clustered Chemical Space Map
#********************************************************************

#Definition of color palette
color_palette = ["#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7", "#F0E442"]
color_palette_rgb = [matplotlib.colors.to_rgb(i) for i in color_palette]

#Plotting of clustered chemical space (p1) with star-shaped tested entries (p2), surrounded by circles (p3)
fig, ax = plt.subplots(figsize=(6, 6))
# points are coloured by the cluster labels in `cls`, mapped onto a colormap
# interpolated from the palette above
p1 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'], s=20, alpha=0.7, linewidth=0.25, edgecolor='face',
                cmap=matplotlib.colors.LinearSegmentedColormap.from_list("bla",color_palette_rgb), c=cls)
# tested substrates are looked up by alkyne number (index of Candidates)
p2 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'].loc[Candidates.index],  s=70, alpha=0.7, linewidth=0.25, edgecolor='black', c="black", marker="*")
p3 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'].loc[Candidates.index],  s=80, alpha=0.7, linewidth=1, edgecolor='black', facecolor="none")


for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(3)
ax.tick_params(length=12,width=3, labelsize=15)

ax.set_xlabel("UMAP1", fontsize=15)
ax.set_ylabel("UMAP2", fontsize=15)
# major ticks are the default; the former major=True keyword is not part of the
# API and raises on recent matplotlib releases
ax.set_xticks([-10,-3,4,11,18,25])
ax.set_xticks([-6.5,0.5,7.5,14.5,21.5], minor=True)
ax.set_yticks([-12.5,-5.5,1.5,8.5,15.5,22.5])
ax.set_yticks([-9,-2,5,12,19], minor=True)
ax.tick_params(which='minor', length=6, width=3)

plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
plt.show()

#fig.savefig('Substrates_on_clusters.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')





In [None]:
#********************************************************************
#Plotting of Semihydrogenation Rates on Alkyne Chemical Space Map
#********************************************************************

#Plotting of clustered chemical space (p1) with harsher hydrogenation candidates (5 bar, 80 °C, 16 h) (p2)
# and mild hydrogenation candidates (1 bar, 80 °C, 10 h) (p3)
fig, ax = plt.subplots(figsize=(6, 6))
p1 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'], s=20, alpha=0.7, linewidth=0.25, edgecolor='face',
                 c="lightgrey")

# marker area scales with the rate relative to the largest rate of all tested
# substrates (both conditions), plus a constant so that slow substrates stay visible
max_rate = Candidates["Rates"].max()
is_condition_B = Candidates["Condition"]=="B"
is_condition_A = Candidates["Condition"]=="A"

p2 = plt.scatter(x="UMAP1", y="UMAP2",
                 data=dfs['umap2'].loc[Candidates.index.where(is_condition_B).dropna().astype(int)],
                 s=Candidates["Rates"].where(is_condition_B).dropna()/max_rate*100+25,
                 alpha=.7, linewidth=0.25, edgecolor='face', color="darkred")
p3 = plt.scatter(x="UMAP1", y="UMAP2",
                 data=dfs['umap2'].loc[Candidates.index.where(is_condition_A).dropna().astype(int)],
                 s=Candidates["Rates"].where(is_condition_A).dropna()/max_rate*100+25,
                 alpha=.7, linewidth=0.25, edgecolor='face', color="#69d")


for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(3)
ax.tick_params(length=12,width=3, labelsize=15)

ax.set_xlabel("UMAP1", fontsize=15)
ax.set_ylabel("UMAP2", fontsize=15)
# major ticks are the default; the former major=True keyword is not part of the
# API and raises on recent matplotlib releases
ax.set_xticks([-10,-3,4,11,18,25])
ax.set_xticks([-6.5,0.5,7.5,14.5,21.5], minor=True)
ax.set_yticks([-12.5,-5.5,1.5,8.5,15.5,22.5])
ax.set_yticks([-9,-2,5,12,19], minor=True)
ax.tick_params(which='minor', length=6, width=3)

plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
plt.show()

#fig.savefig('Susbstrate_Rates_on_Space.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')


In [None]:
#********************************************************************
#Plotting of Literature Reported Semihydrogenation Yields 
#of Substrates Occurring more than 2 times
#********************************************************************
fig, ax = plt.subplots(figsize=(7, 6))
p1 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'], s=20, alpha=0.7, linewidth=0.25, edgecolor='face',
                 c="lightgrey")

# marker area: number of literature occurrences relative to the most frequent
# substrate; marker colour: average reported yield
p2 = plt.scatter(x="UMAP1", y="UMAP2", data=dfs['umap2'].loc[lit_Candidates.index], s=lit_Candidates["Number of Occurences"]/lit_Candidates["Number of Occurences"].max()*200, alpha=.7, linewidth=0.25, edgecolor='face', c=lit_Candidates["Average Yield"], cmap=sns.dark_palette("#69d", reverse=True, as_cmap=True))

cbar = plt.colorbar(p2)
# fix the colour scale to the range 0-100
plt.clim(0,100)
cbar.ax.tick_params(labelsize=12) 


for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(3)
ax.tick_params(length=12,width=3, labelsize=15)

ax.set_xlabel("UMAP1", fontsize=15)
ax.set_ylabel("UMAP2", fontsize=15)
# major ticks are the default; the former major=True keyword is not part of the
# API and raises on recent matplotlib releases
ax.set_xticks([-10,-3,4,11,18,25])
ax.set_xticks([-6.5,0.5,7.5,14.5,21.5], minor=True)
ax.set_yticks([-12.5,-5.5,1.5,8.5,15.5,22.5])
ax.set_yticks([-9,-2,5,12,19], minor=True)
ax.tick_params(which='minor', length=6, width=3)

plt.tight_layout(h_pad=0.5, w_pad=5, pad=3)
plt.show()

#fig.savefig('Literature_on_clusters.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')

# D. Selection of Molecules from Clusters

#### Central Molecule of Each Cluster

In [None]:
#********************************************************************
#Selection of Central Molecules in Each Cluster
#********************************************************************
from scipy.spatial.distance import cdist
from rdkit.Chem import Draw

# build rdkit molecules for all candidates
# The molecules are re-indexed onto df.index.
# NOTE(review): `Smiles` was de-duplicated by InChI while `df` was not, so alkynes
# removed as duplicates presumably end up as NaN in the 'mol' column - confirm.
mols = pd.Series(Smiles["SMILES"].map(Chem.MolFromSmiles), index=df.index).to_frame('mol')
# embedding in which the distance to the cluster centre is measured
features='umap5'

# How many central molecules to display?
n_per_cluster = 5

# store the SMILES of the central candidates, one single-column frame per cluster
cands=[]

# `cls` holds one cluster label per row, in the row order of df
for group, data in mols.groupby(cls):
    # get descriptor data for this cluster
    desc_data=dfs[features].loc[data.index]
    
    # compute distances of these molecules to their center
    # (centre = column-wise mean of the cluster members)
    dists=pd.Series(cdist([desc_data.mean()], desc_data)[0],
                    index=desc_data.index)
    
    # select top n central molecules
    selected=dists.sort_values().head(n_per_cluster).index
    
    smi=mols.loc[selected]['mol'].map(Chem.MolToSmiles)
    # positional index so the per-cluster columns can be placed side by side
    smi=smi.reset_index(drop=True).to_frame(f"Cluster{group}")
    cands.append(smi)
    
    print (f"Cluster {group}, n molecules: {len(data)}")
    ms = data['mol'].loc[selected]
    display(Draw.MolsToGridImage(ms, molsPerRow=n_per_cluster))
    # second rendering with fixed tile size; only needed by the commented-out save below
    img=Draw.MolsToGridImage(ms,molsPerRow=n_per_cluster,subImgSize=(200,200),returnPNG=False)
    
    #Saving of every single molecule selection for cluster
    #img.save("Cluster_"+str(group)+'.tiff')   

# one column per cluster, one row per rank (most central first)
cands = pd.concat(cands, axis=1)



#### Maximum Common Substructure (MCS)

Further information about the applied algorithm can be found here: http://www.rdkit.org/docs/Cookbook.html#using-custom-mcs-atom-types

In [None]:
from collections import defaultdict
from pathlib import Path
from copy import deepcopy
import random

from ipywidgets import interact, fixed, widgets

from rdkit import Chem, Geometry
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools



In [None]:
#********************************************************************
#Generate Dataframe Containing SMILES 
#********************************************************************
# Stack internal and terminal alkynes again. DataFrame.append() was removed in
# pandas 2.0; pd.concat() with default arguments performs the same concatenation.
compound_df = pd.concat([df1, df2])
compound_df.drop(["NImag"],axis=1, inplace=True)
# keep only the first remaining column
# (presumably "SMILES", which the SDF export reads - TODO confirm)
compound_df.drop(compound_df.columns[1:],axis=1,inplace=True)
# expose the alkyne number (index) as a regular column
compound_df["ID"]=compound_df.index
print("Dataframe shape:", compound_df.shape)
compound_df.head()

In [None]:
#********************************************************************
#Create SDF file with all molecules and IDs
#********************************************************************

# Build the RDKit molecules in their own 'ROMol' column. Previously the molecules
# were written into the 'ID' column, which replaced the alkyne numbers, so no ID
# ever reached the SDF file.
# (original author note on this line: "pp = doesn't work for me")
PandasTools.AddMoleculeColumnToFrame(compound_df, smilesCol='SMILES', molCol='ROMol')
# idName='ID' stores each alkyne number as the molecule name of its SDF record;
# the name is what GetProp("_Name") returns after reading the file back.
PandasTools.WriteSDF(compound_df, 'SDF.sdf', molColName='ROMol', idName='ID', properties=['SMILES'])
sdf='SDF.sdf'
# read the file back as a plain list of molecules, in row order
supplier = Chem.ForwardSDMolSupplier(sdf)
mols1 = list(supplier)

print(f"Set with {len(mols1)} molecules loaded.")


In [None]:
#********************************************************************
#Apply FMCS algorithm and attribute MCS to every cluster
#********************************************************************

# one row per molecule read from the SDF file
mols_df=pd.DataFrame(mols1)

# group the molecules by the cluster labels in `cls`
# (requires mols1 to have exactly one entry per label, in the same order)
mols_df=mols_df.groupby(cls)
for group, data in mols_df:
    # NOTE: this rebinds the name `mols` to a plain list of molecules
    mols=data.iloc[:,0].values.tolist()
    print(f"Set with {len(mols)} molecules loaded.")
    mcs1 = rdFMCS.FindMCS(mols,threshold=0.50) #this value can be changed to assess for percentage of all molecules
    print(f"MCS1 contains {mcs1.numAtoms} atoms and {mcs1.numBonds} bonds.")
    print("MCS SMARTS string:", mcs1.smartsString)
    
    # Draw substructure from Smarts
    m1 = Chem.MolFromSmarts(mcs1.smartsString)
    display(Draw.MolToImage(m1, legend="Cluster "+str(group)))
# after the loop, `mols` and `mcs1` hold the values of the last cluster only

In [None]:
#********************************************************************
#Function to highlight substructure in selected molecules
#********************************************************************

def highlight_molecules(molecules, mcs, number, label=True, same_orientation=True, **kwargs):
    """Draw the first ``number`` molecules with the MCS atoms highlighted.

    Parameters
    ----------
    molecules : sequence of RDKit molecules
    mcs : MCS result object; only its ``smartsString`` attribute is read
    number : how many molecules (taken from the start of ``molecules``) to draw
    label : if True, use each molecule's "_Name" property as legend
        (an empty legend is used when the property is missing)
    same_orientation : if True, lay out the molecules so that the matched
        substructure is drawn in the same orientation
    **kwargs : forwarded to ``Draw.MolsToGridImage``

    Returns
    -------
    The grid image produced by ``Draw.MolsToGridImage`` (5 molecules per row).
    """
    # Copy only the molecules that are drawn: their 2D coordinates are
    # recomputed below and the caller's objects must stay untouched.
    molecules = deepcopy(list(molecules[:number]))
    # convert MCS to molecule
    pattern = Chem.MolFromSmarts(mcs.smartsString)
    # find the matching atoms in each molecule (empty tuple = no match)
    matching = [molecule.GetSubstructMatch(pattern) for molecule in molecules]

    legends = None
    if label is True:
        # one legend per drawn molecule
        legends = [x.GetProp("_Name") if x.HasProp("_Name") else "" for x in molecules]

    # Align by matched substructure so they are depicted in the same orientation
    # Adapted from: https://gist.github.com/greglandrum/82d9a86acb3b00d3bb1df502779a5810
    if same_orientation:
        # The first molecule that contains the MCS serves as layout template; with
        # an MCS threshold below 1 the very first molecule does not have to match.
        reference = next((i for i, match in enumerate(matching) if match), None)
        if reference is not None:
            ref_mol, ref_match = molecules[reference], matching[reference]
            AllChem.Compute2DCoords(ref_mol)
            conformer = ref_mol.GetConformer()
            coords2D = []
            for atom_idx in ref_match:
                position = conformer.GetAtomPosition(atom_idx)
                coords2D.append(Geometry.Point2D(position.x, position.y))
            for i, (mol, match) in enumerate(zip(molecules, matching)):
                if i == reference or not match:
                    continue
                # pin the matched atoms to the template coordinates
                coord_dict = {match[j]: coord for j, coord in enumerate(coords2D)}
                AllChem.Compute2DCoords(mol, coordMap=coord_dict)

    return Draw.MolsToGridImage(
        molecules,
        legends=legends,
        molsPerRow=5,
        highlightAtomLists=matching,
        subImgSize=(200, 200),
        **kwargs,
    )

In [None]:
# Show the first 20 molecules of `mols` with the `mcs1` substructure highlighted.
# NOTE: `mols` and `mcs1` are the values left over from the last iteration of the
# MCS loop in an earlier cell, i.e. only the last cluster is shown here.
highlight_molecules(mols, mcs1, 20)

# E. Diversity of the Clustering Workflow

In [None]:
#********************************************************************
#Helper Functions for Sampling
#********************************************************************

from scipy.spatial.distance import pdist, squareform
def sample_dist(data):
    """Summarise the pairwise distances between the rows of ``data``.

    Returns a list ``[smallest, mean, largest]`` of the condensed distance
    vector computed by ``pdist`` with its default metric.
    """
    pairwise = pdist(data)
    smallest = min(pairwise)
    largest = max(pairwise)
    return [smallest, np.mean(pairwise), largest]

def kenStone(X, k, metric='euclidean'):
    """Select ``k`` mutually distant rows of ``X`` (Kennard-Stone style max-min selection).

    The two rows that are furthest apart seed the selection; afterwards the row
    whose smallest distance to the already selected rows is largest is added
    until ``k`` rows are chosen.

    Parameters
    ----------
    X : 2-D array-like of observations (rows)
    k : int, number of rows to select, 2 <= k <= number of rows
    metric : distance metric name passed to ``pdist``

    Returns
    -------
    list of positional row indices, in order of selection

    Raises
    ------
    AssertionError if ``k`` is not an int or lies outside the allowed range.
    """
    # validate the requested sample size
    assert isinstance(k, int)
    assert k >= 2
    assert k <= X.shape[0]
    # full symmetric distance matrix
    dist_matrix = squareform(pdist(X, metric))
    # seed: the pair of rows with the largest distance
    first, second = np.unravel_index(np.argmax(dist_matrix), dist_matrix.shape)
    selected = [first, second]
    for _ in range(k - 2):
        # for every row: distance to its closest already-selected row
        nearest_selected = dist_matrix[selected, :].min(axis=0)
        selected.append(np.argmax(nearest_selected))
    return selected


In [None]:
# make a handful of umaps with various random seeds
# Every embedding gets its own fixed seed, so the members of the ensemble differ
# from each other while a re-run reproduces exactly the same ensemble. The seeds
# start at 1000 to stay clear of the seed 42 used for the original embeddings.
umaps = []
for i in range(50):
    umaps.append(pd.DataFrame(UMAP(n_components=5, n_neighbors=n_neighbors,
                                   random_state=1000 + i).fit_transform(df),
                          index=df.index))
    
# add the original umap that was used for clustering
umaps = [dfs['umap5']] + umaps

# selection of dataset to use
# NOTE(review): the following cells loop over all entries of `umaps` and rebind
# `dat`, so this single selection appears to be unused - confirm.
dat = umaps[27]
NCLS=6

In [None]:
#********************************************************************
#Clustering
#********************************************************************
# For every embedding: cluster it, pick the most central molecule of each cluster
# and record min / mean / max pairwise distance between the picked molecules.
# Loop-local names (embedding, ward_tree, labels, ...) are used on purpose so the
# notebook-level `z` and `cls` of the main clustering are not overwritten.

ret_cls = []
for embedding in umaps:
    # clustering
    ward_tree = linkage(embedding, method="ward")
    labels = fcluster(ward_tree, NCLS, criterion='maxclust')
    
    central_ids = []
    for _, members in embedding.groupby(labels):
        
        # compute distances of these molecules to their center
        center_dists=pd.Series(cdist([members.mean()], members)[0],
                               index=members.index)
        
        # keep the single most central molecule of the cluster
        central_ids.append(center_dists.sort_values().index[0])
        
    ret_cls.append(sample_dist(embedding.loc[central_ids]))
    
ret_cls = pd.DataFrame(ret_cls, columns=[ 'clustering min. dist', 'clustering avg. dist', 'clustering max. dist'])

In [None]:
#********************************************************************
#Kennard-Stone algorithm
#********************************************************************
# For every embedding: select NCLS rows with kenStone and record the
# min / mean / max pairwise distance between the selected rows.

ret_ks = pd.DataFrame(
    [sample_dist(embedding.iloc[kenStone(embedding, NCLS)]) for embedding in umaps],
    columns=[ 'ks min dist', 'ks avg dist', 'ks max dist'],
)

In [None]:
#********************************************************************
#Random Sampling
#********************************************************************
# For every embedding: draw 100 random samples of NCLS rows and record the
# min / mean / max pairwise distance within each sample.
# A single seeded generator is shared by all draws, so the samples differ from
# each other while a re-run of the cell reproduces the same baseline.
sampling_rng = np.random.RandomState(42)

ret_rnd = []
for dat in umaps:
    for i in range(100):
        ret_rnd.append(sample_dist(dat.sample(NCLS, random_state=sampling_rng)))
ret_rnd = pd.DataFrame(ret_rnd, columns=[ 'random min. dist', 'random avg. dist', 'random max. dist'])


In [None]:
#********************************************************************
#Plots
#********************************************************************
# One panel per statistic (min / avg / max distance): filled histogram for the
# clustering-based selection, dashed outline for random sampling.

fig, ax = plt.subplots(1,3, figsize=(18, 6))

# options shared by both histogram layers
shared_hist = dict(kind='hist', histtype='step', subplots=True, ax=ax,
                   bins=25, xlim=(-1, 20), density=True)

ret_cls.plot(facecolor='#008000', edgecolor='k', fill=True, color='black', **shared_hist)
ret_rnd.plot(color='#0000C0', linewidth=2, linestyle='--', **shared_hist)

# thick frame and ticks on every panel
for panel in ax:
    for side in ('top', 'bottom', 'left', 'right'):
        panel.spines[side].set_linewidth(3)
for panel in ax:
    panel.tick_params(length=12,width=3,labelsize=15)
for panel in ax:
    panel.tick_params(which='minor', length=6, width=3,labelsize=15)

plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=3)
for panel in ax:
    panel.set_ylabel('Density',fontsize=15)

# the legend is repositioned on the second and third panel only
for panel in ax[1:]:
    panel.legend(loc="upper left")

plt.show()
#fig.savefig('RandomvsClusteringSelection.tiff',dpi=600, pad_inches = .1, bbox_inches = 'tight')


