In [None]:
import umap
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from typing import List

from rdkit import RDLogger
from rdkit.Chem.rdchem import Mol
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Keep the notebook output quiet: ignore every Python warning and let
# RDKit's logger emit CRITICAL messages only.
warnings.filterwarnings("ignore")
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [None]:
# Colour lookup table: palette family name -> shade index (0-9) -> hex colour string.
# Within each family the hex values run from a pale tint at index 0 to the darkest tone at index 9.
# NOTE(review): the values look like the Material Design palette and some family keys are
# abbreviated ("dep purp", "lit blu", "lit grn", "dep ora", "blu gry") — confirm before relying on the names.
material = {
    "red":         {0: "#ffebee",1: "#ffcdd2",2: "#ef9a9a",3: "#e57373",4: "#ef5350",5: "#f44336",6: "#e53935",7: "#d32f2f",8: "#c62828",9: "#b71c1c",},
    "pink":        {0: "#fce4ec",1: "#f8bbd0",2: "#f48fb1",3: "#f06292",4: "#ec407a",5: "#e91e63",6: "#d81b60",7: "#c2185b",8: "#ad1457",9: "#880e4f",},
    "purple":      {0: "#f3e5f5",1: "#e1bee7",2: "#ce93d8",3: "#ba68c8",4: "#ab47bc",5: "#9c27b0",6: "#8e24aa",7: "#7b1fa2",8: "#6a1b9a",9: "#4a148c",},
    "dep purp":    {0: "#ede7f6",1: "#d1c4e9",2: "#b39ddb",3: "#9575cd",4: "#7e57c2",5: "#673ab7",6: "#5e35b1",7: "#512da8",8: "#4527a0",9: "#311b92",},
    "indigo":      {0: "#e8eaf6",1: "#c5cae9",2: "#9fa8da",3: "#7986cb",4: "#5c6bc0",5: "#3f51b5",6: "#3949ab",7: "#303f9f",8: "#283593",9: "#1a237e",},
    "blue":        {0: "#e3f2fd",1: "#bbdefb",2: "#90caf9",3: "#64b5f6",4: "#42a5f5",5: "#2196f3",6: "#1e88e5",7: "#1976d2",8: "#1565c0",9: "#0d47a1",},
    "lit blu":     {0: "#e1f5fe",1: "#b3e5fc",2: "#81d4fa",3: "#4fc3f7",4: "#29b6f6",5: "#03a9f4",6: "#039be5",7: "#0288d1",8: "#0277bd",9: "#01579b",},
    "cyan":        {0: "#e0f7fa",1: "#b2ebf2",2: "#80deea",3: "#4dd0e1",4: "#26c6da",5: "#00bcd4",6: "#00acc1",7: "#0097a7",8: "#00838f",9: "#006064",},
    "teal":        {0: "#e0f2f1",1: "#b2dfdb",2: "#80cbc4",3: "#4db6ac",4: "#26a69a",5: "#009688",6: "#00897b",7: "#00796b",8: "#00695c",9: "#004d40",},
    "green":       {0: "#e8f5e9",1: "#c8e6c9",2: "#a5d6a7",3: "#81c784",4: "#66bb6a",5: "#4caf50",6: "#43a047",7: "#388e3c",8: "#2e7d32",9: "#1b5e20",},
    "lit grn":     {0: "#f1f8e9",1: "#dcedc8",2: "#c5e1a5",3: "#aed581",4: "#9ccc65",5: "#8bc34a",6: "#7cb342",7: "#689f38",8: "#558b2f",9: "#33691e",},
    "lime":        {0: "#f9fbe7",1: "#f0f4c3",2: "#e6ee9c",3: "#dce775",4: "#d4e157",5: "#cddc39",6: "#c0ca33",7: "#afb42b",8: "#9e9d24",9: "#827717",},
    "yellow":      {0: "#fffde7",1: "#fff9c4",2: "#fff59d",3: "#fff176",4: "#ffee58",5: "#ffeb3b",6: "#fdd835",7: "#fbc02d",8: "#f9a825",9: "#f57f17",},
    "amber":       {0: "#fff8e1",1: "#ffecb3",2: "#ffe082",3: "#ffd54f",4: "#ffca28",5: "#ffc107",6: "#ffb300",7: "#ffa000",8: "#ff8f00",9: "#ff6f00",},
    "orange":      {0: "#fff3e0",1: "#ffe0b2",2: "#ffcc80",3: "#ffb74d",4: "#ffa726",5: "#ff9800",6: "#fb8c00",7: "#f57c00",8: "#ef6c00",9: "#e65100",},
    "dep ora":     {0: "#fbe9e7",1: "#ffccbc",2: "#ffab91",3: "#ff8a65",4: "#ff7043",5: "#ff5722",6: "#f4511e",7: "#e64a19",8: "#d84315",9: "#bf360c",},
    "brown":       {0: "#efebe9",1: "#d7ccc8",2: "#bcaaa4",3: "#a1887f",4: "#8d6e63",5: "#795548",6: "#6d4c41",7: "#5d4037",8: "#4e342e",9: "#3e2723",},
    "grey":        {0: "#fafafa",1: "#f5f5f5",2: "#eeeeee",3: "#e0e0e0",4: "#bdbdbd",5: "#9e9e9e",6: "#757575",7: "#616161",8: "#424242",9: "#212121",},
    "blu gry":     {0: "#eceff1",1: "#cfd8dc",2: "#b0bec5",3: "#90a4ae",4: "#78909c",5: "#607d8b",6: "#546e7a",7: "#455a64",8: "#37474f",9: "#263238",},
    }

In [None]:
# Load the clustering table, expose the 'Common' column as 'TAG', keep only the
# columns used below and restrict the rows to the two tags being compared.
clustering_raw = pd.read_csv('data/clustering.csv', header=0)
df = (
    clustering_raw
    .rename(columns={'Common': 'TAG'})
    .loc[:, ['Name', 'SMILES', 'TAG']]
)
# Reset to a 0..n-1 index: later cells build fresh frames from NumPy arrays and
# copy columns over from `df`; with the CSV's original row labels those
# index-aligned assignments would yield NaN / mismatched rows.
df = df[df['TAG'].isin(['common_04', 'common_05'])].reset_index(drop=True)
display(df.head())
print(df.shape)

In [None]:
def fp_as_array(mol, n_bits=1024):
    """Return the radius-2 Morgan fingerprint of ``mol`` as a NumPy integer array.

    Args:
        mol: RDKit molecule to fingerprint.
        n_bits: Bit-vector length requested from RDKit (``nBits``).

    Returns:
        The integer array that ``DataStructs.ConvertToNumpyArray`` filled from
        the fingerprint bit vector.
    """
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    # Target buffer handed to RDKit, which writes the fingerprint bits into it.
    bits = np.zeros((1,), int)
    DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def fp_list_from_smiles_list(smiles_list, n_bits=2048):
    """Fingerprint every parsable SMILES string in ``smiles_list``.

    Args:
        smiles_list: Iterable of SMILES strings.
        n_bits: Bit-vector length forwarded to ``fp_as_array``.

    Returns:
        List of fingerprint arrays in input order. SMILES that RDKit cannot
        parse are reported on stdout and skipped, so the list may be shorter
        than the input.
    """
    fingerprints = []
    for smiles in smiles_list:
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            print(f"Invalid SMILES: {smiles}")
            continue
        fingerprints.append(fp_as_array(parsed, n_bits))
    return fingerprints

# Fingerprints for every SMILES of the filtered table (function default: 2048 bits).
fp_list = fp_list_from_smiles_list(df["SMILES"])

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> PCA (Principal Component Analysis) </h2>
</div>

In [None]:
# Project the fingerprints onto their first two principal components.
# Fingerprints are produced in row order, but unparsable SMILES are skipped;
# fail loudly if that happened so labels are never attached to the wrong rows.
if len(fp_list) != len(df):
    raise ValueError(
        f"{len(df) - len(fp_list)} SMILES could not be fingerprinted; "
        "drop those rows from df before building the PCA table."
    )

pca = PCA(n_components=2, random_state=42)  # seeded so reruns give the same projection
pca_data = pca.fit_transform(fp_list)

pca_df = pd.DataFrame(pca_data, columns=["PC_1", "PC_2"])
# Assign positionally (plain arrays): `df` is a filtered frame, so an
# index-aligned Series assignment would insert NaN / mismatched SMILES.
pca_df['TAG'] = df['TAG'].to_numpy()
pca_df['SMILES'] = df['SMILES'].to_numpy()

display(pca_df.head())
print(pca_df.shape)

In [None]:
def plot_pca_scatter(pca_df, unique_tags, colors, alphas):
    """Scatter the PCA projection with one colour per TAG value.

    Args:
        pca_df: DataFrame with ``PC_1``, ``PC_2`` and ``TAG`` columns.
        unique_tags: Tags to draw, in drawing/legend order.
        colors: Mapping tag -> matplotlib colour.
        alphas: Mapping tag -> marker opacity.

    Raises:
        KeyError: If a tag is missing from ``colors`` or ``alphas``.
    """
    # Explicit Axes: every layer is drawn on the same axes and the function
    # no longer fails on ``None`` when ``unique_tags`` is empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    for tag in unique_tags:
        # Boolean mask instead of a string-built query, which breaks for tags
        # that contain quote characters.
        tagged = pca_df[pca_df["TAG"] == tag]
        sns.scatterplot(data=tagged, x="PC_1", y="PC_2", color=colors[tag],
                        label=tag, alpha=alphas[tag], s=20, ax=ax)

    ax.set_xlabel("pca_1", fontsize=12)
    ax.set_ylabel("pca_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    ax.set_title("PCA Scatter Plot", fontsize=14, fontweight='bold', color='navy')
    # fig.savefig("figures/pca_scatter.png", bbox_inches='tight')
    plt.show()

# Style the first two tags (in order of first appearance) and draw the PCA scatter.
unique_tags = pca_df['TAG'].unique()
first_tag, second_tag = unique_tags[0], unique_tags[1]
colors = {first_tag: '#2e7d32', second_tag: '#c62828'}
alphas = {first_tag: 0.7, second_tag: 1}

plot_pca_scatter(pca_df, unique_tags, colors, alphas)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> PCA k-means Clustering </h2>
</div>

In [None]:
import numpy as np

# Elbow analysis on the PCA coordinates: within-cluster sum of squares (WCSS)
# for k = 1..20 together with its change from one k to the next.
# Select the coordinate columns explicitly; dropping TAG/SMILES would pull in
# the 'cluster' column once the k-means cell below has run, so re-running this
# cell would silently cluster on a different feature set.
kmeans_pca = pca_df[['PC_1', 'PC_2']]
# k-means cannot fit more clusters than there are samples.
k_values = range(1, min(20, len(kmeans_pca)) + 1)

wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(kmeans_pca)
    wcss.append(kmeans.inertia_)

# First difference of WCSS; k=1 has no predecessor, so its slope is set to 0.
slopes = [0] + list(np.diff(wcss))

fig, ax_wcss = plt.subplots(figsize=(10, 4))
ax_wcss.plot(k_values, wcss, marker='o', linestyle='-')
ax_wcss.set_xlabel('Number of Clusters (k)')
ax_wcss.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
ax_wcss.set_title('Elbow Method for Optimal k')
ax_wcss.grid(True, which='both', linestyle='--', linewidth=0.7, color='gray', alpha=0.5)
ax_wcss.set_xticks(list(k_values))

# Secondary y-axis for the slope curve.
ax_slope = ax_wcss.twinx()
ax_slope.plot(k_values, slopes, marker='o', linestyle='--', color='#e57373')
ax_slope.set_ylabel('Slope')
# fig.savefig("figures/pca_elbow_curve_with_slope.png", bbox_inches='tight')
plt.show()

In [None]:
# Partition the PCA plane into seven k-means clusters and store each row's label.
kmeans = KMeans(n_clusters=7, random_state=42)
pca_df['cluster'] = kmeans.fit(pca_df[['PC_1', 'PC_2']]).labels_
display(pca_df.head())
print(pca_df.columns)

In [None]:
def plot_pca_clusters(pca_df, colors, alphas):
    """Scatter the PCA projection coloured by k-means cluster.

    Args:
        pca_df: DataFrame with ``PC_1``, ``PC_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    plt.figure(figsize=(6, 6))
    ax = None
    labels = pca_df['cluster']
    # Clusters are drawn in order of first appearance in the frame.
    for cluster in labels.unique():
        ax = sns.scatterplot(
            data=pca_df[labels == cluster],
            x="PC_1",
            y="PC_2",
            color=colors[cluster],
            label=f"Cluster {cluster}",
            alpha=alphas[cluster],
            s=20,
            ax=ax,
        )

    ax.set_xlabel("PC_1", fontsize=12)
    ax.set_ylabel("PC_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    ax.set_title("PCA kmeans Clustering", fontsize=14, fontweight='bold', color='navy')
    # plt.savefig("figures/pca_kmeans_clustering.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-6, all drawn at the same opacity.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e', '#f57f17']
))
cluster_alphas = dict.fromkeys(range(7), 0.7)

plot_pca_clusters(pca_df, cluster_colors, cluster_alphas)

In [None]:
def plot_kde_clusters(pca_df, colors, alphas):
    """Scatter the PCA clusters with a filled 2-D KDE under each cluster.

    The figure is written to ``figures/pca_gussian_kde.png`` (file name kept
    as-is because the collage cell reads it) and then shown.

    Args:
        pca_df: DataFrame with ``PC_1``, ``PC_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    import os  # local import: only needed to create the output directory

    fig, ax = plt.subplots(figsize=(6, 6))
    for cluster in pca_df['cluster'].unique():
        members = pca_df[pca_df['cluster'] == cluster]
        sns.scatterplot(data=members, x="PC_1", y="PC_2", color=colors[cluster],
                        label=f"Cluster {cluster}", alpha=alphas[cluster], s=20, ax=ax)
        try:
            # fill/thresh replace the deprecated shade/shade_lowest keywords;
            # shade_lowest=False corresponds to seaborn's default thresh of 0.05.
            sns.kdeplot(data=members, x="PC_1", y="PC_2", cmap="Blues",
                        fill=True, thresh=0.05, alpha=0.3, ax=ax)
        except np.linalg.LinAlgError:
            # Same handling as the t-SNE KDE plot: skip clusters whose density
            # cannot be estimated instead of aborting the whole figure.
            print(f"Skipping KDE for cluster {cluster} due to singular covariance matrix.")

    ax.set_xlabel("PC_1", fontsize=12)
    ax.set_ylabel("PC_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=True)

    ax.set_title("PCA Gaussian KDE", fontsize=14, fontweight='bold', color='navy')
    os.makedirs("figures", exist_ok=True)  # savefig does not create missing directories
    fig.savefig("figures/pca_gussian_kde.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-6; markers are nearly opaque so they stay visible over the KDE fill.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e', '#f57f17']
))
cluster_alphas = dict.fromkeys(range(7), 0.9)

plot_kde_clusters(pca_df, cluster_colors, cluster_alphas)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> t-SNE (t-distributed Stochastic Neighbor Embedding) </h2>
</div>

In [None]:
# t-SNE embedding of the PCA-reduced fingerprints.
# NOTE(review): t-SNE is fed only the first two principal components, so it
# re-embeds a 2-D projection — confirm this is intended rather than a larger
# PCA pre-reduction.
if len(fp_list) != len(df):
    raise ValueError(
        f"{len(df) - len(fp_list)} SMILES could not be fingerprinted; "
        "drop those rows from df before building the t-SNE table."
    )

tsne_pca = PCA(n_components=2, random_state=42)
tsne_pca_result = tsne_pca.fit_transform(fp_list)

# Seeded: t-SNE is stochastic and the cluster count / colours chosen in the
# following cells only make sense for a reproducible embedding.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(tsne_pca_result)
tsne_df = pd.DataFrame(tsne_result, columns=["TSNE_1", "TSNE_2"])

# Assign positionally (plain arrays): `df` is a filtered frame, so an
# index-aligned Series assignment would insert NaN / mismatched metadata.
tsne_df['Name'] = df['Name'].to_numpy()
tsne_df['TAG'] = df['TAG'].to_numpy()
tsne_df['SMILES'] = df['SMILES'].to_numpy()

display(tsne_df.head())
print(tsne_df.shape)

In [None]:
def plot_tsne_scatter(tsne_df, unique_tags, colors, alphas):
    """Scatter the t-SNE embedding with one colour per TAG value.

    Args:
        tsne_df: DataFrame with ``TSNE_1``, ``TSNE_2`` and ``TAG`` columns.
        unique_tags: Tags to draw, in drawing/legend order.
        colors: Mapping tag -> matplotlib colour.
        alphas: Mapping tag -> marker opacity.

    Raises:
        KeyError: If a tag is missing from ``colors`` or ``alphas``.
    """
    # Explicit Axes: every layer is drawn on the same axes and the function
    # no longer fails on ``None`` when ``unique_tags`` is empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    for tag in unique_tags:
        # Boolean mask instead of a string-built query, which breaks for tags
        # that contain quote characters.
        tagged = tsne_df[tsne_df["TAG"] == tag]
        sns.scatterplot(data=tagged, x="TSNE_1", y="TSNE_2", color=colors[tag],
                        label=tag, alpha=alphas[tag], s=20, ax=ax)

    ax.set_xlabel("tsne_1", fontsize=12)
    ax.set_ylabel("tsne_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    ax.set_title("t-SNE Scatter Plot", fontsize=14, fontweight='bold', color='navy')
    # fig.savefig("figures/tsne_scatter.png", bbox_inches='tight')
    plt.show()

# Style the first two tags (in order of first appearance) and draw the t-SNE scatter.
unique_tags = tsne_df['TAG'].unique()
first_tag, second_tag = unique_tags[0], unique_tags[1]
colors = {first_tag: '#2e7d32', second_tag: '#c62828'}
alphas = {first_tag: 0.7, second_tag: 1}

plot_tsne_scatter(tsne_df, unique_tags, colors, alphas)

In [None]:
# Elbow analysis on the t-SNE coordinates: WCSS for k = 1..20 plus its
# change between consecutive k values on a secondary axis.
kmeans_tsne = tsne_df.drop(columns=['TAG', 'SMILES', 'Name'])
k_range = range(1, 21)

wcss = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(kmeans_tsne)
    wcss.append(kmeans.inertia_)

# Difference between each WCSS value and the previous one; k=1 gets 0.
slopes = [0] + [current - previous for previous, current in zip(wcss, wcss[1:])]

plt.figure(figsize=(10, 4))
plt.plot(k_range, wcss, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.xticks(k_range)
plt.grid(True, which='both', linestyle='--', linewidth='0.7', color='gray', alpha=0.5)

# Overlay the slope curve on a twin y-axis.
plt.twinx()
plt.plot(k_range, slopes, marker='o', linestyle='--', color='#e57373')
plt.ylabel('Slope')
# plt.savefig("figures/tsne_elbow_curve_with_slope.png", bbox_inches='tight')
plt.show()

In [None]:
# Cluster a copy of the t-SNE table into seven k-means clusters and attach,
# to every row, the centroid of the cluster it was assigned to.
tsne_clusters = tsne_df.copy()

kmeans = KMeans(n_clusters=7, random_state=42)
tsne_clusters['cluster'] = kmeans.fit_predict(tsne_clusters[['TSNE_1', 'TSNE_2']])

cluster_centroids = kmeans.cluster_centers_
# cluster id -> centroid coordinates (one row of cluster_centers_ per id)
cluster_centroid_mapping = dict(enumerate(cluster_centroids))
tsne_clusters['Cluster Centroid'] = tsne_clusters['cluster'].map(cluster_centroid_mapping)

def euclidean_distance(x1, y1, x2, y2):
    """Return the straight-line distance between points (x1, y1) and (x2, y2)."""
    dx = x1 - x2
    dy = y1 - y2
    return np.sqrt(dx ** 2 + dy ** 2)

# Rank the molecules of every cluster by their distance to the cluster
# centroid, then keep an evenly spaced subset of those ranks per cluster.
tsne_clusters['distance from centroid'] = tsne_clusters.apply(
    lambda row: euclidean_distance(
        row['TSNE_1'], row['TSNE_2'],
        row['Cluster Centroid'][0], row['Cluster Centroid'][1],
    ),
    axis=1,
)

tsne_clusters = tsne_clusters.sort_values(by=['cluster', 'distance from centroid'])

# Rank 1 is the molecule closest to its centroid; labels look like 'cluster_3_rank_12'.
tsne_clusters['molecules_in_cluster_rankwise'] = tsne_clusters.groupby('cluster').cumcount() + 1
tsne_clusters['molecules_in_cluster_rankwise'] = (
    'cluster_' + tsne_clusters['cluster'].astype(str)
    + '_rank_' + tsne_clusters['molecules_in_cluster_rankwise'].astype(str)
)

cluster_counts = tsne_clusters['cluster'].value_counts()
ranks_to_keep = []
# Iterate over the clusters actually present instead of a hard-coded range(7).
for cluster_id, cluster_size in cluster_counts.items():
    # Step of size//7 keeps roughly seven ranks per cluster; clamp it to 1 so
    # clusters with fewer than seven members do not raise "range() arg 3 must
    # not be zero".
    step = max(1, int(cluster_size) // 7)
    ranks_to_keep.extend(
        f'cluster_{int(cluster_id)}_rank_{j}' for j in range(1, int(cluster_size), step)
    )

tsne_clusters = tsne_clusters[tsne_clusters['molecules_in_cluster_rankwise'].isin(ranks_to_keep)]
# NOTE(review): the last remaining row is dropped unconditionally; the reason
# is not visible in this notebook — confirm it is intended.
tsne_clusters = tsne_clusters.drop(tsne_clusters.index[-1])

display(tsne_clusters.head())
print(tsne_clusters.shape)

In [None]:
def plot_tsne_clusters(tsne_df, colors, alphas):
    """Scatter the t-SNE embedding coloured by k-means cluster.

    Args:
        tsne_df: DataFrame with ``TSNE_1``, ``TSNE_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    plt.figure(figsize=(6, 6))
    ax = None
    labels = tsne_df['cluster']
    # Clusters are drawn in order of first appearance in the frame.
    for cluster in labels.unique():
        ax = sns.scatterplot(
            data=tsne_df[labels == cluster],
            x="TSNE_1",
            y="TSNE_2",
            color=colors[cluster],
            label=f"Cluster {cluster}",
            alpha=alphas[cluster],
            s=20,
            ax=ax,
        )

    # Pad the data range by 5 units on every side.
    xs, ys = tsne_df["TSNE_1"], tsne_df["TSNE_2"]
    ax.set_xlim(xs.min() - 5, xs.max() + 5)
    ax.set_ylim(ys.min() - 5, ys.max() + 5)

    ax.set_xlabel("TSNE_1", fontsize=12)
    ax.set_ylabel("TSNE_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    # Thin black frame around the plot area.
    for side in ax.spines.values():
        side.set_edgecolor('black')
        side.set_linewidth(1)

    ax.set_title("t-SNE kmeans Clustering", fontsize=14, fontweight='bold', color='navy')
    # plt.savefig("figures/tsne_kmeans_clustering.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-8, all drawn at the same opacity.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e', '#f57f17', '#e65100', '#3e2723']
))
cluster_alphas = dict.fromkeys(range(9), 0.7)

plot_tsne_clusters(tsne_clusters, cluster_colors, cluster_alphas)

In [None]:
def plot_tsne_kde_clusters(tsne_clusters, colors, alphas):
    """Scatter the t-SNE clusters with a filled 2-D KDE under each cluster.

    The figure is written to ``figures/tsne_gussian_kde.png`` (file name kept
    as-is because the collage cell reads it) and then shown.

    Args:
        tsne_clusters: DataFrame with ``TSNE_1``, ``TSNE_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    import os  # local import: only needed to create the output directory

    fig, ax = plt.subplots(figsize=(6, 6))
    for cluster in tsne_clusters['cluster'].unique():
        members = tsne_clusters[tsne_clusters['cluster'] == cluster]
        sns.scatterplot(data=members, x="TSNE_1", y="TSNE_2", color=colors[cluster],
                        label=f"Cluster {cluster}", alpha=alphas[cluster], s=20, ax=ax)
        try:
            # fill/thresh replace the deprecated shade/shade_lowest keywords;
            # shade_lowest=False corresponds to seaborn's default thresh of 0.05.
            sns.kdeplot(data=members, x="TSNE_1", y="TSNE_2", cmap="Blues",
                        fill=True, thresh=0.05, alpha=0.3, ax=ax)
        except np.linalg.LinAlgError:
            print(f"Skipping KDE for cluster {cluster} due to singular covariance matrix.")

    ax.set_xlabel("TSNE_1", fontsize=12)
    ax.set_ylabel("TSNE_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=True)

    ax.set_title("TSNE Gaussian KMeans KDE", fontsize=14, fontweight='bold', color='navy')
    os.makedirs("figures", exist_ok=True)  # savefig does not create missing directories
    fig.savefig("figures/tsne_gussian_kde.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-8, all drawn at the same opacity.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e', '#f57f17', '#e65100', '#3e2723']
))
cluster_alphas = dict.fromkeys(range(9), 0.7)

plot_tsne_kde_clusters(tsne_clusters, cluster_colors, cluster_alphas)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> UMAP (Uniform Manifold Approximation and Projection) </h2>
</div>

In [None]:
def get_largest_fragment_from_smiles(s: str):
    """Return the SMILES of the fragment picked by ``LargestFragmentChooser``.

    Args:
        s: Input SMILES string.

    Returns:
        SMILES of the chosen fragment, or ``None`` when RDKit cannot parse ``s``.
    """
    mol = Chem.MolFromSmiles(s)
    if not mol:
        return None
    largest = LargestFragmentChooser().choose(mol)
    return Chem.MolToSmiles(largest)

def compute_ecfp_descriptors(smiles_list: List[str]):
    """Compute fingerprints for all SMILES that can be fingerprinted.

    Args:
        smiles_list: SMILES strings to process.

    Returns:
        Tuple ``(descriptors, keep_idx)``: the fingerprints stacked row-wise
        with ``np.vstack`` and the positions in ``smiles_list`` that produced
        them (entries whose fingerprint is ``None`` are left out).
    """
    computed = (
        (position, _compute_single_ecfp_descriptor(smiles))
        for position, smiles in enumerate(smiles_list)
    )
    kept = [(position, ecfp) for position, ecfp in computed if ecfp is not None]
    keep_idx = [position for position, _ in kept]
    descriptors = [ecfp for _, ecfp in kept]

    return np.vstack(descriptors), keep_idx

def _compute_single_ecfp_descriptor(smiles: str):
    """Return the radius-2, 2048-bit Morgan fingerprint of ``smiles`` as an array.

    Returns ``None`` when RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None
    fingerprint = Chem.AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    return np.array(fingerprint)

In [None]:
# UMAP embedding of the fingerprints of each molecule's largest fragment.
# The frame keeps df's row labels so TAG can be joined back by index below.
umap_df = df["SMILES"].apply(get_largest_fragment_from_smiles).dropna().to_frame("SMILES")
ecfp_descriptors, keep_idx = compute_ecfp_descriptors(umap_df["SMILES"])
# keep_idx holds positions; copy so the column assignments below write to an
# independent frame rather than a slice of the previous one.
umap_df = umap_df.iloc[keep_idx].copy()

# Seeded: UMAP is stochastic and the cluster count / colours chosen in the
# following cells only make sense for a reproducible embedding.
umap_model = umap.UMAP(metric="euclidean", n_neighbors=25, n_components=2,
                       low_memory=False, min_dist=0.001, random_state=42)
X_umap = umap_model.fit_transform(ecfp_descriptors)

umap_df["UMAP_1"], umap_df["UMAP_2"] = X_umap[:, 0], X_umap[:, 1]
# Index-aligned on purpose: umap_df rows carry the same labels as df.
umap_df['TAG'] = df['TAG']
display(umap_df.head())
print(umap_df.shape)

In [None]:
def plot_umap_scatter(umap_df, unique_tags, colors, alphas):
    """Scatter the UMAP embedding with one colour per TAG value.

    Args:
        umap_df: DataFrame with ``UMAP_1``, ``UMAP_2`` and ``TAG`` columns.
        unique_tags: Tags to draw, in drawing/legend order.
        colors: Mapping tag -> matplotlib colour.
        alphas: Mapping tag -> marker opacity.

    Raises:
        KeyError: If a tag is missing from ``colors`` or ``alphas``.
    """
    # Explicit Axes: every layer is drawn on the same axes and the function
    # no longer fails on ``None`` when ``unique_tags`` is empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    for tag in unique_tags:
        # Boolean mask instead of a string-built query, which breaks for tags
        # that contain quote characters.
        tagged = umap_df[umap_df["TAG"] == tag]
        sns.scatterplot(data=tagged, x="UMAP_1", y="UMAP_2", color=colors[tag],
                        label=tag, alpha=alphas[tag], s=20, ax=ax)

    ax.set_xlabel("umap_1", fontsize=12)
    ax.set_ylabel("umap_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    ax.set_title("UMAP Scatter Plot", fontsize=14, fontweight='bold', color='navy')
    # fig.savefig("figures/umap_scatter.png", bbox_inches='tight')
    plt.show()

# Style the first two tags (in order of first appearance) and draw the UMAP scatter.
unique_tags = umap_df['TAG'].unique()
first_tag, second_tag = unique_tags[0], unique_tags[1]
colors = {first_tag: '#2e7d32', second_tag: '#c62828'}
alphas = {first_tag: 0.7, second_tag: 1}

plot_umap_scatter(umap_df, unique_tags, colors, alphas)

In [None]:
# Elbow analysis on the UMAP coordinates: within-cluster sum of squares (WCSS)
# for k = 1..20 together with its change from one k to the next.
# Select the coordinate columns explicitly; dropping TAG/SMILES would pull in
# the 'cluster' column once the k-means cell below has run, so re-running this
# cell would silently cluster on a different feature set.
kmeans_umap = umap_df[['UMAP_1', 'UMAP_2']]
# k-means cannot fit more clusters than there are samples.
k_values = range(1, min(20, len(kmeans_umap)) + 1)

wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(kmeans_umap)
    wcss.append(kmeans.inertia_)

# First difference of WCSS; k=1 has no predecessor, so its slope is set to 0.
slopes = [0] + list(np.diff(wcss))

fig, ax_wcss = plt.subplots(figsize=(10, 4))
ax_wcss.plot(k_values, wcss, marker='o', linestyle='-')
ax_wcss.set_xlabel('Number of Clusters (k)')
ax_wcss.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
ax_wcss.set_title('Elbow Method for Optimal k')
ax_wcss.grid(True, which='both', linestyle='--', linewidth=0.7, color='gray', alpha=0.5)
ax_wcss.set_xticks(list(k_values))

# Secondary y-axis for the slope curve.
ax_slope = ax_wcss.twinx()
ax_slope.plot(k_values, slopes, marker='o', linestyle='--', color='#e57373')
ax_slope.set_ylabel('Slope')
# fig.savefig("figures/umap_elbow_curve_with_slope.png", bbox_inches='tight')
plt.show()

In [None]:
# Partition the UMAP plane into seven k-means clusters and store each row's label.
kmeans = KMeans(n_clusters=7, random_state=42)
umap_df['cluster'] = kmeans.fit(umap_df[['UMAP_1', 'UMAP_2']]).labels_
display(umap_df.head())
print(umap_df.columns)

In [None]:
def plot_umap_clusters(umap_df, colors, alphas):
    """Scatter the UMAP embedding coloured by k-means cluster.

    Args:
        umap_df: DataFrame with ``UMAP_1``, ``UMAP_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    plt.figure(figsize=(6, 6))
    ax = None
    labels = umap_df['cluster']
    # Clusters are drawn in order of first appearance in the frame.
    for cluster in labels.unique():
        ax = sns.scatterplot(
            data=umap_df[labels == cluster],
            x="UMAP_1",
            y="UMAP_2",
            color=colors[cluster],
            label=f"Cluster {cluster}",
            alpha=alphas[cluster],
            s=20,
            ax=ax,
        )

    ax.set_xlabel("UMAP_1", fontsize=12)
    ax.set_ylabel("UMAP_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=False)

    ax.set_title("UMAP kmeans Clustering", fontsize=14, fontweight='bold', color='navy')
    # plt.savefig("figures/umap_kmeans_clustering.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-10, all drawn at the same opacity.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e',
     '#f57f17', '#e65100', '#3e2723', '#263238', '#9e9d24']
))
cluster_alphas = dict.fromkeys(range(11), 0.7)

plot_umap_clusters(umap_df, cluster_colors, cluster_alphas)

In [None]:
def plot_umap_kde_clusters(umap_df, colors, alphas):
    """Scatter the UMAP clusters with a filled 2-D KDE under each cluster.

    The figure is written to ``figures/umap_gussian_kde.png`` (file name kept
    as-is because the collage cell reads it) and then shown.

    Args:
        umap_df: DataFrame with ``UMAP_1``, ``UMAP_2`` and ``cluster`` columns.
        colors: Mapping cluster id -> matplotlib colour.
        alphas: Mapping cluster id -> marker opacity.
    """
    import os  # local import: only needed to create the output directory

    fig, ax = plt.subplots(figsize=(6, 6))
    for cluster in umap_df['cluster'].unique():
        members = umap_df[umap_df['cluster'] == cluster]
        sns.scatterplot(data=members, x="UMAP_1", y="UMAP_2", color=colors[cluster],
                        label=f"Cluster {cluster}", alpha=alphas[cluster], s=20, ax=ax)
        try:
            # fill/thresh replace the deprecated shade/shade_lowest keywords;
            # shade_lowest=False corresponds to seaborn's default thresh of 0.05.
            sns.kdeplot(data=members, x="UMAP_1", y="UMAP_2", cmap="Blues",
                        fill=True, thresh=0.05, alpha=0.3, ax=ax)
        except np.linalg.LinAlgError:
            # Same handling as the t-SNE KDE plot: skip clusters whose density
            # cannot be estimated instead of aborting the whole figure.
            print(f"Skipping KDE for cluster {cluster} due to singular covariance matrix.")

    ax.set_xlabel("UMAP_1", fontsize=12)
    ax.set_ylabel("UMAP_2", fontsize=12)
    ax.tick_params(axis='both', which='both', labelsize=10)
    ax.legend(fontsize=10, loc='upper right', frameon=True)

    ax.set_title("UMAP Gaussian KDE", fontsize=14, fontweight='bold', color='navy')
    os.makedirs("figures", exist_ok=True)  # savefig does not create missing directories
    fig.savefig("figures/umap_gussian_kde.png", bbox_inches='tight')
    plt.show()

# One colour per cluster id 0-9, all drawn at the same opacity.
cluster_colors = dict(enumerate(
    ['#b71c1c', '#4a148c', '#1a237e', '#01579b', '#004d40', '#33691e',
     '#f57f17', '#e65100', '#3e2723', '#263238']
))
cluster_alphas = dict.fromkeys(range(10), 0.7)

plot_umap_kde_clusters(umap_df, cluster_colors, cluster_alphas)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Collage of All Images </h2>
</div>

In [None]:
from PIL import Image, ImageDraw

# Paste the three saved KDE figures side by side on a white canvas.
image_files = ['figures/pca_gussian_kde.png', 'figures/tsne_gussian_kde.png', 'figures/umap_gussian_kde.png']

# Load each panel fully into memory and release its file handle right away
# (Image.open alone keeps the file open).
images = []
for file in image_files:
    with Image.open(file) as opened:
        images.append(opened.copy())
widths, heights = zip(*(img.size for img in images))

# Width of the black separator between panels and of the outer margin; 0 = none.
border_size = 0
collage_width = sum(widths) + (len(images) - 1) * border_size + 2 * border_size
collage_height = max(heights)

collage = Image.new('RGB', (collage_width, collage_height), (255, 255, 255))
draw = ImageDraw.Draw(collage)

x_offset = border_size
for idx, img in enumerate(images):
    collage.paste(img, (x_offset, 0))
    # Rectangle corners are inclusive, so a zero-width separator would still
    # paint a 1-px black column (visible wherever the next panel is shorter
    # than the canvas). Draw only when a border is requested and stop one
    # pixel short on both axes.
    if border_size > 0 and idx < len(images) - 1:
        sep_left = x_offset + img.width
        draw.rectangle(
            [(sep_left, 0), (sep_left + border_size - 1, collage_height - 1)],
            fill=(0, 0, 0),
        )
    x_offset += img.width + border_size

collage.show()
collage.save("figures/top_molecules.png")