In [1]:
import os
import sys
from pathlib import Path
import numpy as np

import pandas as pd
import datamol as dm

In [2]:
def generate_plate_wells(plate_type='384'):
    import string
    
    """
    Return a list of well addresses in row-major order
    for a standard 96 or 384 well plate.
    """
    if plate_type == '96':
        rows = list(string.ascii_uppercase[:8])    # A–H
        cols = list(range(1,13))                   # 1–12
    elif plate_type == '384':
        rows = list(string.ascii_uppercase[:16])   # A–P
        cols = list(range(1,25))                   # 1–24
    else:
        raise ValueError("Unsupported plate type")
    return [f"{r}{c}" for r in rows for c in cols]

# Organizing WorkFlow

In [3]:
cherry_pick_path = Path("./CherryPick.csv")
cherry_pick_df = pd.read_csv(cherry_pick_path)

reag_prod_path = Path("./Reactions_Products.csv")
reag_prod_df = pd.read_csv(reag_prod_path)

In [4]:
def convert_to_smiles(inchi):
    try:
        mol = dm.from_inchi(inchi)
        return dm.to_smiles(mol, canonical=True, isomeric=True, kekulize=True)
    except:
        return np.nan

In [5]:
pattern = r'^(R[1-4]|P1)$'
matching_cols = list(reag_prod_df.filter(regex=pattern).columns)

In [6]:
for col in matching_cols:
    reag_prod_df[col] = reag_prod_df[col].apply(convert_to_smiles)

In [7]:
# Get unique values for each column
r1_unique = reag_prod_df["R1"].unique()
r1_unique = np.append(r1_unique, [
    "CC(C)C(F)(F)C1=C(N)C=NC=C1",
    "CC(C)CC1=C(N)C=NC=C1",
    "CCCCN",
    "NCC1=CC=CC=C1",
    "CC(N)CC1=CC2=C(C=CC=C2)C=C1"
])

r2_unique = reag_prod_df["R2"].unique()
r3_unique = reag_prod_df["R3"].unique()

r4_unique = reag_prod_df["R4"].unique()
r4_unique = np.append(r4_unique, [
    "CCC(C)([N+]#[C-])C(C)C",
    "CC1=CC=C(C=C1)[N+]#[C-]",
    "[C-]#[N+]C1=CC=CC=C1",
    "CC1=CC=CC=C1[N+]#[C-]",
    "CCC\C=C(/C)[N+]#[C-]"
])

# Find the maximum length
max_len = max(len(r1_unique), len(r2_unique), len(r3_unique), len(r4_unique))

# Pad each array with np.nan to match the maximum length
def pad(arr, length):
    return np.concatenate([arr, [np.nan] * (length - len(arr))])

reactants_df = pd.DataFrame({
    "R1": pad(r1_unique, max_len),
    "R2": pad(r2_unique, max_len),
    "R3": pad(r3_unique, max_len),
    "R4": pad(r4_unique, max_len),
})

# Real WorkFlow

In [8]:
from itertools import product

In [9]:
sys.path.append("/home/hitesit/Python_Packages/General_Utils")
from Clusterer_GPT import Hierarchical_Clustering

In [10]:
def run_reaction(r1, r2, r3, r4):
    rxn = dm.reactions.rxn_from_block_file("./Ugi_4CR_reaction.rxn")
    
    product_lst = dm.reactions.apply_reaction(rxn, (r1, r2, r3, r4), single_product_group=True)
    if not isinstance(product_lst, list):
        return None
    if len(product_lst) == 1:
        return product_lst[0]
    else:
        return None

In [11]:
r1_list = [r for r in reactants_df["R1"].unique() if pd.notna(r)]
r2_list = [r for r in reactants_df["R2"].unique() if pd.notna(r)]
r3_list = [r for r in reactants_df["R3"].unique() if pd.notna(r)]
r4_list = [r for r in reactants_df["R4"].unique() if pd.notna(r)]

In [12]:
rows = []
for a, b, c, d in product(r1_list, r2_list, r3_list, r4_list):
    a_mol = dm.to_mol(a)
    b_mol = dm.to_mol(b)
    c_mol = dm.to_mol(c)
    d_mol = dm.to_mol(d)
    
    prod = run_reaction(a_mol, b_mol, c_mol, d_mol)
    
    row = {
        "R1": a,
        "R2": b,
        "R3": c,
        "R4": d,
        "Product": dm.to_smiles(prod)
    }
    rows.append(row)

In [13]:
df = pd.DataFrame(rows)
df

Unnamed: 0,R1,R2,R3,R4,Product
0,NC1=CN=C(O)C=C1,C=O,C=CC(=O)O,[C-]#[N+]C(C)(C)C,C=CC(=O)N(CC(=O)NC(C)(C)C)c1ccc(O)nc1
1,NC1=CN=C(O)C=C1,C=O,C=CC(=O)O,[C-]#[N+]C1CCCCC1,C=CC(=O)N(CC(=O)NC1CCCCC1)c1ccc(O)nc1
2,NC1=CN=C(O)C=C1,C=O,C=CC(=O)O,CCC(C)([N+]#[C-])C(C)C,C=CC(=O)N(CC(=O)NC(C)(CC)C(C)C)c1ccc(O)nc1
3,NC1=CN=C(O)C=C1,C=O,C=CC(=O)O,CC1=CC=C(C=C1)[N+]#[C-],C=CC(=O)N(CC(=O)Nc1ccc(C)cc1)c1ccc(O)nc1
4,NC1=CN=C(O)C=C1,C=O,C=CC(=O)O,[C-]#[N+]C1=CC=CC=C1,C=CC(=O)N(CC(=O)Nc1ccccc1)c1ccc(O)nc1
...,...,...,...,...,...
499,CC(N)CC1=CC2=C(C=CC=C2)C=C1,O=CC1CC1,C=C(C)C(=O)O,CCC(C)([N+]#[C-])C(C)C,C=C(C)C(=O)N(C(C)Cc1ccc2ccccc2c1)C(C(=O)NC(C)(...
500,CC(N)CC1=CC2=C(C=CC=C2)C=C1,O=CC1CC1,C=C(C)C(=O)O,CC1=CC=C(C=C1)[N+]#[C-],C=C(C)C(=O)N(C(C)Cc1ccc2ccccc2c1)C(C(=O)Nc1ccc...
501,CC(N)CC1=CC2=C(C=CC=C2)C=C1,O=CC1CC1,C=C(C)C(=O)O,[C-]#[N+]C1=CC=CC=C1,C=C(C)C(=O)N(C(C)Cc1ccc2ccccc2c1)C(C(=O)Nc1ccc...
502,CC(N)CC1=CC2=C(C=CC=C2)C=C1,O=CC1CC1,C=C(C)C(=O)O,CC1=CC=CC=C1[N+]#[C-],C=C(C)C(=O)N(C(C)Cc1ccc2ccccc2c1)C(C(=O)Nc1ccc...


In [14]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

In [15]:
SOURCE_PLATE    = "Plate 1"                 # your source-plate barcode
DEST_PLATE      = "Final Plate 1"           # your destination-plate barcode
TRANSFER_NL     = 125                       # transfer volume in nL
OVERAGE_FACTOR  = 1.10                      # e.g. 10% dead‐volume buffer
NUM_WELLS       = 384                       # size of destination plate
ROWS            = list("ABCDEFGHIJKLMNOP")  # 16 rows
COLS            = list(range(1,25))         # 24 columns

In [16]:
unique_by_R = {
    comp: sorted(pd.unique(df[comp])) 
    for comp in ("R1","R2","R3","R4")
}

In [17]:
def make_wells(rows, cols):
    return [f"{r}{c}" for r in rows for c in cols]

In [18]:
COLS = list(range(1,25))

sector_cols = {
    "R1": COLS[0:6],    # columns 1–6 → 16×6 = 96 wells
    "R2": COLS[6:12],   # columns 7–12
    "R3": COLS[12:18],  # columns 13–18
    "R4": COLS[18:24],  # columns 19–24
}

In [19]:
NUM_WELLS

384

In [22]:
hc = Hierarchical_Clustering(df["Product"].tolist(), mode="diverse", feature_space="descriptor", num_clusters=384)
Z, reps = hc.hierarchical_clustering()

# _ = hc.plot_selected_similarity_matrix()
# hc.plot_dendrogram(Z)
# dm.to_image(reps)

[INFO] Using kmeans clustering
[INFO] Using KMeans for 504 samples with 5 features
[INFO] K-means clustering completed in 0.28 seconds


In [21]:
reagent_order = {}
for comp, reagents in unique_by_R.items():
    # build fingerprints
    mols = [Chem.MolFromSmiles(df[r]) for r in reagents]
    fps  = [AllChem.GetMorganFingerprintAsBitVect(m,2,1024) for m in mols]
    # pairwise Tanimoto distances
    pdist_vec = pdist([list(fp) for fp in fps], lambda x,y: 1 - DataStructs.TanimotoSimilarity(
        Chem.DataStructs.ExplicitBitVect(x), Chem.DataStructs.ExplicitBitVect(y)
    ))
    Z = linkage(pdist_vec, method="average")
    # dendrogram leaf‐order
    leaf_order = dendrogram(Z, no_plot=True)["leaves"]
    reagent_order[comp] = [reagents[i] for i in leaf_order]

KeyError: 'CC(C)C(F)(F)C1=C(N)C=NC=C1'

In [None]:
products_smiles = df["Product"].tolist()
products_mol = [dm.to_mol(smi) for smi in products_smiles]

In [None]:
hc = Hierarchical_Clustering(products_smiles, mode="similar", feature_space="fingerprint", num_clusters=5)
Z, reps = hc.hierarchical_clustering()
_ = hc.plot_selected_similarity_matrix()
hc.plot_dendrogram(Z)
dm.to_image(reps)

In [None]:
hc = Hierarchical_Clustering(products_smiles, mode="diverse", feature_space="fingerprint")
Z, reps = hc.hierarchical_clustering()
_ = hc.plot_selected_similarity_matrix()
hc.plot_dendrogram(Z)
dm.to_image(reps)