In [None]:
import pandas as pd

In [2]:
df_lit = pd.read_csv('literature_reactions.tsv',sep='\t')
# df_lit.head

lit_halides = df_lit['startingmat_1_smiles']
print(lit_halides.nunique())
unique_halides = lit_halides.unique()

4416


### Specifically selecting (hetero)aryl bromides that don't have a chloride or iodide which can competitively react, so that analysis of reaction yield will not require considering multiple products.

In [3]:
bromides = [f for f in unique_halides if f.count("Br") == 1 and "Cl" not in f and "I" not in f and "." not in f and "B(O)" not in f and "OB" not in f and "BO" not in f] #Making sure boronic acids are also not included
len(bromides)

1440

### As with Bromides, we want chlorides with one reaction site (and no boronic acid sites; these are common)

In [4]:
chlorides = [f for f in unique_halides if f.count("Cl")==1 and "Br" not in f and "I" not in f and "." not in f and "B(O)" not in f and "OB" not in f and "BO" not in f]
len(chlorides)

2379

### Just for clarity sake - here are the column names for the literature dataframe

In [5]:
df_lit.columns
pdts = df_lit['product_1_smiles']
# print(pdts) ## If you want to see the products
# print(chlorides[1]) ## If you want to see an example of a chloride
electrophiles = bromides + chlorides # We are only interested in bromides and chlorides for this analysis
print(len(electrophiles))

3819


In [6]:
lit_amines = df_lit['startingmat_2_smiles']
print(lit_amines.nunique())
unique_amines = lit_amines.unique()
pruned_amines = [f for f in unique_amines if f.count("Br") == 0 and "Cl" not in f and "I" not in f and "." not in f and "B(O)" not in f and "OB" not in f and "BO" not in f and "Sn" not in f]
print(f"This many amines are left after removing those with halides or boronic acid sites: {len(pruned_amines)}")

3044
This many amines are left after removing those with halides or boronic acid sites: 2737


### Now we're going to prune the list of amines down to only primary amine nucleophiles using RDKit atom attributes. 

In [7]:
from rdkit.Chem import rdmolfiles as rdm
### Here's some example code to check the valence of nitrogens in a molecule
# exam = pruned_amines[0]
# exmol = rdm.MolFromSmiles(exam)
# print(exam)
# nitrogens = [f for f in exmol.GetAtoms() if f.GetSymbol() == "N"]
# print(nitrogens[0].GetImplicitValence())
# del exam,exmol,nitrogens
###
implicit_val = []
nitrogen_indices = []
selected_amines = []
pruned_out = []
nuc_rxn_sites = []
for m in pruned_amines:
    mol = rdm.MolFromSmiles(m)
    nitrogens = [f for f in mol.GetAtoms() if f.GetSymbol() == "N"]
    # print(len(nitrogens),m)
    nit_valences = [str(f.GetImplicitValence()) for f in nitrogens]
    nit_idx = [str(f.GetIdx()) for f in nitrogens]
    print(nit_idx,m)
    if nit_valences.count("2") == 1: #Single nitrogen with a valence of 2 (primary), any number of secondary or tertiary nitrogens allowed
        implicit_val.append(".".join(nit_valences))
        nitrogen_indices.append(".".join(nit_idx))
        selected_amines.append(m)
        nuc_rxn_sites.append(nit_idx[nit_valences.index("2")])
    else:
        pruned_out.append(f"{m},{nit_idx},{'.'.join(nit_valences)}")

secondary_and_primary = []
for i,j in zip(selected_amines,implicit_val):
    if "1" in j:
        secondary_and_primary.append(i)

print(f"Number of amines with a nitrogen valence of 2 and of valence 1: {len(secondary_and_primary)}")
    
    

['2'] C[C@H](N)c1ccccc1
['0', '7'] NC(=O)c1cccnc1
['0', '3'] NCCN1CCCCC1
['0', '3'] NCCN1CCOCC1
['1', '5'] CN(C)CCN
['0', '9'] N[C@@H](c1ccccc1)[C@@H](Nc1ccccc1-c1ccccc1)c1ccccc1
['13', '22'] CC(C)(C)c1cc(-c2ccccc2N[C@@H](c2ccccc2)[C@@H](N)c2ccccc2)cc(C(C)(C)C)c1
['2'] CC(N)=O
['2'] CS(N)(=O)=O
['1', '4'] CN(C)S(N)(=O)=O
['6'] Cc1ccc(S(N)(=O)=O)cc1
['6'] CCOC(=O)C(N)=O
['0', '6'] Nc1ccccn1
['0', '2', '7'] Nc1ncccc1[N+](=O)[O-]
['6'] COc1ccc(N)cc1
['0'] Nc1ccc2ccccc2c1
['6'] CCc1ccc(N)cc1
['0'] Nc1ccccc1CO
['0'] Nc1ccccc1OCc1ccccc1
['0'] Nc1ccccc1F
['0'] Nc1ccc(CO)cc1
['0'] Nc1cccc(CO)c1
['0'] Nc1ccccc1CCO
['7'] COc1cccc(N)c1
['8'] COc1ccccc1N
['0'] Nc1ccccc1
['0', '7'] Nc1ccccc1N
['0'] Nc1ccc(CCO)cc1
['0', '7'] Nc1ccccc1Nc1ccccc1
['0', '5', '6'] Nc1cccnn1
['2', '5', '6'] Cc1nsc(N)n1
['6', '7'] Cc1cccc(N)n1
['0'] NC1CCCCC1
['0'] Nc1ccc(F)cc1
['3', '4'] C[C@H](CN)NC(=O)OC(C)(C)C
['7', '10'] CC(C)(C)OC(=O)N1[C@@H](CN)COC1(C)C
['5', '6'] CCC[C@H](CN)NC(=O)OC(C)(C)C
['7', '10'] CC(C)(C)OC(=

### We can inspect the results of this pruning.

In [8]:
pd.DataFrame(secondary_and_primary).to_csv("secondary_and_primary_amines.csv",index=False) #Saving this for inspection. Manual inspection shows that most of the secondary sites are amido or arylamine sites, which are not nucleophilic enough to be competitive with primary sites
print(len(pruned_out))
print(len(selected_amines))

74
2663


### Now we need to set up csv files for the somn package to calculate descriptors.

In [9]:
el_role = ["el" for f in electrophiles]
nu_role = ["nuc" for f in selected_amines]
selected_df = pd.DataFrame()
selected_df['smiles']=electrophiles+selected_amines
selected_df['type']=el_role+nu_role
# selected_df.reset_index(inplace=True)
# selected_df.head
rxn_site = ["-" for f in electrophiles]
rxn_site.extend(nuc_rxn_sites)
selected_df['sites'] = rxn_site
selected_df['name']= ["el"+str(f+1) for f in range(len(electrophiles))]+["nuc"+str(f+1) for f in range(len(selected_amines))]
print(selected_df[['name','smiles','type']])
selected_df[['name','smiles','type','sites']].to_csv("selected_commercial_mols.csv",header=True,index=False)
print(nuc_rxn_sites)

         name                                        smiles type
0         el1                                  COc1ccccc1Br   el
1         el2   CCOC(=O)c1cnc2cc(OC)c(Br)cc2c1Nc1ccc(F)cc1F   el
2         el3   CCOC(=O)c1cnc2cc(Br)c(OC)cc2c1Nc1ccc(F)cc1F   el
3         el4                           Cc1cc(C)c(Br)c(C)c1   el
4         el5  COc1c(-c2nnc(Cc3ccc(F)cc3)o2)nc(Br)c2cccnc12   el
...       ...                                           ...  ...
6477  nuc2659          CC(C)(C)OC(=O)NC1CCCN(c2ccc(N)cc2)C1  nuc
6478  nuc2660                            CCC(=O)c1ccc(N)cc1  nuc
6479  nuc2661                   C/C(=N\N)c1cccc(C(F)(F)F)c1  nuc
6480  nuc2662                      N#Cc1cccc(N2CCC(N)CC2)c1  nuc
6481  nuc2663             NC1CCN(c2ccc([N+](=O)[O-])cc2)CC1  nuc

[6482 rows x 3 columns]
['2', '0', '0', '0', '5', '0', '22', '2', '2', '4', '6', '6', '0', '0', '6', '0', '6', '0', '0', '0', '0', '0', '0', '7', '8', '0', '0', '0', '0', '5', '6', '0', '0', '3', '10', '5', '10', '3', '

### In order to facilitate faster calculation times, this large list is being split into multiple smaller input lists

In [None]:
from copy import deepcopy 
to_split = deepcopy(selected_df[['name','smiles','type','sites']])
size = 400 #Choose this to determine the number of batches
list_of_dfs = [to_split.loc[i:i+size-1,:] for i in range(0, len(to_split),size)]
print(list_of_dfs[-1].head)

for i,f in enumerate(list_of_dfs):
    f.to_csv(f"selected_commercial_mols_{i}_request.csv",header=True,index=False)

<bound method NDFrame.head of          name                                     smiles type sites
6000  nuc2182  CC(C)(C)OC(=O)N1CCCC(NC(=O)c2cccc(N)c2)C1  nuc    20
6001  nuc2183                   CCN1CCN(c2ccc(N)cc2F)CC1  nuc    10
6002  nuc2184             Nc1ccnc(-c2ccnc(C(F)(F)F)c2)c1  nuc     0
6003  nuc2185          Cc1cc(N)ccc1N1CCC(N2CCN(C)CC2)CC1  nuc     4
6004  nuc2186           CN1CCN(C2CCN(c3ccc(N)cn3)CC2)CC1  nuc    13
...       ...                                        ...  ...   ...
6477  nuc2659       CC(C)(C)OC(=O)NC1CCCN(c2ccc(N)cc2)C1  nuc    17
6478  nuc2660                         CCC(=O)c1ccc(N)cc1  nuc     8
6479  nuc2661                C/C(=N\N)c1cccc(C(F)(F)F)c1  nuc     3
6480  nuc2662                   N#Cc1cccc(N2CCC(N)CC2)c1  nuc    11
6481  nuc2663          NC1CCN(c2ccc([N+](=O)[O-])cc2)CC1  nuc     0

[482 rows x 4 columns]>


# Next, the input lists are submitted to somn calculations following the somn package documentation. This can be done using a docker image or using a local installation of somn following the instructions below.

## Standalone
1. Install the somn package per instructions that can be found at the main branch readme: 
https://github.com/SEDenmarkLab/Lucid_Somnambulist

2. Use the input file(s) created by this notebook (the enumerated .csv files) as an input, to follow the "Getting Started" documentation on the somn repository. For JUST calculating descriptors, you don't need to download pretrained models.* To get started, we recommend using git. After installing molli-0.2.3 (i.e., with git, you can run the command: git clone https://github.com/SEDenmarkLab/molli_firstgen.git --branch molli-0.2.3 --single-branch --depth 1), you should install somn (git clone https://github.com/SEDenmarkLab/Lucid_Somnambulist.git). Then, you need to create a first "project" in somn within which to work. 

Place each input file into a working directory (assuming concurrent jobs) with a filename ending in "_request.csv". Anything can come before "_request" in the filename, but the ending of the file must be "_request.csv". 

Within each working directory (i.e., at the level of each somn_scratch folder) run the following command:

somn calculate {your_file_name}_request.csv

*If you wish to get the pretrained models, those can be found at https://zenodo.org/records/12729648/files/pretrained-somn.tar.gz?download=1 and an accompanying json file that somn needs to use them: https://zenodo.org/records/12729648/files/projects.JSON?download=1). 



## Docker
1. For docker, use the "Running the job" instructions in the repository readme. This will involve dumping the contents of your local input file inside of a running container based on the somn image, which you will need to know how to generate using docker and the somn image on dockerhub (ianrinehart/somn:1.2.1). With a running container, the command below will place the input file contents inside of the container:

cat {your local input file - this will be "selected_commercial_mols_#.csv" for this notebook} | docker exec -i {your running container} sh -c 'cat > /tmp/somn_root/somn_scratch/IID-Models-2024/scratch/test_request.csv'

Execute the job in docker:

docker exec -i {your running container} micromamba run somn predict last latest {name your prediction set - this will only affect the location where you fetch the results, but does not really matter....i.e., asdf is fine}

Get the results from inside the container at this location (it's a folder, just grab the whole contents):

/tmp/somn_root/somn_scratch/IID-Models-2024/outputs/{your prediction set name}/couplings/

After fetching the results from the running container, you should close it. You're done! :)
