In [None]:
# Colab-only setup: install condacolab and let it set up a conda (Mambaforge)
# distribution in the runtime. `condacolab.install()` restarts the kernel when
# it finishes, so wait for the restart before running the cells below.
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/conda-forge/miniforge/releases/download/23.1.0-1/Mambaforge-23.1.0-1-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:17
🔁 Restarting kernel...


In [None]:
!conda install -c rdkit rdkit

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
Solving environment: | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | /

# **Import libraries**

In [None]:
import rdkit
import pandas as pd
import numpy as np
import os

# **ChemDiv library of Serine Proteases Inhibitors**
https://www.chemdiv.com/catalog/focused-and-targeted-libraries/serine-proteases-inhibitors-library/

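After converting the library structures to a SMILES (`.smi`) file, zip it as `SPIL.zip`, upload it, and modify/run the following cells as needed. The first line of the `.smi` file must be the column header `Smiles`: the first chunk created below keeps that line as its header, and the later cells read the `Smiles` column.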

In [None]:
!unzip SPIL.zip

Archive:  SPIL.zip
  inflating: SPIL.smi                


In [None]:
%%bash
# Stop at the first failing command (e.g. SPIL.smi missing) instead of
# carrying on and writing header-only junk files.
set -euo pipefail

# Cut the library into 5000-line chunks named SPIL_aa.csv, SPIL_ab.csv, ...
split -l 5000 --additional-suffix=.csv SPIL.smi SPIL_

# The first chunk already starts with the original header line, so only the
# remaining chunks need a "Smiles" header prepended.
# The glob is restricted to split's two-letter suffixes: a plain SPIL_*.csv
# would also match SPIL_Morganfp2_1024.csv (written further down) when this
# cell is re-run, and would either corrupt that file or shift which chunk is
# treated as "first".
first=1
for f in SPIL_[a-z][a-z].csv; do
  if [ "$first" -eq 1 ]; then
    first=0
  else
    tmp="${f}.tmp"
    { echo "Smiles"; cat "$f"; } > "$tmp"
    mv "$tmp" "$f"
  fi
done


In [None]:
# Sanity check: load one of the chunk files and display it to confirm it has
# the single `Smiles` column expected by the fingerprint cells below.
lib = pd.read_csv('SPIL_af.csv')
lib

Unnamed: 0,Smiles
0,c1(n(c(=O)c2c(n1)ccs2)c1ccc(cc1)F)N1CCC(C(=O)N...
1,c1(n(c(=O)c2c(n1)ccs2)c1ccc(cc1)F)N1CCC(C(=O)N...
2,n1(c(nc2c(c1=O)scc2)N1CCC(C(=O)Nc2ccc(Cl)cc2)C...
3,n1(c(nc2c(c1=O)scc2)N1CCC(C(=O)NCc2occc2)CC1)c...
4,n1(c(nc2c(c1=O)scc2)N1CCC(C(=O)NCc2ccccc2)CC1)...
...,...
4995,c12c(c(=O)[nH]c(n1)N1CC(C(=O)O)CCC1)scc2c1ccc(...
4996,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NCCC(C)C)CC1)scc...
4997,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NC3CC3)CC1)scc2c...
4998,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NC3CCCCCC3)CC1)s...


## **Generate Morgan fingerprints**


In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem

def Morgan(Smiles, radius=2, n_bits=1024):
  """Compute Morgan bit-vector fingerprints, one row per SMILES string.

  Parameters
  ----------
  Smiles : iterable of str
      SMILES strings; rows of the result follow the same order.
  radius : int, default 2
      Morgan radius passed to RDKit.
  n_bits : int, default 1024
      Length of each fingerprint (number of columns in the result).

  Returns
  -------
  numpy.ndarray
      2-D array with one fingerprint per input SMILES and ``n_bits`` columns.
      Empty input yields an array of shape ``(0, n_bits)``.

  Raises
  ------
  ValueError
      If RDKit cannot parse one of the SMILES strings.
  """
  fingerprints = []
  for position, smile in enumerate(Smiles):
    ml = Chem.MolFromSmiles(smile)
    if ml is None:
      # MolFromSmiles signals a parse failure by returning None; fail with the
      # offending entry instead of an opaque RDKit argument error. Silently
      # skipping it would misalign the rows with the caller's table.
      raise ValueError(
          "Could not parse SMILES at position {}: {!r}".format(position, smile))
    Fp = AllChem.GetMorganFingerprintAsBitVect(ml, radius, nBits=n_bits)
    fingerprints.append(np.array(Fp))

  if not fingerprints:
    # Keep the result 2-D so callers can still read `.shape[1]`.
    return np.zeros((0, n_bits), dtype=int)
  # Stack once, after the loop, rather than rebuilding the array per molecule.
  return np.array(fingerprints)

In [None]:
import glob

directory = '/content/'

# Only pick up the chunk files written by `split` (SPIL_aa.csv, SPIL_ab.csv, ...).
# Scanning for every *.csv would also ingest this notebook's own outputs
# (SPIL_Morganfp2_1024.csv, actives_ML.csv) when it is re-run.
# Sorting makes the row order of `final` reproducible; os.listdir() returns
# entries in arbitrary order.
chunk_paths = sorted(glob.glob(os.path.join(directory, 'SPIL_[a-z][a-z].csv')))

chunk_tables = []
for path in chunk_paths:
    filename = os.path.basename(path)
    df = pd.read_csv(path)
    print(filename, 'is being processed now...')
    Fingerprints = Morgan(df["Smiles"])
    df_Fp = pd.DataFrame(
        Fingerprints,
        columns=["Bit_{}".format(i) for i in range(Fingerprints.shape[1])])
    # Side-by-side: the Smiles column followed by one column per fingerprint bit.
    chunk_tables.append(pd.concat([df, df_Fp], axis=1))
    print(filename, 'is done!')

# Concatenate once at the end; growing `final` inside the loop re-copies all
# previously accumulated rows on every iteration.
final = pd.concat(chunk_tables, ignore_index=True) if chunk_tables else pd.DataFrame()
del chunk_tables

SPIL_ad.csv is being processed now...
SPIL_ad.csv is done!
SPIL_ae.csv is being processed now...
SPIL_ae.csv is done!
SPIL_ag.csv is being processed now...
SPIL_ag.csv is done!
SPIL_ab.csv is being processed now...
SPIL_ab.csv is done!
SPIL_aa.csv is being processed now...
SPIL_aa.csv is done!
SPIL_ac.csv is being processed now...
SPIL_ac.csv is done!
SPIL_af.csv is being processed now...
SPIL_af.csv is done!


In [None]:
# Display the combined table: the Smiles column followed by the Bit_* columns.
final

Unnamed: 0,Smiles,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
0,c1(n(nc(c1)C)c1ccc(NS(=O)(=O)c2ccc(C(=O)OCC)cc...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,S(=O)(=O)(c1ccc(C(=O)N2CCN(c3c4c(cc(cc4)Cl)ncc...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C12(c3c(c4c([nH]3)cccc4)CCN1C(=O)OC12CCCCC1)C,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C(=O)(Nc1c(C(=O)Cc2ccccc2)cccc1)c1sccc1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C1(C(=O)OCC)(CCN(CC(=O)Nc2ccc(Cl)cc2)CC1)c1ccccc1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31967,c12c(c(=O)[nH]c(n1)N1CC(C(=O)O)CCC1)scc2c1ccc(...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31968,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NCCC(C)C)CC1)scc...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31969,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NC3CC3)CC1)scc2c...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
31970,c12c(c(=O)[nH]c(n1)N1CCC(C(=O)NC3CCCCCC3)CC1)s...,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Persist the fingerprint table; index=False keeps the row index out of the file.
final.to_csv('SPIL_Morganfp2_1024.csv', index=False)

In [None]:
# Compress the fingerprint CSV into SPIL_Morgan.zip.
! zip 'SPIL_Morgan.zip' SPIL_Morganfp2_1024.csv

  adding: SPIL_Morganfp2_1024.csv (deflated 96%)


In [None]:
%%bash
# Remove the intermediate chunk files (SPIL_aa.csv, SPIL_ab.csv, ...).
# The pattern requires a lowercase "a" after "SPIL_", so the combined output
# SPIL_Morganfp2_1024.csv is not deleted.
rm SPIL_a*.csv

# **2. Screening**

In [None]:
import pickle

In [None]:
!unzip Models.zip

Archive:  Models.zip
  inflating: SVM_model.pkl           
  inflating: XGB_model.pkl           
  inflating: Rf_model.pkl            


## **Random forest**

In [None]:
# Load the pickled random-forest model extracted from Models.zip.
# SECURITY: unpickling can execute arbitrary code, so only load model files
# from a source you trust.
# NOTE(review): presumably the pickle must be loaded with library versions
# compatible with those used for training — confirm.
with open('Rf_model.pkl', 'rb') as f:
  rf = pickle.load(f)

In [None]:
# Column 0 of `final` is the Smiles string; everything after it (the Bit_*
# columns) is passed to the model as the feature matrix.
pred_proba_rf = rf.predict_proba(final.iloc[:, 1:])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
# Keep only column 1 of predict_proba for every compound.
# NOTE(review): column 1 is assumed to be the "active" class — confirm
# against rf.classes_.
# Slicing the column once replaces the previous row-by-row loop, which rebuilt
# the whole DataFrame on every iteration.
active_proba_rf = np.asarray(pred_proba_rf)[:, 1]
res_rf = pd.DataFrame(active_proba_rf)
res_rf.head()

## **XGBoost**

In [None]:
# Load the pickled XGBoost model extracted from Models.zip.
# SECURITY: unpickling can execute arbitrary code, so only load model files
# from a source you trust.
# NOTE(review): presumably the pickle must be loaded with library versions
# compatible with those used for training — confirm.
with open('XGB_model.pkl', 'rb') as f:
  xgb = pickle.load(f)

In [None]:
# Column 0 of `final` is the Smiles string; the remaining Bit_* columns are
# the feature matrix.
pred_proba_xgb = xgb.predict_proba(final.iloc[:, 1:])

# Keep only column 1 of predict_proba for every compound.
# NOTE(review): column 1 is assumed to be the "active" class — confirm
# against xgb.classes_.
# Slicing the column once replaces the previous row-by-row loop, which rebuilt
# the whole DataFrame on every iteration.
active_proba_xgb = np.asarray(pred_proba_xgb)[:, 1]
res_xgb = pd.DataFrame(active_proba_xgb)
res_xgb.head()

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,0
0,0.000806
1,0.036547
2,0.011355
3,0.00727
4,0.000548


## **SVM**

In [None]:
# Load the pickled SVM model extracted from Models.zip.
# SECURITY: unpickling can execute arbitrary code, so only load model files
# from a source you trust.
# NOTE(review): presumably the pickle must be loaded with library versions
# compatible with those used for training — confirm.
with open('SVM_model.pkl', 'rb') as f:
  svm = pickle.load(f)

In [None]:
# Column 0 of `final` is the Smiles string; the remaining Bit_* columns are
# the feature matrix.
pred_proba_svm = svm.predict_proba(final.iloc[:, 1:])

# Keep only column 1 of predict_proba for every compound.
# NOTE(review): column 1 is assumed to be the "active" class — confirm
# against svm.classes_.
# Slicing the column once replaces the previous row-by-row loop, which rebuilt
# the whole DataFrame on every iteration and had `res_svm.head()` indented
# into the loop body, so no preview was ever displayed.
active_proba_svm = np.asarray(pred_proba_svm)[:, 1]
res_svm = pd.DataFrame(active_proba_svm)
res_svm.head()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
# Assemble one table: the SMILES next to each model's probability, plus the
# mean of the three model scores per compound.
score_columns = ['Rf', 'XGB', 'SVM']

side_by_side = pd.concat([final['Smiles'], res_rf, res_xgb, res_svm], axis=1)
result = side_by_side.set_axis(['Smiles'] + score_columns, axis=1)
result['Mean'] = result[score_columns].mean(axis=1)

In [None]:
# Keep the compounds whose mean score over the three models is at least 0.5.
meets_cutoff = result['Mean'] >= 0.5
actives = result.loc[meets_cutoff]
actives

Unnamed: 0,Smiles,Rf,XGB,SVM,Mean
8,c1(c[nH]c2c1cccc2)CC(C(=O)NCCOc1ccc(cc1)OC)NC(...,0.429699,0.910285,0.355700,0.565228
68,n1c(noc1CCC(=O)Nc1cc(C(=O)C)ccc1)c1cc(c(cc1)OC)OC,0.342695,0.944872,0.307068,0.531545
123,c1(cc(no1)c1cc(Cl)ccc1)C(=O)Nc1cc(F)ccc1,0.456465,0.961294,0.466721,0.628160
172,C(=O)(C(NC(=O)CNC(=O)C)c1ccccc1)Nc1ccc(cc1)OC,0.547004,0.997028,0.594120,0.712717
173,N1(C(=O)OCc2ccccc2)C(C(=O)NC(C(=O)Nc2ccc(cc2)O...,0.668118,0.995537,0.605571,0.756408
...,...,...,...,...,...
31602,C(=O)(c1cnc(C(=O)Nc2cc(C(=O)C)ccc2)cc1)Nc1cc(C...,0.369772,0.754484,0.415922,0.513393
31655,n1c(C(=O)Nc2ccc(C(=O)C)cc2)cccc1C(=O)Nc1ccc(C(...,0.330825,0.899996,0.467415,0.566079
31664,c1(nc2c(s1)cc(cc2)C)c1ccc(NC(=O)C)cc1,0.467202,0.741677,0.589193,0.599357
31674,c1(sc2c(c1Cl)ccc(c2)F)C(=O)Nc1cc2c(OCO2)cc1,0.357189,0.847033,0.405148,0.536456


In [None]:
# Write the selected compounds and their scores to CSV; index=False drops the
# row index, so the file does not record the compounds' positions in `result`.
actives.to_csv('actives_ML.csv', index=False)