# codebase

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns


class target_curate:
    def __init__(self, data, target_name_col, target_name, target_org_col, target_org,
                 type_col, unit_col, active_col, relate_col, type_arg, MW, equal_only = False, thresh = 7):
        self.data = data.reset_index(drop=True)
        self.target_name_col = target_name_col 
        self.target_name = target_name
        self.target_org_col=target_org_col
        self.target_org = target_org
        self.type_col = type_col
        self.unit_col = unit_col
        self.active_col = active_col
        self.relate_col = relate_col
        self.type_arg = type_arg
        self.equal_only = equal_only
        self.thresh = thresh
        self.MW = MW
        
    def target_filter(self, data, target_name_col, target_name, target_org_col, target_org):
        df= data[data[target_name_col]==target_name]
        display(df.shape)
        df2 = df[df[target_org_col]==target_org]
        display(df2.shape)
        return df2
        
    def standardize_value(self, data, type_col, type_arg, unit_col):
        df = data[data[type_col]==type_arg]
        df= df.dropna(subset =unit_col)
        df.reset_index(drop=True, inplace = True)
        type = ['ug ml-1', 'ug.mL-1', 'nM', 'uM', 'umol/L', 'mM', 'M', 'mg/L']
        idx = []
        for key, value in enumerate(df[unit_col]):
            if value in type:
                idx.append(key)
        df = df.iloc[idx,:]
        return df
    
    def convert_activity(self, data, active_col, unit_col, MW_col):
        df = data.copy()
        df['pChEMBL'] = np.zeros(len(df))
        #unit = df['Unit'].unique()

        for key, value in enumerate(df[unit_col]):
            conc_value = df.loc[key, active_col]
            mw_value   = df.loc[key, MW_col]
            if value == 'μM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-6)
            elif value  == 'uM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-6)
            elif value  == 'umol/L':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-6)
            elif value  == 'nM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-9)
            elif value  == 'nmol/l':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-9)
            elif value  == 'mM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-3)
            elif value  == 'M':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1)
            elif value.lower() in ('µg/ml', 'ug/ml', 'ug.ml-1', 'ug.ml-1', 'ug ml-1', 'mg/l', 'mg/L'):
                # µg/mL → g/L: conc_value * 1e-3
                #       → mol/L: (conc_value * 1e-3) / MW
                if mw_value <= 0:
                    # Nếu MW không hợp lệ (<=0), không thể tính pChEMBL
                    df.loc[key, 'pChEMBL'] = np.nan
                else:
                    conc_molar = (conc_value * 1e-3) / mw_value
                    # Tránh log của giá trị <= 0
                    if conc_molar > 0:
                        df.loc[key, 'pChEMBL'] = -np.log10(conc_molar)
                    else:
                        df.loc[key, 'pChEMBL'] = np.nan
                        
            elif value  == 'no unit':
                df.loc[key, 'pChEMBL'] = -df.loc[key, active_col]
        return df
    
    
    def standardize_relation(self, data,relate_col,  equal_only, thresh):
        df = data.copy()
        df.dropna(subset = relate_col, inplace = True)
        if equal_only == True:
            print('SELECTING ONLY EQUAL')
            df = df[df[relate_col]=="'='"]
            
        else:
            print('HANDLING')
            df_big = df[(df[relate_col] == "'>'") | (df[relate_col] == "'>='")]
            df_small = df[(df[relate_col] == "'<'") | (df[relate_col] == "'<='")]
            df_equal = df[df[relate_col]=="'='"]
                
            #Drop pCHEMBL < thresh for df_big
            drop_idx = df_big[df_big["pChEMBL"] < thresh].index
            df_big.drop(drop_idx, inplace = True)
                
            #Drop pCHEMBL > thresh for df_small
            drop_idx = df_small[df_small["pChEMBL"] > thresh].index
            df_small.drop(drop_idx, inplace = True)
                
            df = pd.concat((df_equal, df_small, df_big), axis = 0)
        return df
                  
    def curated_fit(self):
        print("Number of data before target curation:", self.data.shape[0])
        df = self.target_filter(data = self.data, target_name_col = self.target_name_col, target_name =self.target_name, 
                           target_org_col=self.target_org_col, target_org = self.target_org)
        df.reset_index(drop=True, inplace = True)
        print("Number of data after handle organism and target name:", df.shape[0])
        df1 = self.standardize_value(data=df, type_col=self.type_col, type_arg=self.type_arg, unit_col=self.unit_col)
        df1.reset_index(drop=True, inplace = True)
        print("Number of data after select unit:", df1.shape[0])
        #display(df1.head(5))
        df2 = self.convert_activity(data=df1, active_col=self.active_col, unit_col = self.unit_col, MW_col=self.MW)
        df2.reset_index(drop=True, inplace = True)
        #display(df2.head(5))
        df3 = self.standardize_relation(data=df2,relate_col=self.relate_col,  equal_only=self.equal_only, thresh=self.thresh)
        self.df = df3
        self.df.reset_index(drop=True, inplace = True)
        print("Number of data after standardizing:", self.df.shape[0])
        

In [4]:
from rdkit import Chem

class smile_curate:
    def __init__(self, data, smile_col, pchem_col, keep = 'best'):
        self.data = data
        self.smile_col = smile_col
        self.pchem_col = pchem_col
        self.keep = keep
    def smile_norm(self, data, smile_col):
        df = data.dropna(subset = smile_col)
        def safe_canonical(smiles):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # Nếu RDKit không parse được, trả NaN
                    return np.nan
                # MolToSmiles với useChiral=True để lưu thông tin stereo nếu có
                return Chem.MolToSmiles(mol, isomericSmiles=True)
            except Exception:
                # Bất kỳ lỗi nào khác cũng trả NaN
                return np.nan

        # Áp dụng hàm safe_canonical lên từng dòng
        df['Canonical_Smiles'] = df[smile_col].apply(safe_canonical)
        
        return df

    def curate(self):
        df = self.smile_norm(data=self.data, smile_col=self.smile_col)
        if self.keep == 'best':
            df = df.sort_values(by=self.pchem_col, ascending=False)
            df_dropdup = df.drop_duplicates(subset=['Canonical_Smiles'], keep="first")
        elif self.keep == 'worst':
            df = df.sort_values(by=self.pchem_col, ascending=True)
            df_dropdup = df.drop_duplicates(subset=['Canonical_Smiles'], keep="first")
           
        print(df_dropdup.shape)
        self.df = df_dropdup

In [5]:
class assay_curate:
    def __init__(self,data, type_col, org_col,des_col, type_arg='F', org_arg='Homo sapiens', kw = 'MTT'):
        self.data = data
        self.type_col = type_col
        self.org_col = org_col
        self.des_col = des_col
        self.type_arg= type_arg
        self.org_arg= org_arg
        self.kw = kw 
    
    def search_kw(self, data,kw, des_col):
        index = []
        for key, value in enumerate(data[des_col]):
            if kw in value:
                index.append(key)
        return data.iloc[index,:]
    
    def curated_fit(self):
        print("Number of data befor standardizing:", self.data.shape[0])
        df = self.data[self.data[self.type_col]==self.type_arg]
        print("Number of data after choosing assay type:", df.shape[0])
        df2 = df[df[self.org_col]==self.org_arg]
        print("Number of data after choosing assay organism:", df2.shape[0])
        df3 = self.search_kw(data=df2, kw = self.kw, des_col = self.des_col)
        print("Number of data after curating:", df3.shape[0])
        self.df = df3

# Data curation

## 1. Load data

In [11]:
data = pd.read_csv('Data/assay/PA.csv')
data.head(5)

  data = pd.read_csv('Data/assay/PA.csv')


Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL146445,,,481.62,0.0,3.3,28,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',...,CHEMBL1147403,1,Scientific Literature,Bioorg Med Chem Lett,2004.0,,,,,128.0
1,CHEMBL146445,,,481.62,0.0,3.3,28,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',...,CHEMBL1147403,1,Scientific Literature,Bioorg Med Chem Lett,2004.0,,,,,128.0
2,CHEMBL56252,PABETAN,,446.56,1.0,2.1,1,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',...,CHEMBL1132113,1,Scientific Literature,J Med Chem,1999.0,,,,,2.0
3,CHEMBL56252,PABETAN,,446.56,1.0,2.1,1,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',...,CHEMBL1132113,1,Scientific Literature,J Med Chem,1999.0,,,,,0.5
4,CHEMBL63609,,,513.56,1.0,-1.05,38,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',...,CHEMBL1124865,1,Scientific Literature,J Med Chem,1990.0,,,,,64.0


In [12]:
data.shape

(130756, 48)

In [26]:
columns = ['Molecule ChEMBL ID', 'Molecular Weight', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
          'Assay Type', 'Assay Description','Assay Organism', 'Assay Variant Mutation','Target Name','Target Organism', 'Document Journal']

df = data[columns]
df.head(5)

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
0,CHEMBL146445,481.62,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',128.0,ug ml-1,,,F,Minimum inhibitory concentration against Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Bioorg Med Chem Lett
1,CHEMBL146445,481.62,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',128.0,ug ml-1,,,F,Minimum inhibitory concentration against Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Bioorg Med Chem Lett
2,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',2.0,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
3,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',0.5,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
4,CHEMBL63609,513.56,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',64.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem


In [28]:
df[df['Molecule ChEMBL ID']=='CHEMBL131854']

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
8,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.5,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
39430,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.25,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
117483,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.5,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
157618,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,Activity,'=',0.98,,,,F,In vivo antibacterial activity against Staphyl...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett


In [27]:
nul = df[df['Standard Value'].isnull()].index
nul

Index([    52,     53,     54,     55,     56,     57,     58,     59,     60,
           61,
       ...
       130723, 130728, 130734, 130738, 130739, 130740, 130746, 130748, 130752,
       130754],
      dtype='int64', length=12282)

In [28]:
df.drop(nul, axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(nul, axis=0, inplace=True)


Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
0,CHEMBL146445,481.62,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',128.0,ug ml-1,,,F,Minimum inhibitory concentration against Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Bioorg Med Chem Lett
1,CHEMBL146445,481.62,CC(C)c1csc(CCc2ccn3c(=O)c(/C=C/C(=O)O)c(N4CCCN...,MPC8,'=',128.0,ug ml-1,,,F,Minimum inhibitory concentration against Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Bioorg Med Chem Lett
2,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',2.0,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
3,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',0.5,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
4,CHEMBL63609,513.56,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',64.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118469,CHEMBL29,334.40,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@...,IZ,'=',23.0,mm,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Med Chem Res
118470,CHEMBL4,361.37,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,MIC,'=',50.0,ug.mL-1,,,F,Antibacterial activity against gentamicin-resi...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Med Chem Res
118471,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem
118472,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'>',32.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Bioorg Med Chem Lett


In [17]:
df['Standard Value'] = df['Standard Value'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Standard Value'] = df['Standard Value'].astype(float)


In [None]:
units = df['Standard Units'].unique() 
units

array(['Pseudomonas aeruginosa'], dtype=object)

In [None]:
['ug ml-1', 'ug.mL-1', 'nM', 'uM', 'umol/L', 'mM', 'M', 'mg/L']

In [22]:
for unit in units:
    print(unit, df[df['Standard Units']==unit].shape[0])

ug ml-1 2171
ug.mL-1 46281
% 44757
nM 10610
nan 0
mg.kg-1 226
uM 1130
mm 7747
mg kg-1 103
hr 242
/uM/s 63
ug 114
mm/mg 33
10'-2 umol/ml 256
/s 162
cm 16
um 12
ppm 27
log10CFU/ml 36
mg/ml 46
uL/ml 17
umol 8
umol/L 47
p.p.m. 14
mg/L 97
uM/ml 14
10'6/M/s 25
mM 65
/s/microM 30
10'-2micromol/ml 44
10'-2microM 44
/mM/s 17
nm 64
10'-2mmol/ml 63
10'-2umol 40
CFU/ml 38
deltalog10CFU/ml 9
deltalog10CFU 11
10'3CFU/ml 1
10'2CFU/ml 1
deltalog10CFU/ml.hr 5
10^3/ml 3
10^2umol/ml 24
fold 12
10'8CFU 28
ug cm**-3 2
/ml 4
microg 15
10'-5 ug/ml 2
10'-6 ug/ml 3
microg/cm3 9
umol/ml 24
mm2 16
mm3 24
um3/um2 3
10'3/ml 1
log10CFU 6
umol/cm3 2
/M/s 20
10'5/ml 2
10^1/mm2 1
/hr 4
10'-9No_unit 2
mg.min/L 3
%v/v 2
10^4/ml 2
degrees C 3
10'8CFU/ml 1
pmol 1
10'6CFU/ml 1
10^9CFU/ml 1
10'2 uM/s 1
10'6/ml 2
ml 1
10^3CFU/mg 2
10'7/M/s 1
nmol/min/mg 1
10'-7 ug/ml 3
10'-8 ug/ml 1
mg/kg/day 3
10'4/M/s 1
10^-8No_unit 1
10'7/ml 1
10^6/ml 1
10'2/ml 1


## 2. Target value standardize (Standard)

In [31]:
target = target_curate(data =df, target_name_col = 'Target Name', target_name ='Pseudomonas aeruginosa', 
                       target_org_col='Target Organism', target_org = 'Pseudomonas aeruginosa',
                            type_col='Standard Type', unit_col='Standard Units', active_col='Standard Value', 
                            relate_col='Standard Relation', type_arg ='MIC', MW='Molecular Weight', equal_only = True)
target.curated_fit()
df1 = target.df

Number of data before target curation: 118474


(115699, 16)

(115699, 16)

Number of data after handle organism and target name: 115699
Number of data after select unit: 50989
SELECTING ONLY EQUAL
Number of data after standardizing: 36014


In [32]:
df1

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL
0,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',2.0,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.348850
1,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',0.5,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.950910
2,CHEMBL63609,513.56,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',64.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,3.904411
3,CHEMBL304987,538.63,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',16.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.527171
4,CHEMBL65904,540.58,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',8.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.829770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36009,CHEMBL529,749.00,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,MIC,'=',16.0,ug.mL-1,,,F,Antibacterial activity against penicillin-susc...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.670362
36010,CHEMBL4854181,476.34,Cc1[nH]c(C(=O)Nc2nc3c(OCc4ccccc4)cc(C(=O)O)cc3...,MIC,'=',8000.0,nM,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.096910
36011,CHEMBL4,361.37,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,MIC,'=',50.0,ug.mL-1,,,F,Antibacterial activity against gentamicin-resi...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Med Chem Res,3.858982
36012,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.219257


In [33]:
# Xóa vô cực âm, vô cực dương
df1['pChEMBL'] = df1['pChEMBL'].replace([np.inf, -np.inf], np.nan)
df1.dropna(subset=['pChEMBL'], inplace=True)

# Kiểm tra xem có giá trị vô cực dương hay không
has_inf = np.any(df1== np.inf)
print("Có giá trị vô cực dương:", has_inf)

# Kiểm tra xem có giá trị vô cực âm hay không
has_neg_inf = np.any(df1== -np.inf)
print("Có giá trị vô cực âm:", has_neg_inf)
# Tìm giá trị min và max
min_value = df1['pChEMBL'].min()
max_value = df1['pChEMBL'].max()

# In kết quả
print(f"Giá trị min của df1['pChEMBL']: {min_value}")
print(f"Giá trị max của df1['pChEMBL']: {max_value}")

df1.shape

Có giá trị vô cực dương: False
Có giá trị vô cực âm: False
Giá trị min của df1['pChEMBL']: -0.8615610937326655
Giá trị max của df1['pChEMBL']: 9.80558014307975


(35903, 17)

## Assay

In [None]:
assay = df1['Assay Type'].unique()
for a in assay:
    print(a, df1[df1['Assay Type']==a].shape[0])

F 35810


In [None]:
df1['Assay Organism'].unique()

array(['Pseudomonas aeruginosa'], dtype=object)

In [37]:
assay = assay_curate(data=df1, type_col="Assay Type", org_col="Assay Organism",des_col='Assay Description', 
                     type_arg='F', org_arg='Pseudomonas aeruginosa', kw = '' )

assay.curated_fit()

Number of data befor standardizing: 35903
Number of data after choosing assay type: 35876
Number of data after choosing assay organism: 35810
Number of data after curating: 35810


In [39]:
df2 = assay.df
df2

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL
0,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',2.0,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.348850
1,CHEMBL56252,446.56,N=C(N)NCCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)...,MIC,'=',0.5,ug.mL-1,,,F,Minimum inhibitory concentration of Levofloxac...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.950910
2,CHEMBL63609,513.56,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',64.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,3.904411
3,CHEMBL304987,538.63,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',16.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.527171
4,CHEMBL65904,540.58,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',8.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.829770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36009,CHEMBL529,749.00,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,MIC,'=',16.0,ug.mL-1,,,F,Antibacterial activity against penicillin-susc...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,4.670362
36010,CHEMBL4854181,476.34,Cc1[nH]c(C(=O)Nc2nc3c(OCc4ccccc4)cc(C(=O)O)cc3...,MIC,'=',8000.0,nM,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.096910
36011,CHEMBL4,361.37,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,MIC,'=',50.0,ug.mL-1,,,F,Antibacterial activity against gentamicin-resi...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Med Chem Res,3.858982
36012,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.0,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,5.219257


In [43]:
# bỏ giá trị pChEMBL =0 và NaN
df3 = df2[df2['pChEMBL'] !=0]
df3.dropna(subset=['pChEMBL'], inplace=True)
df3.shape

(35810, 17)

In [44]:
df3.dropna(subset=['Smiles'], inplace=True)
df3.shape

(35800, 17)

In [45]:
df3[df3['Smiles']== None]

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL


## Smiles curation

In [46]:
smile = smile_curate(data=df3,smile_col='Smiles', pchem_col='pChEMBL', keep = 'best')
smile.curate()

(18598, 18)


In [47]:
df4 = smile.df
df4.shape


(18598, 18)

In [48]:
df4.dropna(subset=['Canonical_Smiles'], inplace=True)
df4.shape

(18598, 18)

In [49]:
df4['Standard Units'].unique()

array(['ug.mL-1', 'nM'], dtype=object)

## Save

In [50]:
df4.to_csv('Data/assay/PA_MIC_pchem.csv', index=False)

In [51]:
data = pd.read_csv('Data/assay/PA_MIC_pchem.csv')
data

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL127,383.47,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,MIC,'=',6.000000e-05,ug.mL-1,,Outside typical range,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,9.805580,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...
1,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,9.522879,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
2,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.500000e-04,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,9.122347,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
3,CHEMBL5315124,740.76,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,MIC,'=',6.000000e-04,ug.mL-1,,,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,9.091526,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
4,CHEMBL1668188,443.12,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl,MIC,'=',5.000000e-04,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,8.947551,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18593,CHEMBL542993,452.94,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,-0.537084,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18594,CHEMBL345633,430.51,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,-0.559141,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18595,CHEMBL347627,430.51,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,Compound was tested for in vitro antibacterial...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,-0.559141,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...
18596,CHEMBL347641,416.48,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,-0.573530,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...


In [52]:
data['Standard Value'].max()

160000000.0

In [53]:
smiles_col, id_col, activity_col = "Smiles", "Molecule ChEMBL ID", "pChEMBL"
thresh = 5

In [54]:
t1 = data[activity_col] < thresh 
data.loc[t1, activity_col] = 1
t2 = data[activity_col] >= thresh 
data.loc[t2, activity_col] = 0
data[activity_col] = data[activity_col].astype('int64')

In [55]:
data

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL127,383.47,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,MIC,'=',6.000000e-05,ug.mL-1,,Outside typical range,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...
1,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
2,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.500000e-04,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,0,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
3,CHEMBL5315124,740.76,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,MIC,'=',6.000000e-04,ug.mL-1,,,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
4,CHEMBL1668188,443.12,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl,MIC,'=',5.000000e-04,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18593,CHEMBL542993,452.94,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18594,CHEMBL345633,430.51,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18595,CHEMBL347627,430.51,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,Compound was tested for in vitro antibacterial...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...
18596,CHEMBL347641,416.48,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...


In [58]:
data.rename(columns={'Molecule ChEMBL ID': 'ID'}, inplace=True)
data

Unnamed: 0,ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL127,383.47,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,MIC,'=',6.000000e-05,ug.mL-1,,Outside typical range,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...
1,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
2,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.500000e-04,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,0,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
3,CHEMBL5315124,740.76,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,MIC,'=',6.000000e-04,ug.mL-1,,,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
4,CHEMBL1668188,443.12,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl,MIC,'=',5.000000e-04,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18593,CHEMBL542993,452.94,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18594,CHEMBL345633,430.51,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18595,CHEMBL347627,430.51,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,Compound was tested for in vitro antibacterial...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...
18596,CHEMBL347641,416.48,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...


In [59]:
data.to_csv('Data/assay/PA_MIC_pchem_class.csv', index=False)

# ProQSAR

In [60]:
import pandas as pd

inhouse = pd.read_csv('Data/assay/Inhouse_lib.csv')
inhouse

Unnamed: 0,ID,Smiles
0,L001,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...
152,L153,C[C@@H](O)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
153,L154,N=C(N)NCCC[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
154,L155,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
155,L156,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=C(B...


In [61]:
pa = pd.read_csv('Data/assay/PA_MIC_pchem_class.csv')
pa

Unnamed: 0,ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL127,383.47,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,MIC,'=',6.000000e-05,ug.mL-1,,Outside typical range,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...
1,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
2,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',2.500000e-04,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Pseud...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,0,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
3,CHEMBL5315124,740.76,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,MIC,'=',6.000000e-04,ug.mL-1,,,F,Antimicrobial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Antimicrob Agents Chemother,0,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...
4,CHEMBL1668188,443.12,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl,MIC,'=',5.000000e-04,ug.mL-1,,,F,Antibacterial activity against Pseudomonas aer...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Eur J Med Chem,0,CCCCc1ccc(CNC2CCCCC2NCc2ccc(CCCC)cc2)cc1.Cl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18593,CHEMBL542993,452.94,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18594,CHEMBL345633,430.51,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,CCC1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2N(C)...
18595,CHEMBL347627,430.51,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,Compound was tested for in vitro antibacterial...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...
18596,CHEMBL347641,416.48,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...,MIC,'=',1.560000e+06,ug.mL-1,,Outside typical range,F,In vitro minimum inhibitory concentration agai...,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,Pseudomonas aeruginosa,J Med Chem,1,C[C@@H]1CN(c2c(F)cc3c(=O)c(C(=O)O)c4scc5n4c3c2...


In [46]:
sa.rename(columns={'Molecule ChEMBL ID': 'ID'}, inplace=True)
sa.to_csv('Data/assay/SA_MIC_pchem_class.csv', index=False)

In [62]:
from ProQSAR.qsar import ProQSAR
from ProQSAR.Config.config import Config
from ProQSAR.Featurizer.feature_generator import FeatureGenerator
import matplotlib
matplotlib.use("Agg")

smiles_col, id_col, activity_col = "Smiles", "ID", "pChEMBL"
feature_type = FeatureGenerator.get_all_types()

config = Config(
    featurizer={"feature_types": feature_type},
    splitter={'test_size': 0.1, 'option': 'stratified_random'},
    optimizer={'n_trials': 100, 'deactivate': True}
)
qsar = ProQSAR(activity_col, id_col, smiles_col, n_jobs=4, n_splits=5, n_repeats=5, config=config, keep_all_test=True, keep_all_pred=True, project_name="PA_MIC_class_stratified_random")
qsar.run_all(data_dev=pa, data_pred=inhouse, alpha=[0.05, 0.1, 0.2])


  from .autonotebook import tqdm as notebook_tqdm
[21:52:42] Tautomer enumeration stopped at 160 tautomers: max transforms reached
[21:52:42] Tautomer enumeration stopped at 220 tautomers: max transforms reached
[21:52:42] Tautomer enumeration stopped at 299 tautomers: max transforms reached
[21:52:42] Tautomer enumeration stopped at 257 tautomers: max transforms reached
[21:52:43] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[21:52:43] Tautomer enumeration stopped at 182 tautomers: max transforms reached
[21:52:43] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[21:52:43] Tautomer enumeration stopped at 280 tautomers: max transforms reached
[21:52:43] Tautomer enumeration stopped at 207 tautomers: max transforms reached
[21:52:43] Tautomer enumeration stopped at 182 tautomers: max transforms reached
[21:52:43] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[21:52:44] Tautomer enumeration stopped at 979 tautomers: m

In [34]:
from ProQSAR.qsar import ProQSAR

smiles_col, id_col, activity_col = "Smiles", "ID", "pChEMBL"
qsar = ProQSAR(activity_col, id_col, smiles_col).load("Project/SA_MIC_class_stratified_random/proqsar.pkl")
qsar.__dict__

{'activity_col': 'pChEMBL',
 'id_col': 'ID',
 'smiles_col': 'Smiles',
 'mol_col': 'mol',
 'project_name': 'SA_MIC_class_stratified_random',
 'n_jobs': 4,
 'random_state': 42,
 'scoring_target': None,
 'scoring_list': None,
 'n_splits': 5,
 'n_repeats': 5,
 'keep_all_test': True,
 'keep_all_pred': True,
 'config': <ProQSAR.Config.config.Config at 0x7d395c6b63d0>,
 'save_dir': 'Project/SA_MIC_class_stratified_random',
 'logger': <RootLogger root (INFO)>,
 'shape_summary': {'FCFP6': {'Data': {'train': {'original': (40087, 4098),
     'duplicate': (36598, 4098),
     'missing': (36598, 4098),
     'lowvar': (36598, 4098),
     'univ_outlier': (36598, 4098),
     'kbin': (36598, 4098),
     'multiv_outlier': (28259, 4098),
     'rescaler': (28259, 4098),
     'feature_selector (ExtraTreesClassifier)': (28259, 905)}}}},
 'optimaldata': OptimalDataset(scoring_target=None, scoring_list=None, n_splits=5, n_repeats=5, save_cv_report=True, cv_report_name='cv_report_datasets', visualize=None, save

In [37]:
cv_data = pd.read_csv("Project/SA_MIC_class_stratified_random/cv_report_datasets.csv")
cv_data

Unnamed: 0,scoring,cv_cycle,ECFP2,ECFP4,ECFP6,FCFP2,FCFP4,FCFP6,MACCS,RDK5,RDK6,RDK7,avalon,mordred,pubchem,rdkdes
0,accuracy,1,0.846666,0.844427,0.849384,0.837810,0.848097,0.855272,0.814977,0.841863,0.847704,0.842098,0.841261,0.841589,0.832474,0.837003
1,accuracy,2,0.835764,0.839833,0.845049,0.844211,0.849601,0.852088,0.813976,0.837561,0.836620,0.841510,0.842246,0.834881,0.834218,0.835586
2,accuracy,3,0.836047,0.848306,0.844508,0.840853,0.839350,0.848195,0.811550,0.839045,0.839355,0.846357,0.843794,0.838880,0.831860,0.850521
3,accuracy,4,0.841971,0.841313,0.842341,0.843189,0.843562,0.854388,0.820608,0.837413,0.841946,0.842538,0.855615,0.843008,0.837165,0.840093
4,accuracy,5,0.843670,0.846423,0.856698,0.840415,0.848827,0.848699,0.819799,0.837834,0.842666,0.837397,0.842505,0.831506,0.832597,0.836724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,roc_auc,24,0.922919,0.921098,0.924770,0.918660,0.918077,0.922575,0.888380,0.917840,0.919421,0.914126,0.914558,0.912403,0.909497,0.913900
248,roc_auc,25,0.915656,0.922183,0.928340,0.911612,0.924896,0.930977,0.891119,0.917782,0.913469,0.913805,0.916970,0.911665,0.907443,0.913308
249,roc_auc,mean,0.918076,0.920511,0.923955,0.916989,0.920422,0.926036,0.891206,0.916054,0.916541,0.917465,0.918479,0.912597,0.908767,0.914733
250,roc_auc,median,0.917546,0.920664,0.924508,0.917416,0.920541,0.926041,0.891886,0.916120,0.917001,0.916337,0.918193,0.912792,0.908366,0.914473


In [38]:
qsar.optimaldata.report = cv_data
qsar.analysis()

  figure, axes = plt.subplots(
  max_fold_diff = variances_by_method.max() / variances_by_method.min()
  max_fold_diff = variances_by_method.max() / variances_by_method.min()
  max_fold_diff = variances_by_method.max() / variances_by_method.min()


In [33]:
a = pd.read_csv('Project/SA_MIC_class_stratified_random/cv_report_model.csv')
a[a['scoring'] == 'f1']
#a

Unnamed: 0,scoring,cv_cycle,AdaBoostClassifier,CatBoostClassifier,DummyClassifier,ExtraTreesClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,MLPClassifier,RandomForestClassifier,SVC,XGBClassifier
56,f1,1,0.764342,0.839368,0.68665,0.859963,0.771126,0.850259,0.796379,0.844306,0.860157,0.848383,0.845681
57,f1,2,0.764394,0.838458,0.686498,0.859744,0.77338,0.852591,0.798983,0.847452,0.861088,0.84466,0.844994
58,f1,3,0.741391,0.832342,0.686498,0.859415,0.762235,0.849958,0.792072,0.848567,0.857337,0.842818,0.841227
59,f1,4,0.762637,0.83998,0.686498,0.860567,0.778987,0.855511,0.801782,0.850571,0.861964,0.84931,0.843054
60,f1,5,0.756719,0.83637,0.686578,0.859375,0.770368,0.850571,0.797428,0.847429,0.860007,0.850168,0.841675
61,f1,6,0.756597,0.839283,0.68665,0.863652,0.770414,0.850598,0.793157,0.846103,0.864185,0.848964,0.845269
62,f1,7,0.757931,0.836222,0.686498,0.855539,0.769124,0.845793,0.8,0.851609,0.857434,0.846944,0.833561
63,f1,8,0.759884,0.8423,0.686498,0.860696,0.775098,0.861088,0.801543,0.847019,0.860453,0.849649,0.849352
64,f1,9,0.756186,0.831464,0.686498,0.853166,0.762756,0.849132,0.791239,0.842653,0.854234,0.843739,0.834126
65,f1,10,0.769918,0.840684,0.686578,0.86309,0.776171,0.856999,0.8,0.855316,0.865727,0.850532,0.845865


In [68]:
data_pred = pd.read_csv('Project/PA_MIC_class_stratified_random/PredResult/test_pred.csv')
data_pred

Unnamed: 0,ID,pChEMBL,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain
0,CHEMBL2236662,1,1,[1],[1],[1],in
1,CHEMBL4293731,1,1,[0 1],[0 1],[],in
2,CHEMBL327326,0,0,[0],[0],[0],in
3,CHEMBL1230182,0,0,[0 1],[0],[0],in
4,CHEMBL283387,0,0,[0],[0],[0],in
...,...,...,...,...,...,...,...
1855,CHEMBL2418389,0,0,[0 1],[0],[0],in
1856,CHEMBL135630,0,0,[0],[0],[0],in
1857,CHEMBL693,1,1,[0 1],[1],[1],in
1858,CHEMBL1644459,1,1,[1],[1],[1],in


In [69]:
data_pred['Predicted value'].unique()

array([1, 0])

In [64]:
pred_data = data_pred.merge(inhouse, on='ID', how='left')
pred_data

Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain,Smiles
0,L001,1,[1],[1],[1],in,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,1,[1],[1],[1],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,1,[1],[1],[1],in,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,1,[1],[1],[1],in,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,1,[1],[1],[1],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...,...,...,...,...,...
152,L153,1,[1],[1],[1],in,C[C@@H](O)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
153,L154,1,[1],[1],[1],in,N=C(N)NCCC[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
154,L155,1,[1],[1],[1],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
155,L156,1,[1],[1],[1],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=C(B...


In [13]:
a5 = pred_data.loc[1, 'Prediction Interval (alpha=0.05)']
a5

'[3.572 6.338]'

In [10]:
import pandas as pd
import numpy as np

values = np.array(a5.strip('[]').split(), dtype=float)
values

AttributeError: 'numpy.ndarray' object has no attribute 'strip'

In [45]:
b = 10**(values) * 1e3
#b.sort()
b

array([2.80543364e+06, 9.88553095e+08])

In [23]:
from copy import deepcopy

df = deepcopy(pred_data)

In [14]:
import pandas as pd
import numpy as np

# Sample DataFrame


# Function to convert pChEMBL to mM
def pchembl_to_mM(value):
    return 10**(-value) * 1e3

# Apply conversion to Predicted value column
df['Predicted value (mM)'] = df['Predicted value'].apply(pchembl_to_mM)

# Convert Prediction Intervals from string to actual values
def convert_interval(interval_str):
    values = np.array(interval_str.strip('[]').split(), dtype=float)
    #values=interval_str
    return np.sort(pchembl_to_mM(values)).round(4)

df['Prediction Interval (alpha=0.05) (mM)'] = df['Prediction Interval (alpha=0.05)'].apply(convert_interval)
df['Prediction Interval (alpha=0.1) (mM)'] = df['Prediction Interval (alpha=0.1)'].apply(convert_interval)
df['Prediction Interval (alpha=0.2) (mM)'] = df['Prediction Interval (alpha=0.2)'].apply(convert_interval)

# Print the updated DataFrame
df

Unnamed: 0,ID,Predicted value,Prediction Interval (alpha=0.05),Prediction Interval (alpha=0.1),Prediction Interval (alpha=0.2),Applicability domain,Smiles,Predicted value (mM),Prediction Interval (alpha=0.05) (mM),Prediction Interval (alpha=0.1) (mM),Prediction Interval (alpha=0.2) (mM)
0,L001,4.759679,[3.666 6.412],[3.971 6.084],[4.255 5.791],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...,0.017391,"[0.0004, 0.2158]","[0.0008, 0.1069]","[0.0016, 0.0556]"
1,L002,4.866846,[3.572 6.338],[3.872 6.028],[4.153 5.721],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...,0.013588,"[0.0005, 0.2679]","[0.0009, 0.1343]","[0.0019, 0.0703]"
2,L003,4.845051,[3.576 6.289],[3.877 5.971],[4.154 5.676],in,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...,0.014287,"[0.0005, 0.2655]","[0.0011, 0.1327]","[0.0021, 0.0701]"
3,L004,4.823231,[3.568 6.348],[3.862 6.043],[4.147 5.738],in,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...,0.015023,"[0.0004, 0.2704]","[0.0009, 0.1374]","[0.0018, 0.0713]"
4,L005,4.783527,[3.598 6.413],[3.911 6.115],[4.201 5.808],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...,0.016462,"[0.0004, 0.2523]","[0.0008, 0.1227]","[0.0016, 0.063]"
...,...,...,...,...,...,...,...,...,...,...,...
152,L153,5.286132,[3.809 6.541],[4.128 6.248],[4.421 5.945],in,C[C@@H](O)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.005174,"[0.0003, 0.1552]","[0.0006, 0.0745]","[0.0011, 0.0379]"
153,L154,5.238542,[3.938 6.657],[4.233 6.374],[4.514 6.081],in,N=C(N)NCCC[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.005774,"[0.0002, 0.1153]","[0.0004, 0.0585]","[0.0008, 0.0306]"
154,L155,5.544348,[4.097 6.786],[4.386 6.482],[4.679 6.197],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.002855,"[0.0002, 0.08]","[0.0003, 0.0411]","[0.0006, 0.0209]"
155,L156,5.411586,[4.086 6.779],[4.385 6.478],[4.669 6.179],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=C(B...,0.003876,"[0.0002, 0.082]","[0.0003, 0.0412]","[0.0007, 0.0214]"


In [67]:
pred_data.to_csv('Project/PA_MIC_class_stratified_random/PredResult/data_pred_inhouse_mM.csv', index=False)

In [25]:
pd.read_csv('Project/SA_MIC_class_stratified_random/PredResult/data_pred_inhouse_mM.csv')

Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain,Smiles
0,L001,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,1,[0 1],[0 1],[],out,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,1,[0 1],[0 1],[],out,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,0,[0 1],[0 1],[],out,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...,...,...,...,...,...
152,L153,0,[0 1],[0 1],[],in,C[C@@H](O)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
153,L154,1,[0 1],[0 1],[],out,N=C(N)NCCC[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
154,L155,0,[0 1],[0 1],[],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...
155,L156,1,[0 1],[0 1],[],out,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=C(B...
