# codebase

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns


class target_curate:
    def __init__(self, data, target_name_col, target_name, target_org_col, target_org,
                 type_col, unit_col, active_col, relate_col, type_arg, MW, equal_only = False, thresh = 7):
        self.data = data.reset_index(drop=True)
        self.target_name_col = target_name_col 
        self.target_name = target_name
        self.target_org_col=target_org_col
        self.target_org = target_org
        self.type_col = type_col
        self.unit_col = unit_col
        self.active_col = active_col
        self.relate_col = relate_col
        self.type_arg = type_arg
        self.equal_only = equal_only
        self.thresh = thresh
        self.MW = MW
        
    def target_filter(self, data, target_name_col, target_name, target_org_col, target_org):
        df= data[data[target_name_col]==target_name]
        display(df.shape)
        df2 = df[df[target_org_col]==target_org]
        display(df2.shape)
        return df2
        
    def standardize_value(self, data, type_col, type_arg, unit_col):
        df = data[data[type_col]==type_arg]
        df= df.dropna(subset =unit_col)
        df.reset_index(drop=True, inplace = True)
        type = ['nM', 'uM', 'mM']
        idx = []
        for key, value in enumerate(df[unit_col]):
            if value in type:
                idx.append(key)
        df = df.iloc[idx,:]
        return df
    
    def convert_activity(self, data, active_col, unit_col, MW_col):
        df = data.copy()
        df['pChEMBL'] = np.zeros(len(df))
        #unit = df['Unit'].unique()

        for key, value in enumerate(df[unit_col]):
            conc_value = df.loc[key, active_col]
            mw_value   = df.loc[key, MW_col]
            if value == 'μM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-6)
            elif value  == 'uM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-6)
            elif value  == 'nM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-9)
            elif value  == 'nmol/l':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-9)
            elif value  == 'mM':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-3)
            elif value  == 'M':
                df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1)
            elif value.lower() in ('µg/ml', 'ug/ml', 'ug.ml-1'):
                # µg/mL → g/L: conc_value * 1e-3
                #       → mol/L: (conc_value * 1e-3) / MW
                if mw_value <= 0:
                    # Nếu MW không hợp lệ (<=0), không thể tính pChEMBL
                    df.loc[key, 'pChEMBL'] = np.nan
                else:
                    conc_molar = (conc_value * 1e-3) / mw_value
                    # Tránh log của giá trị <= 0
                    if conc_molar > 0:
                        df.loc[key, 'pChEMBL'] = -np.log10(conc_molar)
                    else:
                        df.loc[key, 'pChEMBL'] = np.nan
                        
            elif value  == 'no unit':
                df.loc[key, 'pChEMBL'] = -df.loc[key, active_col]
        return df
    
    
    def standardize_relation(self, data,relate_col,  equal_only, thresh):
        df = data.copy()
        df.dropna(subset = relate_col, inplace = True)
        if equal_only == True:
            print('SELECTING ONLY EQUAL')
            df = df[df[relate_col]=="'='"]
            
        else:
            print('HANDLING')
            df_big = df[(df[relate_col] == "'>'") | (df[relate_col] == "'>='")]
            df_small = df[(df[relate_col] == "'<'") | (df[relate_col] == "'<='")]
            df_equal = df[df[relate_col]=="'='"]
                
            #Drop pCHEMBL < thresh for df_big
            drop_idx = df_big[df_big["pChEMBL"] < thresh].index
            df_big.drop(drop_idx, inplace = True)
                
            #Drop pCHEMBL > thresh for df_small
            drop_idx = df_small[df_small["pChEMBL"] > thresh].index
            df_small.drop(drop_idx, inplace = True)
                
            df = pd.concat((df_equal, df_small, df_big), axis = 0)
        return df
                  
    def curated_fit(self):
        print("Number of data before target curation:", self.data.shape[0])
        df = self.target_filter(data = self.data, target_name_col = self.target_name_col, target_name =self.target_name, 
                           target_org_col=self.target_org_col, target_org = self.target_org)
        df.reset_index(drop=True, inplace = True)
        print("Number of data after handle organism and target name:", df.shape[0])
        df1 = self.standardize_value(data=df, type_col=self.type_col, type_arg=self.type_arg, unit_col=self.unit_col)
        df1.reset_index(drop=True, inplace = True)
        print("Number of data after select unit:", df1.shape[0])
        #display(df1.head(5))
        df2 = self.convert_activity(data=df1, active_col=self.active_col, unit_col = self.unit_col, MW_col=self.MW)
        df2.reset_index(drop=True, inplace = True)
        #display(df2.head(5))
        df3 = self.standardize_relation(data=df2,relate_col=self.relate_col,  equal_only=self.equal_only, thresh=self.thresh)
        self.df = df3
        self.df.reset_index(drop=True, inplace = True)
        print("Number of data after standardizing:", self.df.shape[0])
        

In [3]:
from rdkit import Chem

class smile_curate:
    def __init__(self, data, smile_col, pchem_col, keep = 'best'):
        self.data = data
        self.smile_col = smile_col
        self.pchem_col = pchem_col
        self.keep = keep
    def smile_norm(self, data, smile_col):
        df = data.dropna(subset = smile_col)
        def safe_canonical(smiles):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # Nếu RDKit không parse được, trả NaN
                    return np.nan
                # MolToSmiles với useChiral=True để lưu thông tin stereo nếu có
                return Chem.MolToSmiles(mol, isomericSmiles=True)
            except Exception:
                # Bất kỳ lỗi nào khác cũng trả NaN
                return np.nan

        # Áp dụng hàm safe_canonical lên từng dòng
        df['Canonical_Smiles'] = df[smile_col].apply(safe_canonical)
        
        return df

    def curate(self):
        df = self.smile_norm(data=self.data, smile_col=self.smile_col)
        if self.keep == 'best':
            df = df.sort_values(by=self.pchem_col, ascending=False)
            df_dropdup = df.drop_duplicates(subset=['Canonical_Smiles'], keep="first")
        elif self.keep == 'worst':
            df = df.sort_values(by=self.pchem_col, ascending=True)
            df_dropdup = df.drop_duplicates(subset=['Canonical_Smiles'], keep="first")
           
        print(df_dropdup.shape)
        self.df = df_dropdup

In [4]:
class assay_curate:
    def __init__(self,data, type_col, org_col,des_col, type_arg='F', org_arg='Homo sapiens', kw = 'MTT'):
        self.data = data
        self.type_col = type_col
        self.org_col = org_col
        self.des_col = des_col
        self.type_arg= type_arg
        self.org_arg= org_arg
        self.kw = kw 
    
    def search_kw(self, data,kw, des_col):
        index = []
        for key, value in enumerate(data[des_col]):
            if kw in value:
                index.append(key)
        return data.iloc[index,:]
    
    def curated_fit(self):
        print("Number of data befor standardizing:", self.data.shape[0])
        df = self.data[self.data[self.type_col]==self.type_arg]
        print("Number of data after choosing assay type:", df.shape[0])
        df2 = df[df[self.org_col]==self.org_arg]
        print("Number of data after choosing assay organism:", df2.shape[0])
        df3 = self.search_kw(data=df2, kw = self.kw, des_col = self.des_col)
        print("Number of data after curating:", df3.shape[0])
        self.df = df3

# Data curation

## 1. Load data

In [26]:
data = pd.read_csv('Data/assay/SA.csv', sep=';')
data.head(5)

  data = pd.read_csv('Data/assay/SA.csv', sep=';')


Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL279571,,,1083.36,,,TS-30663,CCCCCCCCCCCCO[C@H](COP(=O)(O)OC1OC(C(N)=O)C(O)...,MIC,'=',...,CHEMBL1132768,1,Scientific Literature,Bioorg Med Chem Lett,2000.0,,,,,6.25
1,CHEMBL301400,,,308.38,0.0,4.38,24,CCCCCCCCCC(=O)Nc1cc([N+](=O)[O-])ccc1O,Activity,,...,CHEMBL1122485,1,Scientific Literature,J Med Chem,1983.0,,,,,
2,CHEMBL301400,,,308.38,0.0,4.38,24,CCCCCCCCCC(=O)Nc1cc([N+](=O)[O-])ccc1O,Activity,,...,CHEMBL1122485,1,Scientific Literature,J Med Chem,1983.0,,,,,
3,CHEMBL264617,,,514.55,2.0,-0.79,42,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,ED50,'<',...,CHEMBL1124865,1,Scientific Literature,J Med Chem,1990.0,,,,,0.6
4,CHEMBL2367884,,,1864.67,,,Iw,CCCCCCCCCC(=O)N[C@@H]1[C@H](Oc2c3cc4cc2Oc2ccc(...,MIC,'=',...,CHEMBL1124384,1,Scientific Literature,J Med Chem,1989.0,,,,,0.5


In [27]:
columns = ['Molecule ChEMBL ID', 'Molecular Weight', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units', 'pChEMBL Value', 'Data Validity Comment',
          'Assay Type', 'Assay Description','Assay Organism', 'Assay Variant Mutation','Target Name','Target Organism', 'Document Journal']

df = data[columns]
df.head(5)

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
0,CHEMBL279571,1083.36,CCCCCCCCCCCCO[C@H](COP(=O)(O)OC1OC(C(N)=O)C(O)...,MIC,'=',6.25,ug.mL-1,,,F,Compound was evaluated for its antibacterial a...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
1,CHEMBL301400,308.38,CCCCCCCCCC(=O)Nc1cc([N+](=O)[O-])ccc1O,Activity,,,,,,F,Bacteriostatic activity against Staphylococcus...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
2,CHEMBL301400,308.38,CCCCCCCCCC(=O)Nc1cc([N+](=O)[O-])ccc1O,Activity,,,,,,F,Bactericidal activity against Staphylococcus a...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
3,CHEMBL264617,514.55,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,ED50,'<',0.6,mg.kg-1,,,F,In vivo efficacy against Staphylococcus aureus...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
4,CHEMBL2367884,1864.67,CCCCCCCCCC(=O)N[C@@H]1[C@H](Oc2c3cc4cc2Oc2ccc(...,MIC,'=',0.5,ug.mL-1,,,F,Minimal inhibitory concentration of compound a...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem


In [28]:
df[df['Molecule ChEMBL ID']=='CHEMBL131854']

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
8,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.5,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
39430,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.25,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
117483,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,MIC,'=',0.5,ug.mL-1,,,F,In vitro minimum inhibitory concentration agai...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
157618,CHEMBL131854,465.41,O=C([C@@H](O)CO)N1CC=C(c2c(F)cc(N3C[C@H](COc4c...,Activity,'=',0.98,,,,F,In vivo antibacterial activity against Staphyl...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett


In [29]:
nul = df[df['Standard Value'].isnull()].index

In [30]:
df.drop(nul, axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(nul, axis=0, inplace=True)


Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal
0,CHEMBL279571,1083.36,CCCCCCCCCCCCO[C@H](COP(=O)(O)OC1OC(C(N)=O)C(O)...,MIC,'=',6.25,ug.mL-1,,,F,Compound was evaluated for its antibacterial a...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
1,CHEMBL264617,514.55,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,ED50,'<',0.60,mg.kg-1,,,F,In vivo efficacy against Staphylococcus aureus...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
2,CHEMBL2367884,1864.67,CCCCCCCCCC(=O)N[C@@H]1[C@H](Oc2c3cc4cc2Oc2ccc(...,MIC,'=',0.50,ug.mL-1,,,F,Minimal inhibitory concentration of compound a...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
3,CHEMBL42305,646.67,NCC[C@@H]1NC(=O)CCNC(=O)c2cc(NC(=O)Cn3cnc4c(O)...,MIC,'>',100000.00,nM,,,F,Minimum inhibitory concentration (MIC) against...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
4,CHEMBL65904,540.58,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[n...,MIC,'=',1.00,ug.mL-1,,,F,Antibacterial activity against penicillin G su...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199324,CHEMBL1649722,1620.69,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...,MIC,'=',1.00,ug.mL-1,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Nat Prod
199325,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',0.25,ug.mL-1,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett
199326,CHEMBL3287379,408.34,O=C1C=CN(c2c(F)cc(N3C[C@H](CNc4ccon4)OC3=O)c(F...,ED50,'=',4.70,mg.kg-1,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,ACS Med Chem Lett
199327,CHEMBL9,319.34,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21,FC,'=',128.00,,,,F,Induction of resistance in Staphylococcus aure...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem


In [31]:
df['Standard Value'] = df['Standard Value'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Standard Value'] = df['Standard Value'].astype(float)


In [None]:
df['Standard Units'].unique() 

array(['ug.mL-1', 'mg.kg-1', 'nM', 'mg kg-1', nan, 'uM', 'mm', 'ug ml-1',
       '%', 'p.p.m.', 'mM', 'hr', 'mg/L', "10'-3uM/ml", 'ug',
       "10'-2 umol/ml", 'ug cm**-2', 'mg/kg/day', 'U ml-1', 'mM l-1',
       'ug mg-1', 'cm', 'cm2', 'umol/L', 'mg kg-1 day-1', 'mg Kg-1',
       'ng.hr.mL-1', 'nmol/mg', 'ppm', 'ng/mg', 'uM/ml', 'nm', 'a.u.',
       "10'-2micromol/ml", 'radii mm-1', 'log10CFU/ml', 'fold', 'umol/g',
       'ng', 'mm3', "10'4/ml", 'ug/sq.cm', "10'-2umol", 'mg/ml', 'mm2',
       'log10CFU', "10'3CFU/ml", 'cps', 'deltalog10CFU', 'CFU', 'ug kg-1',
       "10'10CFU", 'ug/g', 'CFU/ml', 'ug m1 l-1', 'mm/mg', 'mmol/ml',
       "10'3/ml", 'mg mouse-1', 'microg', 'deltalog10CFU/ml', 'um',
       'umol/ml', 'millimeter/mg/ml', 'CFU/g', "10'-3micromol/ml", 'mg',
       '10^2umol/ml', "10'6CFU", "10'-2microM", 'log10CFU/ml.hr', 'day',
       'log10CFU/g', 'umol/Kg', 'uL/ml', 'mg 2kg-1', 'microg/cm3',
       'ug/disk', '10^-3mM', "10'-2mmol/ml", '10^2CFU/ml', 'nmol',
       "10'9CFU

In [39]:
df[df['Standard Units']== 'uM']['Standard Type'].unique()

array(['MBC', 'MGC', 'Activity', 'MIC100', 'MBIC', 'MBIC90', 'MBEC',
       'MBC99.9', 'MIC99', 'IC80', 'LD', 'ED50', 'MBC>99.9', 'MIC95',
       'MBIC50', 'MIC75', 'MIC=>90', 'MIC=>95', 'MIC>90', 'MITC95', 'MEC',
       'MIC82', 'BIC', 'LD90', 'LD50', 'MBEC50', 'MBEC90', 'MBC=>99.9',
       'MBC50', 'MBC90', 'MIC=>80', 'MI50', 'INH'], dtype=object)

## 2. Target value standardize (Standard)

In [32]:
target = target_curate(data =df, target_name_col = 'Target Name', target_name ='Staphylococcus aureus', 
                       target_org_col='Target Organism', target_org = 'Staphylococcus aureus',
                            type_col='Standard Type', unit_col='Standard Units', active_col='Standard Value', 
                            relate_col='Standard Relation', type_arg ='MIC', MW='Molecular Weight',equal_only = True)
target.curated_fit()
df1 = target.df

Number of data before target curation: 199329


(199329, 16)

(199329, 16)

Number of data after handle organism and target name: 199329
Number of data after select unit: 17059
SELECTING ONLY EQUAL
Number of data after standardizing: 12892


  df.loc[key, 'pChEMBL'] = -np.log10(df.loc[key, active_col]*1e-9)


In [41]:
df1

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL
0,CHEMBL308762,400.39,Nc1ccc(-n2cc(C(=O)O)c(=O)c3cc(F)c(N4CCNCC4)cc3...,MIC,'=',970.0,nM,,,F,Antibacterial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,6.013228
1,CHEMBL478538,391.56,O=C(/C=C/c1ccc(OCCCCCCN2CCCCC2)cc1)c1ccccc1,MIC,'=',256000.0,nM,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,3.591760
2,CHEMBL195732,455.34,O=C1C(CCOCc2ccccc2)C(=O)N(c2ccc(Cl)cc2)N1c1ccc...,MIC,'=',200000.0,nM,,,F,Minimum inhibitory concentration against cell ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,3.698970
3,CHEMBL1824040,479.54,CC(=O)c1ccc2c(c1)N(CCN1CCC(NCc3ccc4c(n3)NC(=O)...,MIC,'=',1000.0,nM,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,6.000000
4,CHEMBL229361,509.71,CCCCCCCCCCCCCCCCCCOCC(O)COP(=O)([O-])OCC[N+](C...,MIC,'=',22000.0,nM,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem,4.657577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12887,CHEMBL1082,365.41,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,MIC,'=',17000.0,nM,,,F,Antibacterial activity against gentamicin-and ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,4.769551
12888,CHEMBL4286774,686.81,COc1cc(/C=C/c2cc(/C=C/c3ccc(OCC(=O)N[C@@H](C)c...,MIC,'=',10000.0,nM,,,F,Antimicrobial activity against Methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,5.000000
12889,CHEMBL8,331.35,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MIC,'=',3010.0,nM,,,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,RSC Med Chem,5.521434
12890,CHEMBL4445375,552.62,COc1ccc(CCC(=O)c2c(O)cc(O)c3c2OC(c2ccc(O)cc2)C...,MIC,'=',25000.0,nM,,,F,Antibacterial activity against methicillin res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem,4.602060


In [42]:
# Xóa vô cực âm, vô cực dương
df1['pChEMBL'] = df1['pChEMBL'].replace([np.inf, -np.inf], np.nan)
df1.dropna(subset=['pChEMBL'], inplace=True)

# Kiểm tra xem có giá trị vô cực dương hay không
has_inf = np.any(df1== np.inf)
print("Có giá trị vô cực dương:", has_inf)

# Kiểm tra xem có giá trị vô cực âm hay không
has_neg_inf = np.any(df1== -np.inf)
print("Có giá trị vô cực âm:", has_neg_inf)
# Tìm giá trị min và max
min_value = df1['pChEMBL'].min()
max_value = df1['pChEMBL'].max()

# In kết quả
print(f"Giá trị min của df1['pChEMBL']: {min_value}")
print(f"Giá trị max của df1['pChEMBL']: {max_value}")

df1.shape

Có giá trị vô cực dương: False
Có giá trị vô cực âm: False
Giá trị min của df1['pChEMBL']: -0.8388490907372553
Giá trị max của df1['pChEMBL']: 11.698970004336019


(12891, 17)

## Assay

In [43]:
df1['Assay Type'].unique()

array(['F', 'A'], dtype=object)

In [44]:
df1['Assay Organism'].unique()

array(['Staphylococcus aureus'], dtype=object)

In [45]:
assay = assay_curate(data=df1, type_col="Assay Type", org_col="Assay Organism",des_col='Assay Description', 
                     type_arg='F', org_arg='Staphylococcus aureus', kw = '' )

assay.curated_fit()

Number of data befor standardizing: 12891
Number of data after choosing assay type: 12862
Number of data after choosing assay organism: 12862
Number of data after curating: 12862


In [46]:
df2 = assay.df
df2.shape

(12862, 17)

In [47]:
# bỏ giá trị pChEMBL =0 và NaN
df3 = df2[df2['pChEMBL'] !=0]
df3.dropna(subset=['pChEMBL'], inplace=True)
df3.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.dropna(subset=['pChEMBL'], inplace=True)


(12857, 17)

In [48]:
df3.dropna(subset=['Smiles'], inplace=True)
df3.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.dropna(subset=['Smiles'], inplace=True)


(12850, 17)

In [49]:
df3[df3['Smiles']== None]

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL


## Smiles curation

In [50]:
smile = smile_curate(data=df3,smile_col='Smiles', pchem_col='pChEMBL', keep = 'best')
smile.curate()

(5986, 18)


In [51]:
df4 = smile.df
df4.shape


(5986, 18)

In [53]:
df4.dropna(subset=['Canonical_Smiles'], inplace=True)
df4.shape

(5986, 18)

In [55]:
df4['Standard Units'].unique()

array(['nM'], dtype=object)

## Save

In [56]:
df4.to_csv('Data/assay/SA_MIC_nM_pchem.csv', index=False)

In [64]:
data = pd.read_csv('Data/assay/SA_MIC_nM_pchem.csv')
data

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL262242,658.80,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...,MIC,'=',2.000000e-03,nM,,Outside typical range,F,Antibacterial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,11.698970,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...
1,CHEMBL4562804,809.87,Cc1ccc(NC(=O)N[C@@H](Cc2cc(F)cc(F)c2)C(=O)N[C@...,MIC,'=',2.000000e-01,nM,,Outside typical range,F,Antimicrobial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,,9.698970,Cc1ccc(NC(=O)N[C@@H](Cc2cc(F)cc(F)c2)C(=O)N[C@...
2,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,9.522879,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
3,CHEMBL5271595,534.93,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4ccnc4...,MIC,'=',3.900000e-01,nM,,Outside typical range,F,Antimicrobial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,9.408935,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4ccnc4...
4,CHEMBL5291144,534.93,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4cnc([...,MIC,'=',3.900000e-01,nM,,Outside typical range,F,Antimicrobial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,9.408935,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4cnc([...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,CHEMBL2019058,514.53,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...,MIC,'=',1.600000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0.795880,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...
5982,CHEMBL364713,413.43,COc1ccc2c(c1OC)C(=O)O[C@@H]2[C@H]1c2c(cc3c(c2O...,MIC,'=',2.070000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0.684030,COc1ccc2c(c1OC)C(=O)O[C@@H]2[C@H]1c2c(cc3c(c2O...
5983,CHEMBL2018971,532.98,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...,MIC,'=',3.200000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0.494850,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...
5984,CHEMBL3215522,1539.57,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...,MIC,'=',2.700000e+09,nM,,Outside typical range,F,The compound was evaluated for its inhibitory ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,-0.431364,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...


In [None]:
data['Standard Value'].max()

6900000000.0

In [66]:
smiles_col, id_col, activity_col = "Smiles", "Molecule ChEMBL ID", "pChEMBL"
thresh = 5

In [67]:
t1 = data[activity_col] < thresh 
data.loc[t1, activity_col] = 1
t2 = data[activity_col] >= thresh 
data.loc[t2, activity_col] = 0
data[activity_col] = data[activity_col].astype('int64')

In [68]:
data

Unnamed: 0,Molecule ChEMBL ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,Canonical_Smiles
0,CHEMBL262242,658.80,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...,MIC,'=',2.000000e-03,nM,,Outside typical range,F,Antibacterial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,0,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...
1,CHEMBL4562804,809.87,Cc1ccc(NC(=O)N[C@@H](Cc2cc(F)cc(F)c2)C(=O)N[C@...,MIC,'=',2.000000e-01,nM,,Outside typical range,F,Antimicrobial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,,0,Cc1ccc(NC(=O)N[C@@H](Cc2cc(F)cc(F)c2)C(=O)N[C@...
2,CHEMBL4483807,384.55,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1,MIC,'=',3.000000e-01,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,0,Cc1ccc2c(c1)-c1c(ssc1=S)C(C)(C)N2C(=O)c1cccnc1
3,CHEMBL5271595,534.93,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4ccnc4...,MIC,'=',3.900000e-01,nM,,Outside typical range,F,Antimicrobial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,0,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4ccnc4...
4,CHEMBL5291144,534.93,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4cnc([...,MIC,'=',3.900000e-01,nM,,Outside typical range,F,Antimicrobial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Eur J Med Chem,0,O=C(O)c1cn(C2CC2)c2c(Cl)c(N3CCC(NCC(O)Cn4cnc([...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5981,CHEMBL2019058,514.53,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...,MIC,'=',1.600000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,1,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...
5982,CHEMBL364713,413.43,COc1ccc2c(c1OC)C(=O)O[C@@H]2[C@H]1c2c(cc3c(c2O...,MIC,'=',2.070000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,1,COc1ccc2c(c1OC)C(=O)O[C@@H]2[C@H]1c2c(cc3c(c2O...
5983,CHEMBL2018971,532.98,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...,MIC,'=',3.200000e+08,nM,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,1,COc1cc(/C=C/C(=O)C2=C(/C=C/c3ccc(O)c(OC)c3)NC(...
5984,CHEMBL3215522,1539.57,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...,MIC,'=',2.700000e+09,nM,,Outside typical range,F,The compound was evaluated for its inhibitory ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,1,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...


In [69]:
data.to_csv('Data/assay/SA_MIC_nM_pchem_class.csv', index=False)

# ProQSAR

In [3]:
import pandas as pd

inhouse = pd.read_csv('Data/assay/Inhouse_lib.csv')
inhouse

Unnamed: 0,ID,Smiles
0,L001,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...
156,L157,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
157,Vancomycin,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...
158,Ciprofloxacin,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
159,Methicillin,COc1cccc(OC)c1C(=O)N[C@@H]1C(=O)N2[C@@H]1SC(C)...


In [10]:
sa = pd.read_csv('Data/assay/SA_MIC_pchem_class.csv')
sa

Unnamed: 0,ID,Molecular Weight,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Assay Type,Assay Description,Assay Organism,Assay Variant Mutation,Target Name,Target Organism,Document Journal,pChEMBL,standardized_Smiles
0,CHEMBL262242,658.80,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...,MIC,'=',2.000000e-03,nM,,Outside typical range,F,Antibacterial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,1,COc1cccc(C(=O)Nc2cc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O...
1,CHEMBL4466320,455.42,O=C1c2ccccc2C(=O)c2c3c(cc(O)c21)[C@@]12O[C@@]1...,MIC,'=',2.000000e-06,ug.mL-1,,Outside typical range,F,Antimicrobial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Nat Prod,1,O=C1c2ccccc2C(=O)c2c3c(cc(O)c21)[C@@]12O[C@@]1...
2,CHEMBL374478,822.95,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(c(/C...,MIC,'=',6.000000e-06,ug.mL-1,,Outside typical range,F,Antimicrobial activity against methicillin-res...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Antimicrob Agents Chemother,1,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(O)c(c(C=...
3,CHEMBL2424893,478.58,COc1ccc2nccc(NC(=O)[C@H]3CC[C@H](NCc4ccc5c(n4)...,MIC,'=',1.500000e-05,ug.mL-1,,Outside typical range,F,Antibacterial activity against methicillin and...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,1,COc1ccc2nccc(NC(=O)C3CCC(NCc4ccc5c(n4)NC(=O)CS...
4,CHEMBL2165064,463.51,N#Cc1ccc2ccc(=O)n(CCN3CC[C@H](NCc4cc5c(cn4)OCC...,MIC,'=',1.500000e-05,ug.mL-1,,Outside typical range,F,Antibacterial activity against methicillin and...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,J Med Chem,1,N#Cc1ccc2ccc(=O)n(CCN3CC[C@H](NCc4cc5c(cn4)OCC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44537,CHEMBL1086831,376.44,CCOC(=O)c1cc(-c2sc(-c3cccnc3)nc2-c2ccccc2)n[nH]1,MIC,'=',1.300000e+06,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Staph...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0,CCOC(=O)c1cc(-c2sc(-c3cccnc3)nc2-c2ccccc2)[nH]n1
44538,CHEMBL2367578,1582.60,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...,MIC,'=',6.900000e+09,nM,,Outside typical range,F,The compound was evaluated for its inhibitory ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0,C=C(C/C=C(\C)CCC=C(C)C)CCC(C)(C)/C=C/CC/C(C)=C...
44539,CHEMBL1086830,340.41,CCOC(=O)c1cc(-c2sc(-c3cccnc3)nc2C2CC2)n[nH]1,MIC,'=',4.000000e+06,ug.mL-1,,Outside typical range,F,Antibacterial activity against wild type Staph...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0,CCOC(=O)c1cc(-c2sc(-c3cccnc3)nc2C2CC2)[nH]n1
44540,CHEMBL134040,625.63,CCOC(=O)c1cn(CC)c2cc(N3CCN(C(=O)c4ccc(N5C[C@H]...,MIC,'=',8.000000e+06,ug.mL-1,,Outside typical range,F,Antibacterial activity against Staphylococcus ...,Staphylococcus aureus,,Staphylococcus aureus,Staphylococcus aureus,Bioorg Med Chem Lett,0,CCOC(=O)c1cn(CC)c2cc(N3CCN(C(=O)c4ccc(N5C[C@H]...


In [46]:
sa.rename(columns={'Molecule ChEMBL ID': 'ID'}, inplace=True)
sa.to_csv('Data/assay/SA_MIC_pchem_class.csv', index=False)

In [12]:
from ProQSAR.qsar import ProQSAR
from ProQSAR.Config.config import Config
from ProQSAR.Featurizer.feature_generator import FeatureGenerator
import matplotlib
matplotlib.use("Agg")

smiles_col, id_col, activity_col = "Smiles", "ID", "pChEMBL"
feature_type = FeatureGenerator.get_all_types()

config = Config(
    featurizer={"feature_types": 'FCFP6'},
    splitter={'test_size': 0.1, 'option': 'stratified_random'},
    feature_selector={'select_method': 'ExtraTreesClassifier'},
    optimizer={'n_trials': 100, 'deactivate': True}
)
qsar = ProQSAR(activity_col, id_col, smiles_col, n_jobs=4, n_splits=5, n_repeats=5, config=config, keep_all_test=True, keep_all_pred=True, project_name="SA_MIC_class_stratified_random")
qsar.run_all(data_dev=sa, data_pred=inhouse, alpha=[0.05, 0.1, 0.2])


[17:07:36] Tautomer enumeration stopped at 376 tautomers: max transforms reached
[17:07:37] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[17:07:37] Tautomer enumeration stopped at 476 tautomers: max transforms reached
[17:07:37] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[17:07:37] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[17:07:38] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[17:07:38] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[17:07:38] Tautomer enumeration stopped at 345 tautomers: max transforms reached
[17:07:38] Tautomer enumeration stopped at 386 tautomers: max transforms reached
[17:07:38] Can't kekulize mol.  Unkekulized atoms: 2 3 6 9 10 28 31 32 33 34 37 38 39 41 42 44 47 48 51
[17:07:38] Can't kekulize mol.  Unkekulized atoms: 2 3 6 9 10 28 31 32 33 34 37 38 39 41 42 44 47 48 51
[17:07:38] Can't kekulize mol.  Unkekulized atoms: 2 3 6 9 10 2

In [1]:
from ProQSAR.qsar import ProQSAR

smiles_col, id_col, activity_col = "Smiles", "ID", "pChEMBL"
qsar = ProQSAR(activity_col, id_col, smiles_col).load('Project/SA_MIC_class_stratified_random (Copy)/proqsar.pkl')
qsar.__dict__

  from .autonotebook import tqdm as notebook_tqdm


{'activity_col': 'pChEMBL',
 'id_col': 'ID',
 'smiles_col': 'Smiles',
 'mol_col': 'mol',
 'project_name': 'SA_MIC_class_stratified_random',
 'n_jobs': 4,
 'random_state': 42,
 'scoring_target': None,
 'scoring_list': None,
 'n_splits': 5,
 'n_repeats': 5,
 'keep_all_test': True,
 'keep_all_pred': True,
 'config': <ProQSAR.Config.config.Config at 0x727f898ab7d0>,
 'save_dir': 'Project/SA_MIC_class_stratified_random',
 'logger': <RootLogger root (INFO)>,
 'shape_summary': {'FCFP6': {'Data': {'train': {'original': (40087, 4098),
     'duplicate': (36598, 4098),
     'missing': (36598, 4098),
     'lowvar': (36598, 4098),
     'univ_outlier': (36598, 4098),
     'kbin': (36598, 4098),
     'multiv_outlier': (28259, 4098),
     'rescaler': (28259, 4098),
     'feature_selector (ExtraTreesClassifier)': (28259, 905)}}}},
 'optimaldata': OptimalDataset(scoring_target=None, scoring_list=None, n_splits=5, n_repeats=5, save_cv_report=True, cv_report_name='cv_report_datasets', visualize=None, save

In [4]:
qsar.predict(data_pred=inhouse, alpha=[0.05, 0.1, 0.2])

[23:16:44] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached


Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain
0,L001,1,"[0, 1]","[0, 1]",[],out
1,L002,1,"[0, 1]","[0, 1]",[],out
2,L003,1,"[0, 1]","[0, 1]",[],out
3,L004,1,"[0, 1]","[0, 1]",[],out
4,L005,0,"[0, 1]","[0, 1]",[],out
...,...,...,...,...,...,...
156,L157,1,"[0, 1]","[0, 1]",[],out
157,Vancomycin,1,[1],[1],[1],in
158,Ciprofloxacin,1,[1],[1],[1],in
159,Methicillin,1,[1],[1],[1],in


In [11]:
qsar.optimizer.n_trials = 250
qsar.model_dev.select_model = 'RandomForestClassifier'

In [12]:
qsar.optimize()



<ProQSAR.qsar.ProQSAR at 0x750cda1e1950>

In [13]:
report = qsar.model_dev.report
report[report['scoring'] == 'f1']

Unnamed: 0,scoring,cv_cycle,AdaBoostClassifier,CatBoostClassifier,DummyClassifier,ExtraTreesClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,MLPClassifier,RandomForestClassifier,RandomForestClassifier_opt_1,RandomForestClassifier_opt_2,SVC,XGBClassifier
56,f1,1,0.764342,0.839368,0.68665,0.859963,0.771126,0.850259,0.796379,0.844306,0.860157,0.849791,0.849286,0.848383,0.845681
57,f1,2,0.764394,0.838458,0.686498,0.859744,0.77338,0.852591,0.798983,0.847452,0.861088,0.847335,0.847481,0.84466,0.844994
58,f1,3,0.741391,0.832342,0.686498,0.859415,0.762235,0.849958,0.792072,0.848567,0.857337,0.843332,0.845017,0.842818,0.841227
59,f1,4,0.762637,0.83998,0.686498,0.860567,0.778987,0.855511,0.801782,0.850571,0.861964,0.855766,0.855225,0.84931,0.843054
60,f1,5,0.756719,0.83637,0.686578,0.859375,0.770368,0.850571,0.797428,0.847429,0.860007,0.847558,0.848117,0.850168,0.841675
61,f1,6,0.756597,0.839283,0.68665,0.863652,0.770414,0.850598,0.793157,0.846103,0.864185,0.855103,0.85565,0.848964,0.845269
62,f1,7,0.757931,0.836222,0.686498,0.855539,0.769124,0.845793,0.8,0.851609,0.857434,0.847588,0.848495,0.846944,0.833561
63,f1,8,0.759884,0.8423,0.686498,0.860696,0.775098,0.861088,0.801543,0.847019,0.860453,0.852049,0.850823,0.849649,0.849352
64,f1,9,0.756186,0.831464,0.686498,0.853166,0.762756,0.849132,0.791239,0.842653,0.854234,0.837242,0.836635,0.843739,0.834126
65,f1,10,0.769918,0.840684,0.686578,0.86309,0.776171,0.856999,0.8,0.855316,0.865727,0.856299,0.855951,0.850532,0.845865


In [5]:
import pandas as pd

cv_data = pd.read_csv("Project/SA_MIC_class_stratified_random/cv_report_model.csv")
cv_data

Unnamed: 0,scoring,cv_cycle,AdaBoostClassifier,CatBoostClassifier,DummyClassifier,ExtraTreesClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,MLPClassifier,RandomForestClassifier,SVC,XGBClassifier
0,accuracy,1,0.757254,0.832803,0.522824,0.854388,0.766631,0.841649,0.789101,0.840587,0.854565,0.840764,0.840057
1,accuracy,2,0.756016,0.831741,0.522647,0.852795,0.771054,0.844480,0.790163,0.836872,0.854034,0.835810,0.838110
2,accuracy,3,0.734253,0.825372,0.522647,0.852972,0.757608,0.841649,0.784678,0.842003,0.851557,0.835810,0.835103
3,accuracy,4,0.754069,0.833156,0.522647,0.853857,0.773708,0.847841,0.795294,0.842534,0.855272,0.841472,0.836695
4,accuracy,5,0.751725,0.829942,0.522739,0.853477,0.769067,0.842506,0.788179,0.838790,0.854185,0.842506,0.835427
...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,roc_auc,24,0.826714,0.905653,0.500000,0.917960,0.849240,0.904818,0.864562,0.904862,0.921211,0.909867,0.911141
248,roc_auc,25,0.831071,0.912346,0.500000,0.926880,0.848601,0.920089,0.863501,0.918313,0.931172,0.915485,0.916257
249,roc_auc,mean,0.827763,0.907220,0.500000,0.922276,0.845525,0.910626,0.863891,0.910762,0.925802,0.911457,0.911420
250,roc_auc,median,0.827716,0.907136,0.500000,0.922280,0.845660,0.911217,0.863501,0.911211,0.925567,0.911805,0.911141


In [38]:
qsar.optimaldata.report = cv_data
qsar.analysis()

  figure, axes = plt.subplots(
  max_fold_diff = variances_by_method.max() / variances_by_method.min()
  max_fold_diff = variances_by_method.max() / variances_by_method.min()
  max_fold_diff = variances_by_method.max() / variances_by_method.min()


In [3]:
import pandas as pd

a = pd.read_csv('Project/SA_MIC_class_stratified_random/cv_report_model.csv')
a[a['scoring'] == 'f1']
#a

Unnamed: 0,scoring,cv_cycle,AdaBoostClassifier,CatBoostClassifier,DummyClassifier,ExtraTreesClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,MLPClassifier,RandomForestClassifier,SVC,XGBClassifier
56,f1,1,0.764342,0.839368,0.68665,0.859963,0.771126,0.850259,0.796379,0.844306,0.860157,0.848383,0.845681
57,f1,2,0.764394,0.838458,0.686498,0.859744,0.77338,0.852591,0.798983,0.847452,0.861088,0.84466,0.844994
58,f1,3,0.741391,0.832342,0.686498,0.859415,0.762235,0.849958,0.792072,0.848567,0.857337,0.842818,0.841227
59,f1,4,0.762637,0.83998,0.686498,0.860567,0.778987,0.855511,0.801782,0.850571,0.861964,0.84931,0.843054
60,f1,5,0.756719,0.83637,0.686578,0.859375,0.770368,0.850571,0.797428,0.847429,0.860007,0.850168,0.841675
61,f1,6,0.756597,0.839283,0.68665,0.863652,0.770414,0.850598,0.793157,0.846103,0.864185,0.848964,0.845269
62,f1,7,0.757931,0.836222,0.686498,0.855539,0.769124,0.845793,0.8,0.851609,0.857434,0.846944,0.833561
63,f1,8,0.759884,0.8423,0.686498,0.860696,0.775098,0.861088,0.801543,0.847019,0.860453,0.849649,0.849352
64,f1,9,0.756186,0.831464,0.686498,0.853166,0.762756,0.849132,0.791239,0.842653,0.854234,0.843739,0.834126
65,f1,10,0.769918,0.840684,0.686578,0.86309,0.776171,0.856999,0.8,0.855316,0.865727,0.850532,0.845865


In [5]:
data_pred = pd.read_csv('Project/SA_MIC_class_stratified_random/PredResult/data_pred.csv')
data_pred

Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain
0,L001,1,[0 1],[0 1],[],out
1,L002,1,[0 1],[0 1],[],out
2,L003,1,[0 1],[0 1],[],out
3,L004,1,[0 1],[0 1],[],out
4,L005,0,[0 1],[0 1],[],out
...,...,...,...,...,...,...
156,L157,1,[0 1],[0 1],[],out
157,Vancomycin,1,[1],[1],[1],in
158,Ciprofloxacin,1,[1],[1],[1],in
159,Methicillin,1,[1],[1],[1],in


In [6]:
pred_data = data_pred.merge(inhouse, on='ID', how='left')
pred_data

Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain,Smiles
0,L001,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,1,[0 1],[0 1],[],out,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,1,[0 1],[0 1],[],out,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,0,[0 1],[0 1],[],out,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...,...,...,...,...,...
156,L157,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
157,Vancomycin,1,[1],[1],[1],in,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...
158,Ciprofloxacin,1,[1],[1],[1],in,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
159,Methicillin,1,[1],[1],[1],in,COc1cccc(OC)c1C(=O)N[C@@H]1C(=O)N2[C@@H]1SC(C)...


In [13]:
a5 = pred_data.loc[1, 'Prediction Interval (alpha=0.05)']
a5

'[3.572 6.338]'

In [10]:
import pandas as pd
import numpy as np

values = np.array(a5.strip('[]').split(), dtype=float)
values

AttributeError: 'numpy.ndarray' object has no attribute 'strip'

In [45]:
b = 10**(values) * 1e3
#b.sort()
b

array([2.80543364e+06, 9.88553095e+08])

In [23]:
from copy import deepcopy

df = deepcopy(pred_data)

In [14]:
import pandas as pd
import numpy as np

# Sample DataFrame


# Function to convert pChEMBL to mM
def pchembl_to_mM(value):
    return 10**(-value) * 1e3

# Apply conversion to Predicted value column
df['Predicted value (mM)'] = df['Predicted value'].apply(pchembl_to_mM)

# Convert Prediction Intervals from string to actual values
def convert_interval(interval_str):
    values = np.array(interval_str.strip('[]').split(), dtype=float)
    #values=interval_str
    return np.sort(pchembl_to_mM(values)).round(4)

df['Prediction Interval (alpha=0.05) (mM)'] = df['Prediction Interval (alpha=0.05)'].apply(convert_interval)
df['Prediction Interval (alpha=0.1) (mM)'] = df['Prediction Interval (alpha=0.1)'].apply(convert_interval)
df['Prediction Interval (alpha=0.2) (mM)'] = df['Prediction Interval (alpha=0.2)'].apply(convert_interval)

# Print the updated DataFrame
df

Unnamed: 0,ID,Predicted value,Prediction Interval (alpha=0.05),Prediction Interval (alpha=0.1),Prediction Interval (alpha=0.2),Applicability domain,Smiles,Predicted value (mM),Prediction Interval (alpha=0.05) (mM),Prediction Interval (alpha=0.1) (mM),Prediction Interval (alpha=0.2) (mM)
0,L001,4.759679,[3.666 6.412],[3.971 6.084],[4.255 5.791],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...,0.017391,"[0.0004, 0.2158]","[0.0008, 0.1069]","[0.0016, 0.0556]"
1,L002,4.866846,[3.572 6.338],[3.872 6.028],[4.153 5.721],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...,0.013588,"[0.0005, 0.2679]","[0.0009, 0.1343]","[0.0019, 0.0703]"
2,L003,4.845051,[3.576 6.289],[3.877 5.971],[4.154 5.676],in,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...,0.014287,"[0.0005, 0.2655]","[0.0011, 0.1327]","[0.0021, 0.0701]"
3,L004,4.823231,[3.568 6.348],[3.862 6.043],[4.147 5.738],in,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...,0.015023,"[0.0004, 0.2704]","[0.0009, 0.1374]","[0.0018, 0.0713]"
4,L005,4.783527,[3.598 6.413],[3.911 6.115],[4.201 5.808],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...,0.016462,"[0.0004, 0.2523]","[0.0008, 0.1227]","[0.0016, 0.063]"
...,...,...,...,...,...,...,...,...,...,...,...
152,L153,5.286132,[3.809 6.541],[4.128 6.248],[4.421 5.945],in,C[C@@H](O)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.005174,"[0.0003, 0.1552]","[0.0006, 0.0745]","[0.0011, 0.0379]"
153,L154,5.238542,[3.938 6.657],[4.233 6.374],[4.514 6.081],in,N=C(N)NCCC[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.005774,"[0.0002, 0.1153]","[0.0004, 0.0585]","[0.0008, 0.0306]"
154,L155,5.544348,[4.097 6.786],[4.386 6.482],[4.679 6.197],in,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=C(Br)C=C...,0.002855,"[0.0002, 0.08]","[0.0003, 0.0411]","[0.0006, 0.0209]"
155,L156,5.411586,[4.086 6.779],[4.385 6.478],[4.669 6.179],in,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=C(B...,0.003876,"[0.0002, 0.082]","[0.0003, 0.0412]","[0.0007, 0.0214]"


In [7]:
pred_data.to_csv('Project/SA_MIC_class_stratified_random/PredResult/data_pred_inhouse.csv', index=False)

In [8]:
pd.read_csv('Project/SA_MIC_class_stratified_random/PredResult/data_pred_inhouse.csv')

Unnamed: 0,ID,Predicted value,Prediction Set (alpha=0.05),Prediction Set (alpha=0.1),Prediction Set (alpha=0.2),Applicability domain,Smiles
0,L001,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
1,L002,1,[0 1],[0 1],[],out,OC1=CC=C(C=C1)C[C@@H](C(O)=O)NCC2=CN(C3=CC=CC(...
2,L003,1,[0 1],[0 1],[],out,CSCC[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+]([O-])=...
3,L004,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CC=CC=C1)NCC2=CN(C3=CC=CC([N+]...
4,L005,0,[0 1],[0 1],[],out,CC[C@H](C)[C@@H](C(O)=O)NCC1=CN(C2=CC=CC([N+](...
...,...,...,...,...,...,...,...
156,L157,1,[0 1],[0 1],[],out,O=C(O)[C@H](CC1=CNC2=C1C=CC=C2)NCC3=CN(C4=CC=C...
157,Vancomycin,1,[1],[1],[1],in,CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=...
158,Ciprofloxacin,1,[1],[1],[1],in,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O
159,Methicillin,1,[1],[1],[1],in,COc1cccc(OC)c1C(=O)N[C@@H]1C(=O)N2[C@@H]1SC(C)...
