# Create database of ready biodegradable structures

## Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import openpyxl

from math import log10
import time
from itertools import groupby
from IPython.display import Image
from functools import reduce

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.Chem.Draw.MolDrawing import DrawingOptions
DrawingOptions.includeAtomNumbers=True
from rdkit.Chem.Draw import rdMolDraw2D


In [2]:
# Root directory of the project. It has to contain the subfolders
# other_qsar_results_merged_vega, other_qsar_results_merged and
# in_silico_generation_copies; for the annex to work it also needs
# other_qsar_res_watsol, vega_addprop_chunks_res and VEGA_muta_all_res.
Projectdir = r"C:\Users\Hannah\OneDrive - Uva\Master UvA\master project\design_additions_after_msc"
# Make the project directory the working directory and report it.
os.chdir(Projectdir)
working_dir = os.getcwd()
print("Current Working Directory: ", working_dir)

Current Working Directory:  C:\Users\Hannah\OneDrive - Uva\Master UvA\master project\design_additions_after_msc


### General Functions and parameters

In [3]:
# Filter out ready biodegradable structures.
# Ready-biodegradability cut-off values as defined in the Biowin manual;
# both thresholds are applied inclusively (>=).
readyBw3 = 3.25
readyBw5 = 0.5


def ready_biodegradable(dfnew):
    """Return the rows of ``dfnew`` that are ready biodegradable.

    A row is kept when ``Biowin3 >= readyBw3`` and ``Biowin5 >= readyBw5``
    after both columns have been cast to float. Rows with a NaN in either
    column never satisfy the comparison and are therefore excluded.
    Row order and index of the input are preserved (no sorting is applied).

    Parameters
    ----------
    dfnew : pandas.DataFrame
        Table with at least the columns 'Biowin3' and 'Biowin5'.

    Returns
    -------
    pandas.DataFrame
        The rows that pass both cut-offs. When no row passes, an empty
        DataFrame with the input columns is returned (not ``None``), so that
        callers can still index or merge the result.
    """
    passes_bw3 = dfnew['Biowin3'].astype(float) >= readyBw3
    passes_bw5 = dfnew['Biowin5'].astype(float) >= readyBw5
    df = dfnew[passes_bw3 & passes_bw5]
    if df.empty:
        print('DataFrame is empty, no ready biodegradable molecules')
    else:
        print(len(df.index),'ready biodegradable molecules in table, out of ',len(dfnew.index),'total')
    # Always hand back a DataFrame, even when it is empty.
    return df

##### filter out subgroup containing mols#############

def smilestomollist(smileslist):
    """Parse every SMILES string in ``smileslist`` with RDKit.

    Returns a list with one entry per input, in input order, holding whatever
    ``Chem.MolFromSmiles`` returns for that string.
    """
    return [Chem.MolFromSmiles(smiles_string) for smiles_string in smileslist]

# SMARTS pattern for an enol: a hydroxyl oxygen ([OX2H]) attached to a
# three-connected carbon ([#6X3]) that is double-bonded to another carbon.
Enolsmarts = '[OX2H][#6X3]=[#6]'

def filteroutSmartsbool(mollist, Smarts):
    """Build a keep-mask for ``mollist`` with respect to a SMARTS pattern.

    ``mollist`` must contain RDKit molecules (not SMILES strings). The
    returned list holds one boolean per molecule, in input order: ``False``
    when the molecule matches ``Smarts`` (it is to be filtered out) and
    ``True`` otherwise. The number of matching molecules is printed.
    """
    query = Chem.MolFromSmarts(Smarts)
    keep_mask = [not mol.HasSubstructMatch(query) for mol in mollist]
    # Every False in the mask stands for one molecule that matched the pattern.
    n_removed = keep_mask.count(False)
    print(n_removed,'structures with specified substructure (smarts) were filtered out.')
    return keep_mask
   
    
def filteroutSmartsdf(dfin, smarts):
    """Return the rows of ``dfin`` whose structure does not match ``smarts``.

    The 'SMILES' column of ``dfin`` is parsed into RDKit molecules and every
    row whose molecule contains the given SMARTS substructure is dropped.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Table with a 'SMILES' column.
    smarts : str
        SMARTS pattern of the substructure to exclude.

    Returns
    -------
    pandas.DataFrame
        The remaining rows in their original order. When every molecule
        contains the substructure, an empty DataFrame is returned (not
        ``None``), so that callers can still merge or index the result.
    """
    keep_mask = filteroutSmartsbool(smilestomollist(dfin['SMILES']), smarts)
    df = dfin[keep_mask]
    print('Molecules without specified group in new DataFrame: ', len(df.index))
    if df.empty:
        print('DataFrame is empty, all molecules contained group')
    # Always hand back a DataFrame, even when it is empty.
    return df

## Create dataset of ready biodegradable structures from Biowin results for all generated structures

In [4]:
# Build one table holding the Biowin results of every SMILES.
# NOTE (original author): no S! biowin smiles notation problem
biowin_dir = Projectdir+'\\in_silico_generation_copies\\biowin_est'
biowin_files = os.listdir(biowin_dir)
# The result files are tab-separated and have no header row, so the column
# names are supplied here.
biowin_result_columns = ["Biowin1", "Biowin2", "Biowin3", "Biowin4",
                         "Biowin5", "Biowin6", "Biowin7", "SMILES"]
biowin_df_to_append = [
    pd.read_csv(biowin_dir + '\\' + file,
                sep="\t", header=None,
                names=biowin_result_columns)
    for file in biowin_files
]

# Stack the per-file tables on top of each other.
biowin_per_smiles = pd.concat(biowin_df_to_append)

In [5]:
# Keep only the molecules that are ready biodegradable according to Biowin 3
# and 5, and reorder the columns so that SMILES comes first.
ready_column_order = ["SMILES", "Biowin1", "Biowin2", "Biowin3", "Biowin4",
                      "Biowin5", "Biowin6", "Biowin7"]
smiles_biowin_ready_biodeg = ready_biodegradable(biowin_per_smiles)[ready_column_order]
smiles_biowin_ready_biodeg

46013 ready biodegradable molecules in table, out of  6362110 total


Unnamed: 0,SMILES,Biowin1,Biowin2,Biowin3,Biowin4,Biowin5,Biowin6,Biowin7
1607,CCCCOP(=O)(OCN)OCCCC,1.3183,1.0,3.4453,4.5493,0.5351,0.3799,0.9277
3438,CCCCCOP(=O)(OCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537
5726,CCCCOP(=O)(OCCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537
7396,CCCCCOP(=O)(OC)OCNCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741
7439,CCCCCNCOP(=O)(OC)OCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741
...,...,...,...,...,...,...,...,...
3087659,CC(O)OP1(=O)OC(O)C#CC(O)O1,1.4310,1.0,3.3376,4.3781,0.6582,0.4650,1.5949
3087660,O=P12OC(O)C=C(CC(O)O1)C(O)O2,1.4310,1.0,3.3376,4.3781,0.7468,0.5189,1.6862
3087670,O=P12OC(O)C3C(C(O)O1)C3C(O)O2,1.4310,1.0,3.3376,4.3781,0.7582,0.3594,1.9981
3087863,O=C(O)COP1(=O)OCC(=CO)CO1,1.1862,1.0,3.3822,4.5048,0.6375,0.5637,1.3951


### Filter out Enols
OMG generated structures containing enol moieties. These would tautomerize to their keto form (which is also included in the generated structures) and are thus generally not chemically stable, so they are filtered out.


In [6]:
# Drop every structure that matches the enol SMARTS pattern.
smiles_biowin_ready_biodeg_noenols = filteroutSmartsdf(
    dfin=smiles_biowin_ready_biodeg,
    smarts=Enolsmarts,
)

13663 structures with specified substructure (smarts) were filtered out.
Molecules without specified group in new DataFrame:  32350


### Import QSAR results for further properties
For the generated dataset, further properties were estimated with QSAR models from EPISUITE and VEGA. First, estimates from EPISUITE models are added to the database, as well as the mutagenicity consensus model results from VEGA (all results are in the 'other_qsar_results_merged' subfolder). While EPISUITE outputs SMILES identical to the input, VEGA unifies the SMILES format within its models, so that input and output SMILES are not identical. Therefore, for this one VEGA model result file, the input SMILES were matched to the output SMILES by index (see Annex 1a). Below, the mutagenicity results are, like the EPISUITE model results, added to the database by SMILES. Results from the other VEGA models are then added by vegaSMILES, the unified output SMILES format from VEGA.

In [7]:
# Import the QSAR results and join them to the enol-free ready-biodegradable table.
qsar_res_dir = Projectdir+'\\other_qsar_results_merged'
qsar_res_files = os.listdir(qsar_res_dir)
res_df_to_append = [smiles_biowin_ready_biodeg_noenols]
# Each result file is tab-separated; its first row is used as the header.
res_df_to_append.extend(
    pd.read_csv(qsar_res_dir + '\\' + file, sep="\t") for file in qsar_res_files
)

# Fold the tables together one after another, matching rows on the input SMILES.
smiles_readys_qsar_res = res_df_to_append[0]
for next_table in res_df_to_append[1:]:
    smiles_readys_qsar_res = pd.merge(smiles_readys_qsar_res, next_table, on='SMILES')
print(smiles_readys_qsar_res.columns)
smiles_readys_qsar_res

Index(['SMILES', 'Biowin1', 'Biowin2', 'Biowin3', 'Biowin4', 'Biowin5',
       'Biowin6', 'Biowin7', 'BCF', 'log BCF', 'Bio Half-life',
       'log Bio Half-life', 'kM rate const', 'BAF upper trophic',
       'log BAF upper trophic', 'log BAF middle trophic',
       'log BAF lower trophic', 'Koc', 'log Kow', 'No.', 'Id', 'vegaSMILES',
       'Assessment', 'Used models', 'Predicted Consensus Mutagen activity',
       'Mutagenic Score', 'Non-Mutagenic Score', 'Model Caesar assessment',
       'Model ISS assessment', 'Model SarPy assessment',
       'Model KNN assessment', 'Remarks'],
      dtype='object')


Unnamed: 0,SMILES,Biowin1,Biowin2,Biowin3,Biowin4,Biowin5,Biowin6,Biowin7,BCF,log BCF,...,Assessment,Used models,Predicted Consensus Mutagen activity,Mutagenic Score,Non-Mutagenic Score,Model Caesar assessment,Model ISS assessment,Model SarPy assessment,Model KNN assessment,Remarks
0,CCCCOP(=O)(OCN)OCCCC,1.3183,1.0,3.4453,4.5493,0.5351,0.3799,0.9277,0.6043,-0.22,...,NON-Mutagenic (Consensus score: 0.375),4,NON-Mutagenic,0.10,0.375,NON-Mutagenic (good reliability),Mutagenic (low reliability),Mutagenic (low reliability),NON-Mutagenic (moderate reliability),-
1,CCCCCOP(=O)(OCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537,1.2740,0.11,...,NON-Mutagenic (Consensus score: 0.525),4,NON-Mutagenic,0.05,0.525,NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),Mutagenic (low reliability),NON-Mutagenic (moderate reliability),-
2,CCCCOP(=O)(OCCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537,0.5531,-0.26,...,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.15,0.600,NON-Mutagenic (good reliability),Mutagenic (moderate reliability),Possible NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),-
3,CCCCCOP(=O)(OC)OCNCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741,2.5860,0.41,...,NON-Mutagenic (Consensus score: 0.35),4,NON-Mutagenic,0.05,0.350,NON-Mutagenic (moderate reliability),NON-Mutagenic (low reliability),Mutagenic (low reliability),NON-Mutagenic (moderate reliability),-
4,CCCCCNCOP(=O)(OC)OCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741,2.5860,0.41,...,NON-Mutagenic (Consensus score: 0.3),4,NON-Mutagenic,0.10,0.300,NON-Mutagenic (moderate reliability),Mutagenic (low reliability),Mutagenic (low reliability),NON-Mutagenic (moderate reliability),-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32345,O=C(O)C(O)OP1(=O)OCC=CCO1,1.1862,1.0,3.3822,4.5048,0.5436,0.4392,1.1297,3.1620,0.50,...,NON-Mutagenic (Consensus score: 0.45),4,NON-Mutagenic,0.05,0.450,Mutagenic (low reliability),NON-Mutagenic (moderate reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
32346,O=P1(OC(O)C(O)O)OCC#CCO1,1.4310,1.0,3.3376,4.3781,0.6217,0.6160,1.1897,3.1620,0.50,...,NON-Mutagenic (Consensus score: 0.5),4,NON-Mutagenic,0.00,0.500,NON-Mutagenic (moderate reliability),NON-Mutagenic (low reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
32347,CC(O)OP1(=O)OC(O)C#CC(O)O1,1.4310,1.0,3.3376,4.3781,0.6582,0.4650,1.5949,3.1620,0.50,...,NON-Mutagenic (Consensus score: 0.5),4,NON-Mutagenic,0.00,0.500,NON-Mutagenic (moderate reliability),NON-Mutagenic (low reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
32348,O=P12OC(O)C=C(CC(O)O1)C(O)O2,1.4310,1.0,3.3376,4.3781,0.7468,0.5189,1.6862,3.1620,0.50,...,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.00,0.600,NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-


In [8]:
# Join the VEGA model results to the database.
qsar_res_dir_vega = Projectdir+'\\other_qsar_results_merged_vega'
qsar_res_vega_files = os.listdir(qsar_res_dir_vega)
vega_res_df_to_append = [smiles_readys_qsar_res]
# Each result file is tab-separated; its first row is used as the header.
vega_res_df_to_append.extend(
    pd.read_csv(qsar_res_dir_vega + '\\' + file, sep="\t") for file in qsar_res_vega_files
)

# Fold the tables together one after another, matching rows on the SMILES
# notation that VEGA outputs.
smiles_readys_qsar_res_incl_vega = vega_res_df_to_append[0]
for next_table in vega_res_df_to_append[1:]:
    smiles_readys_qsar_res_incl_vega = pd.merge(
        smiles_readys_qsar_res_incl_vega, next_table, on='vegaSMILES'
    )
print(smiles_readys_qsar_res_incl_vega.columns)
smiles_readys_qsar_res_incl_vega

Index(['SMILES', 'Biowin1', 'Biowin2', 'Biowin3', 'Biowin4', 'Biowin5',
       'Biowin6', 'Biowin7', 'BCF', 'log BCF', 'Bio Half-life',
       'log Bio Half-life', 'kM rate const', 'BAF upper trophic',
       'log BAF upper trophic', 'log BAF middle trophic',
       'log BAF lower trophic', 'Koc', 'log Kow', 'No._x', 'Id_x',
       'vegaSMILES', 'Assessment', 'Used models',
       'Predicted Consensus Mutagen activity', 'Mutagenic Score',
       'Non-Mutagenic Score', 'Model Caesar assessment',
       'Model ISS assessment', 'Model SarPy assessment',
       'Model KNN assessment', 'Remarks', 'No._y', 'Id_y',
       'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - assessment',
       'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - prediction',
       'Androgen Receptor-mediated effect (IRFMN/COMPARA) - assessment',
       'Androgen Receptor-mediated effect (IRFMN/COMPARA) - prediction',
       'Thyroid Receptor Alpha effect (NRMEA) - assessment',
       'Thyroid Receptor Alpha ef

Unnamed: 0,SMILES,Biowin1,Biowin2,Biowin3,Biowin4,Biowin5,Biowin6,Biowin7,BCF,log BCF,...,Androgen Receptor-mediated effect (IRFMN/COMPARA) - assessment,Androgen Receptor-mediated effect (IRFMN/COMPARA) - prediction,Thyroid Receptor Alpha effect (NRMEA) - assessment,Thyroid Receptor Alpha effect (NRMEA) - prediction,Thyroid Receptor Beta effect (NRMEA) - assessment,Thyroid Receptor Beta effect (NRMEA) - prediction,No.,Id,KOC model (OPERA) - assessment,KOC model (OPERA) - prediction [log(L/Kg)]
0,CCCCOP(=O)(OCN)OCCCC,1.3183,1.0,3.4453,4.5493,0.5351,0.3799,0.9277,0.6043,-0.22,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,53,Molecule 53,2.4243 log(L/Kg) (low reliability),2.4243
1,CCCCCOP(=O)(OCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537,1.2740,0.11,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,54,Molecule 54,2.3714 log(L/Kg) (low reliability),2.3714
2,CCCCOP(=O)(OCCN)OCCCC,1.3116,1.0,3.4143,4.5291,0.5428,0.3857,0.9537,0.5531,-0.26,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,55,Molecule 55,2.4166 log(L/Kg) (moderate reliability),2.4166
3,CCCCCOP(=O)(OC)OCNCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741,2.5860,0.41,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,56,Molecule 56,2.7467 log(L/Kg) (moderate reliability),2.7467
4,CCCCCNCOP(=O)(OC)OCCCC,1.3049,1.0,3.3833,4.5088,0.5015,0.2992,0.8741,2.5860,0.41,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,57,Molecule 57,2.7454 log(L/Kg) (moderate reliability),2.7454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32345,O=C(O)C(O)OP1(=O)OCC=CCO1,1.1862,1.0,3.3822,4.5048,0.5436,0.4392,1.1297,3.1620,0.50,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,2346,Molecule 2346,1.7404 log(L/Kg) (low reliability),1.7404
32346,O=P1(OC(O)C(O)O)OCC#CCO1,1.4310,1.0,3.3376,4.3781,0.6217,0.6160,1.1897,3.1620,0.50,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,2347,Molecule 2347,1.9137 log(L/Kg) (low reliability),1.9137
32347,CC(O)OP1(=O)OC(O)C#CC(O)O1,1.4310,1.0,3.3376,4.3781,0.6582,0.4650,1.5949,3.1620,0.50,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,2348,Molecule 2348,1.9226 log(L/Kg) (low reliability),1.9226
32348,O=P12OC(O)C=C(CC(O)O1)C(O)O2,1.4310,1.0,3.3376,4.3781,0.7468,0.5189,1.6862,3.1620,0.50,...,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive,2349,Molecule 2349,1.8352 log(L/Kg) (low reliability),1.8352


### Final transformations for dataset for MAUT assessment
The Koc output by EPISUITE KOCWIN is transformed to log Koc; InChI and InChIKeys are added to the dataset as unique identifiers for searchability; and finally, the relevant columns for the MAUT assessment are selected and the dataset is exported.

In [9]:
# Store the base-10 logarithm of the EPISUITE Koc estimate as 'log koc'.
koc_values = smiles_readys_qsar_res_incl_vega['Koc']
smiles_readys_qsar_res_incl_vega['log koc'] = np.log10(koc_values)

In [10]:
# Add InChI strings and InChIKeys for searchability of the data.
# Each SMILES is parsed only once and the resulting molecule is reused for
# both identifiers. Iterating over the column values (instead of looking up
# every index label separately) keeps the new columns aligned row by row.
_mols_for_inchi = [
    Chem.MolFromSmiles(smiles)
    for smiles in smiles_readys_qsar_res_incl_vega['SMILES']
]
smiles_readys_qsar_res_incl_vega['Inchi'] = [Chem.MolToInchi(mol) for mol in _mols_for_inchi]
smiles_readys_qsar_res_incl_vega['Inchikey'] = [Chem.MolToInchiKey(mol) for mol in _mols_for_inchi]

In [11]:
# Pick the columns that are relevant for the MAUT assessment, grouped by topic.
maut_identifier_columns = ['SMILES', 'Inchi', 'Inchikey', 'vegaSMILES']
maut_partitioning_columns = ['log Kow', 'log koc', 'BCF']
maut_mutagenicity_columns = [
    'Assessment',
    'Used models',
    'Predicted Consensus Mutagen activity',
    'Mutagenic Score',
    'Non-Mutagenic Score',
    'Model Caesar assessment',
    'Model ISS assessment',
    'Model SarPy assessment',
    'Model KNN assessment',
    'Remarks',
]
maut_biowin_columns = ['Biowin1', 'Biowin2', 'Biowin3', 'Biowin4',
                       'Biowin5', 'Biowin6', 'Biowin7']
maut_opera_koc_columns = [
    'KOC model (OPERA) - assessment',
    'KOC model (OPERA) - prediction [log(L/Kg)]',
]
maut_receptor_columns = [
    'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - assessment',
    'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - prediction',
    'Androgen Receptor-mediated effect (IRFMN/COMPARA) - assessment',
    'Androgen Receptor-mediated effect (IRFMN/COMPARA) - prediction',
    'Thyroid Receptor Alpha effect (NRMEA) - assessment',
    'Thyroid Receptor Alpha effect (NRMEA) - prediction',
    'Thyroid Receptor Beta effect (NRMEA) - assessment',
    'Thyroid Receptor Beta effect (NRMEA) - prediction',
]
# The concatenation order below is the column order of the exported table.
maut_columns = (
    maut_identifier_columns
    + maut_partitioning_columns
    + maut_mutagenicity_columns
    + maut_biowin_columns
    + maut_opera_koc_columns
    + maut_receptor_columns
)
smiles_properties = smiles_readys_qsar_res_incl_vega[maut_columns]
smiles_properties

Unnamed: 0,SMILES,Inchi,Inchikey,vegaSMILES,log Kow,log koc,BCF,Assessment,Used models,Predicted Consensus Mutagen activity,...,KOC model (OPERA) - assessment,KOC model (OPERA) - prediction [log(L/Kg)],Estrogen Receptor-mediated effect (IRFMN/CERAPP) - assessment,Estrogen Receptor-mediated effect (IRFMN/CERAPP) - prediction,Androgen Receptor-mediated effect (IRFMN/COMPARA) - assessment,Androgen Receptor-mediated effect (IRFMN/COMPARA) - prediction,Thyroid Receptor Alpha effect (NRMEA) - assessment,Thyroid Receptor Alpha effect (NRMEA) - prediction,Thyroid Receptor Beta effect (NRMEA) - assessment,Thyroid Receptor Beta effect (NRMEA) - prediction
0,CCCCOP(=O)(OCN)OCCCC,"InChI=1S/C9H22NO4P/c1-3-5-7-12-15(11,14-9-10)1...",FQZURDOEFGTKDH-UHFFFAOYSA-N,O=P(OCN)(OCCCC)OCCCC,1.42,2.637089,0.6043,NON-Mutagenic (Consensus score: 0.375),4,NON-Mutagenic,...,2.4243 log(L/Kg) (low reliability),2.4243,NON-active (low reliability),NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
1,CCCCCOP(=O)(OCN)OCCCC,"InChI=1S/C10H24NO4P/c1-3-5-7-9-14-16(12,15-10-...",HFYGOQQKIGXYQR-UHFFFAOYSA-N,O=P(OCN)(OCCCC)OCCCCC,1.91,2.897737,1.2740,NON-Mutagenic (Consensus score: 0.525),4,NON-Mutagenic,...,2.3714 log(L/Kg) (low reliability),2.3714,NON-active (low reliability),NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
2,CCCCOP(=O)(OCCN)OCCCC,"InChI=1S/C10H24NO4P/c1-3-5-8-13-16(12,15-10-7-...",JDDLUVXBUHFTCI-UHFFFAOYSA-N,O=P(OCCN)(OCCCC)OCCCC,1.37,2.897737,0.5531,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,...,2.4166 log(L/Kg) (moderate reliability),2.4166,Not predicted (low reliability),Not predicted,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
3,CCCCCOP(=O)(OC)OCNCCCC,"InChI=1S/C11H26NO4P/c1-4-6-8-10-15-17(13,14-3)...",PZRNSLQIRKYCIG-UHFFFAOYSA-N,O=P(OC)(OCNCCCC)OCCCCC,2.38,2.945665,2.5860,NON-Mutagenic (Consensus score: 0.35),4,NON-Mutagenic,...,2.7467 log(L/Kg) (moderate reliability),2.7467,NON-active (moderate reliability),NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
4,CCCCCNCOP(=O)(OC)OCCCC,"InChI=1S/C11H26NO4P/c1-4-6-8-9-12-11-16-17(13,...",FIXQROCYSXZKQD-UHFFFAOYSA-N,O=P(OC)(OCNCCCCC)OCCCC,2.38,2.945665,2.5860,NON-Mutagenic (Consensus score: 0.3),4,NON-Mutagenic,...,2.7454 log(L/Kg) (moderate reliability),2.7454,NON-active (moderate reliability),NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32345,O=C(O)C(O)OP1(=O)OCC=CCO1,InChI=1S/C6H9O7P/c7-5(8)6(9)13-14(10)11-3-1-2-...,YKDXEAODOOQRKH-UHFFFAOYSA-N,O=C(O)C(O)OP1(=O)(OCC=CCO1),-1.92,1.000000,3.1620,NON-Mutagenic (Consensus score: 0.45),4,NON-Mutagenic,...,1.7404 log(L/Kg) (low reliability),1.7404,Possible NON-active (good reliability),Possible NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
32346,O=P1(OC(O)C(O)O)OCC#CCO1,InChI=1S/C6H9O7P/c7-5(8)6(9)13-14(10)11-3-1-2-...,DBXVKSMVVGDYLL-UHFFFAOYSA-N,O=P1(OCC#CCO1)OC(O)C(O)O,-2.60,1.000000,3.1620,NON-Mutagenic (Consensus score: 0.5),4,NON-Mutagenic,...,1.9137 log(L/Kg) (low reliability),1.9137,Possible NON-active (moderate reliability),Possible NON-active,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
32347,CC(O)OP1(=O)OC(O)C#CC(O)O1,InChI=1S/C6H9O7P/c1-4(7)11-14(10)12-5(8)2-3-6(...,IGGCCSLTWVYJSI-UHFFFAOYSA-N,O=P1(OC(O)C#CC(O)O1)OC(O)C,-4.16,1.000000,3.1620,NON-Mutagenic (Consensus score: 0.5),4,NON-Mutagenic,...,1.9226 log(L/Kg) (low reliability),1.9226,Not predicted (low reliability),Not predicted,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive
32348,O=P12OC(O)C=C(CC(O)O1)C(O)O2,"InChI=1S/C6H9O7P/c7-4-1-3-2-5(8)12-14(10,11-4)...",XWMMMUYOPPWCTR-UHFFFAOYSA-N,O=P12(OC(O)C=C(CC(O)O1)C(O)O2),-4.06,1.000000,3.1620,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,...,1.8352 log(L/Kg) (low reliability),1.8352,Not predicted (low reliability),Not predicted,NON-active (moderate reliability),NON-active,Inactive (good reliability),Inactive,Inactive (good reliability),Inactive


#### Export database

In [12]:
# Write the final table as a tab-separated text file into the project directory.
export_path = Projectdir+"\\insilico_smiles_properties.txt"
smiles_properties.to_csv(export_path, sep='\t')

# Annex 

# Annex 1a: VEGA Mutagenicity - merging on index 
For the molecules in the dataset the mutagenicity was predicted with the VEGA consensus (Ames test) model.

VEGA outputs SMILES that differ from the input; therefore the input SMILES are merged on index. Some random checks were made to test whether the molecules encoded by the SMILES in 'SMILES' and 'vegaSMILES' are the same and to verify that the merging was correct.

\* This dataframe is longer, as the prediction was made for the dataset prior to filtering out enols. The enol filtering is done subsequently and the file is saved for import above.

In [17]:
vega_muta_dir = Projectdir + '\\VEGA_muta_all_res'
# VEGA consensus mutagenicity results (tab-separated).
all_OMG_ready_mutagenicity = pd.read_csv(
    vega_muta_dir + '\\ALLDB_OMG_ready_MUTA_CONSENSUS.txt', sep="\t"
)
# Second tab-separated file from the same folder; it provides the 'SMILES'
# column of the merged table.
all_OMG_ready_smilesin = pd.read_csv(
    vega_muta_dir + '\\ALLDB_OMG_ready_SMILES_title.txt', sep="\t"
)
# Keep the SMILES written by VEGA under their own column name.
all_OMG_ready_mutagenicity = all_OMG_ready_mutagenicity.rename(columns={"SMILES": "vegaSMILES"})
# Rows whose consensus prediction is anything other than 'NON-Mutagenic'.
is_non_mutagenic = all_OMG_ready_mutagenicity['Predicted Consensus Mutagen activity'] == 'NON-Mutagenic'
mutagens = all_OMG_ready_mutagenicity[~is_non_mutagenic]
#drawsmileslist(mutagens['SMILES'])
# Pair both tables row by row via their index.
all_OMG_ready_mutagenicity = all_OMG_ready_smilesin.merge(
    all_OMG_ready_mutagenicity, left_index=True, right_index=True
)
all_OMG_ready_mutagenicity

Unnamed: 0,SMILES,No.,Id,vegaSMILES,Assessment,Used models,Predicted Consensus Mutagen activity,Mutagenic Score,Non-Mutagenic Score,Model Caesar assessment,Model ISS assessment,Model SarPy assessment,Model KNN assessment,Remarks
0,CCCCOP(=O)(OCC)OCCCC,1,Molecule 1,O=P(OCC)(OCCCC)OCCCC,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.05,0.600,NON-Mutagenic (good reliability),Mutagenic (low reliability),Possible NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),-
1,CCCCCOP(=O)(OC)OCCCC,2,Molecule 2,O=P(OC)(OCCCC)OCCCCC,NON-Mutagenic (Consensus score: 0.675),4,NON-Mutagenic,0.05,0.675,NON-Mutagenic (good reliability),Mutagenic (low reliability),NON-Mutagenic (good reliability),NON-Mutagenic (good reliability),-
2,CCCCCOP(=O)(OC)OCCCCC,3,Molecule 3,O=P(OC)(OCCCCC)OCCCCC,NON-Mutagenic (Consensus score: 0.75),4,NON-Mutagenic,0.00,0.750,NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),-
3,CCCCCCOP(=O)(OC)OCCCC,4,Molecule 4,O=P(OC)(OCCCC)OCCCCCC,NON-Mutagenic (Consensus score: 0.675),4,NON-Mutagenic,0.05,0.675,NON-Mutagenic (good reliability),Mutagenic (low reliability),NON-Mutagenic (good reliability),NON-Mutagenic (good reliability),-
4,CCCCOP(=O)(OCCC)OCCCC,5,Molecule 5,O=P(OCCC)(OCCCC)OCCCC,NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.15,0.600,NON-Mutagenic (good reliability),Mutagenic (moderate reliability),Possible NON-Mutagenic (good reliability),NON-Mutagenic (moderate reliability),-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46008,CC(O)OP1(=O)OC(O)C#CC(O)O1,46009,Molecule 46009,O=P1(OC(O)C#CC(O)O1)OC(O)C,NON-Mutagenic (Consensus score: 0.5),4,NON-Mutagenic,0.00,0.500,NON-Mutagenic (moderate reliability),NON-Mutagenic (low reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
46009,O=P12OC(O)C=C(CC(O)O1)C(O)O2,46010,Molecule 46010,O=P12(OC(O)C=C(CC(O)O1)C(O)O2),NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.00,0.600,NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
46010,O=P12OC(O)C3C(C(O)O1)C3C(O)O2,46011,Molecule 46011,O=P12(OC(O)C3C(C(O)O1)C3(C(O)O2)),NON-Mutagenic (Consensus score: 0.6),4,NON-Mutagenic,0.00,0.600,NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),NON-Mutagenic (moderate reliability),-
46011,O=C(O)COP1(=O)OCC(=CO)CO1,46012,Molecule 46012,O=C(O)COP1(=O)(OCC(=CO)CO1),NON-Mutagenic (Consensus score: 0.35),4,NON-Mutagenic,0.05,0.350,Mutagenic (low reliability),NON-Mutagenic (moderate reliability),Possible NON-Mutagenic (moderate reliability),NON-Mutagenic (low reliability),-


In [19]:
#export to qsar_res_dir
#filteroutSmartsdf(all_OMG_ready_mutagenicity, Enolsmarts).to_csv(qsar_res_dir +'\\VEGAMutaCons_all_readys.txt', header=True, index=False, sep='\t')

# Annex 1b: VEGA model chunking

## Include additional properties
Include more properties predicted with VEGA models. <br> 
The initial dataset was too large to run on the VEGA platform, consequently the models were run on chunks of data and the results were concatenated, as shown below.

In [20]:
# Switch for the export below: set to 'on' to write the chunk files.
export_smiles_chunks = 'off'
if export_smiles_chunks == 'on':
    # Work on a copy of the enol-free table with a fresh 0..n-1 index.
    BigDF = smiles_biowin_ready_biodeg_noenols.copy().reset_index()
    n = 3000  # chunk row size
    chunk_dir = Projectdir+'\\vega_addprop_chunks_res\\BigDF_chunks_input'
    smiles_column = BigDF['SMILES']
    # Cut the SMILES column into consecutive slices of at most n rows.
    list_BigDFchunks = [smiles_column[start:start+n] for start in range(0, len(BigDF), n)]
    number = 0
    # One text file per chunk, numbered from 1, without index and header.
    for number, chunk in enumerate(list_BigDFchunks, start=1):
        chunk.to_csv(chunk_dir+'\\SMILES_without_enols'+str(number)+'.txt', index=False, header=False)


#### Import predicted properties


In [21]:
# Use report_summary for format; it also includes other endpoints which were
# finally not used because of applicability.
# The results sit in the numbered subfolders 1..11, one report_summary.txt
# each; the first 29 lines of every file are skipped.
vega_chunk_tables = [
    pd.read_csv(Projectdir+'\\vega_addprop_chunks_res\\vega_noenols\\'+str(i)+'\\report_summary.txt',
                sep='\t', skiprows=29)
    for i in range(1, 12)
]
# DataFrame.append was removed in pandas 2.0. A single concat yields the same
# stacked table with a fresh 0..n-1 index and avoids copying the growing
# table once per chunk.
additional_props_data = pd.concat(vega_chunk_tables, ignore_index=True)
# VEGA unifies the SMILES format and outputs other SMILES; rename to vegaSMILES.
additional_props_data.rename(columns={"SMILES":'vegaSMILES'}, inplace=True)
additional_props_data.shape

(32350, 55)

In [22]:
# Folder for the VEGA result tables that are merged into the database on vegaSMILES.
qsar_res_dir_vega = '\\'.join([Projectdir, 'other_qsar_results_merged_vega'])

In [23]:
# Extract the properties which were finally used in the MAUT assessment:
# the estrogen, androgen and thyroid receptor model results.
vega_edc_columns = [
    'No.', 'Id', 'vegaSMILES',
    'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - assessment',
    'Estrogen Receptor-mediated effect (IRFMN/CERAPP) - prediction',
    'Androgen Receptor-mediated effect (IRFMN/COMPARA) - assessment',
    'Androgen Receptor-mediated effect (IRFMN/COMPARA) - prediction',
    'Thyroid Receptor Alpha effect (NRMEA) - assessment',
    'Thyroid Receptor Alpha effect (NRMEA) - prediction',
    'Thyroid Receptor Beta effect (NRMEA) - assessment',
    'Thyroid Receptor Beta effect (NRMEA) - prediction',
]
VEGA_EDC_df = additional_props_data[vega_edc_columns]
# os.path.join replaces the hand-written '//' separator, which did not match
# the '\\' separators used for every other path in this notebook.
VEGA_EDC_df.to_csv(os.path.join(qsar_res_dir_vega, 'VEGA_EDC_all_readys.txt'),
                   header=True, index=False, sep='\t')

In [24]:
# Extract the OPERA KOC model results and save them as a tab-separated file
# next to the other VEGA tables.
vega_koc_columns = [
    'No.', 'Id', 'vegaSMILES',
    'KOC model (OPERA) - assessment',
    'KOC model (OPERA) - prediction [log(L/Kg)]',
]
VEGA_KOC_df = additional_props_data[vega_koc_columns]
# os.path.join replaces the hand-written '//' separator, which did not match
# the '\\' separators used for every other path in this notebook.
VEGA_KOC_df.to_csv(os.path.join(qsar_res_dir_vega, 'VEGA_KOC_all_readys.txt'),
                   header=True, index=False, sep='\t')

In [25]:
# Extract the water solubility model results.
vega_watsol_columns = [
    'No.', 'Id', 'vegaSMILES',
    'Water solubility model (IRFMN) - assessment',
    'Water solubility model (IRFMN) - prediction [-log(mol/L)]',
]
VEGA_watsol_df = additional_props_data[vega_watsol_columns]
# Saved to a different directory because it is not used in the final version.
watsol_export_path = Projectdir+'\\other_qsar_res_watsol\\VEGA_watsol_all_readys.txt'
VEGA_watsol_df.to_csv(watsol_export_path, header=True, index=False, sep='\t')




-------------------------------------------------------------------


