In [6]:
import pandas as pd
import numpy as np
import os

In [7]:
# convert probabilities to classes
def ProbToClass(df, model_type):
    """Append a predicted-class column holding each row's arg-max position.

    For every row, the 0-based *position* (not the column label) of the
    largest value is stored in a new column named
    ``model_type + ' prediction'``. Ties resolve to the first maximal
    position and NaN entries are ignored when locating the maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one numeric column per class score.
        The frame is modified in place.
    model_type : str
        Prefix of the new column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the prediction column added.

    Raises
    ------
    ValueError
        If some row has no non-NaN score (this includes a frame that has
        rows but no columns), since no class can be chosen for it.
    """
    column = model_type + ' prediction'

    if len(df) == 0:
        # No rows to classify; still add the column so the result has
        # the same layout as for a populated frame.
        df[column] = []
        return df

    scores = np.asarray(df.values, dtype=float)

    # Refuse rows without any usable score up front, instead of failing
    # later with an opaque indexing error.
    has_score = (~np.isnan(scores)).any(axis=1)
    if not has_score.all():
        raise ValueError(
            'cannot pick a class for rows without any non-NaN score: '
            + str(list(df.index[~has_score]))
        )

    # NaN-skipping row maximum; comparing against it leaves NaN cells False,
    # and argmax on the boolean mask returns the first maximal position.
    row_max = np.nanmax(scores, axis=1, keepdims=True)
    classes = (scores == row_max).argmax(axis=1)

    # add to original dataframe
    df[column] = classes
    return df

In [8]:
# Task selection: these two fragments are spliced into every file name below.
jobname = 'tox_niehs_'
taskname = 'logld50'

# '__file__' is a plain string here (not the module attribute), so this
# resolves against the current working directory.
homedir = os.path.dirname(os.path.realpath('__file__'))

# Table with the measured target values for the selected task.
test_df = pd.read_csv(homedir + '/../data/' + jobname + 'int_' + taskname + '.csv')
test_df.head()

Unnamed: 0,id,smiles,logld50
0,molid70,ClC=CCCl,4.552809
1,molid597,O=C(O)CCC(=O)O,7.060025
2,molid1673,CC1CCCCN1N=O,6.39693
3,molid1818,NC(Cc1ccc(N(CCCl)CCCl)cc1)C(=O)O,3.673706
4,molid1841,CCN(CC)C(=S)S,7.178065


In [9]:
# MLP reference table (supplies the 'id' column used later)
ref_df = pd.read_csv(homedir + '/../data/' + jobname + 'int_' + taskname + '_rdkit.csv')

# MLP prediction result; the file is read without a header row
result_df = pd.read_csv(
    homedir + '/../result/MLP/' + taskname + '_test_results.csv',
    header=None,
)

# Anything other than exactly one column is reduced to a class index per row.
if result_df.shape[1] != 1:
    result_df = ProbToClass(result_df, 'mlp')

In [10]:
# Attach the ids from the reference table. Values are matched on the row
# index, so rows without a counterpart in ref_df end up with a missing id.
result_df = result_df.assign(id=ref_df['id'])
print(result_df.shape)
result_df.head()

(627, 2)


Unnamed: 0,0,id
0,5.713997,molid70
1,8.307622,molid597
2,6.526076,molid1673
3,5.953615,molid1818
4,6.19845,molid1841


In [11]:
# Left-join the MLP predictions onto the test table by molecule id,
# keeping every row of test_df.
combined_df = test_df.merge(result_df, how="left", on=["id"])
combined_df.head()

Unnamed: 0,id,smiles,logld50,0
0,molid70,ClC=CCCl,4.552809,5.713997
1,molid597,O=C(O)CCC(=O)O,7.060025,8.307622
2,molid1673,CC1CCCCN1N=O,6.39693,6.526076
3,molid1818,NC(Cc1ccc(N(CCCl)CCCl)cc1)C(=O)O,3.673706,5.953615
4,molid1841,CCN(CC)C(=S)S,7.178065,6.19845


In [None]:
# RNN reference table (supplies the 'id' column)
ref_rnn_df = pd.read_csv(homedir + '/../data/' + jobname + 'int_' + taskname + '_smiles.csv')
# RNN prediction result; the file is read without a header row
result_rnn_df = pd.read_csv(
    homedir + '/../result/RNN/predictions_' + jobname + taskname + '.csv',
    header=None,
)
# Same guard as the MLP and CNN cells: a single-column result is kept as is.
# Running ProbToClass on one column would only add a column of zeros.
if (len(result_rnn_df.columns) != 1):
    result_rnn_df = ProbToClass(result_rnn_df, 'rnn')

# Ids are matched on the row index of the reference table.
result_rnn_df['id'] = ref_rnn_df['id']
print(result_rnn_df.shape)
result_rnn_df.head()

In [None]:
# Left-join the RNN predictions onto the running table by molecule id.
# combined_df is rebuilt from itself, so re-running this cell merges again.
combined_df = combined_df.merge(result_rnn_df, how="left", on=["id"])
combined_df.head()

In [14]:
# CNN reference table (supplies the 'id' column)
ref_cnn_df = pd.read_csv(homedir + '/../data/' + jobname + 'int_' + taskname + '_image.csv')
# CNN prediction result; the file is read without a header row
result_cnn_df = pd.read_csv(
    homedir + '/../result/CNN/predictions_' + jobname + taskname + '.csv',
    header=None,
)
# Anything other than exactly one column is reduced to a class index per row.
if result_cnn_df.shape[1] != 1:
    result_cnn_df = ProbToClass(result_cnn_df, 'cnn')

# Ids are matched on the row index of the reference table.
result_cnn_df = result_cnn_df.assign(id=ref_cnn_df['id'])
print(result_cnn_df.shape)
result_cnn_df.head()

(622, 2)


Unnamed: 0,0,id
0,5.38323,molid70
1,7.20584,molid597
2,6.99694,molid1673
3,6.44761,molid1818
4,5.6011,molid1841


In [15]:
# Left-join the CNN predictions onto the running table by molecule id.
# Column labels already present on the left get pandas' _x/_y suffixes.
combined_df = combined_df.merge(result_cnn_df, how="left", on=["id"])
combined_df.head()

Unnamed: 0,id,smiles,logld50,0_x,0_y
0,molid70,ClC=CCCl,4.552809,5.713997,5.38323
1,molid597,O=C(O)CCC(=O)O,7.060025,8.307622,7.20584
2,molid1673,CC1CCCCN1N=O,6.39693,6.526076,6.99694
3,molid1818,NC(Cc1ccc(N(CCCl)CCCl)cc1)C(=O)O,3.673706,5.953615,6.44761
4,molid1841,CCN(CC)C(=S)S,7.178065,6.19845,5.6011


In [16]:
# True when at least one cell of the merged table is missing.
np.any(combined_df.isnull().values)

True

In [17]:
# Destination of the merged table; the row index is not written.
fileout = ''.join([homedir, '/../result/', jobname, 'int_', taskname, '_final.csv'])
combined_df.to_csv(fileout, index=False)