In [15]:
import numpy as np
import pandas as pd
import os

# Raise pandas' row display limit to 5000 so the comparison tables below are rendered without truncation
pd.set_option('display.max_rows', 5000)

In [16]:
# Switch the working directory to the SMILES input folder so the file below can be opened by bare name
os.chdir('/02_smiles_input/02_oxidation_states')

# Load the reference SMILES: only the first tab-separated column is kept and it is
# labelled 'SMILES' (passing `names` means the file is read as having no header row)
smiles = 'oxidation_states.smi'
df1 = pd.read_csv(smiles, usecols=[0], names=['SMILES'], sep = '\t')

# Load the InChIs calculated by each toolkit (one single-column frame per toolkit,
# with the column named after the toolkit and version that appear in the file name).
# NOTE(review): the working directory stays on this output folder after this cell, so
# every later relative-path read or write in the notebook resolves against it.
# NOTE(review): assumes each .inchi file lists results in the same row order as the
# SMILES file, since the frames are later combined by row position -- TODO confirm.
os.chdir('/03_inchi_output/02_oxidation_states_O')

CA = 'CAmolconvert_19.27.0_InChIs_oxidation_states.inchi'
df2 = pd.read_csv(CA, usecols=[0], names=['CAmolconvert_19.27.0'], sep = '\t')

OB = 'OpenBabel_3.1.1_InChIs_oxidation_states.inchi'
df3 = pd.read_csv(OB, usecols=[0], names=['OpenBabel_3.1.1'], sep = '\t')

rdkit = 'RDKit_2020.03.3_InChIs_oxidation_states.inchi'
df4 = pd.read_csv(rdkit, usecols=[0], names=['RDKit_2020.03.3'], sep = '\t')

In [17]:
# Combine the reference SMILES and the three toolkit outputs side by side
# (pd.concat with axis=1 aligns the single-column frames on their row index)
df_all = pd.concat([df1, df2, df3, df4], axis=1)
df_all.index.name = 'index'

# Add a column that is True only when all three toolkits produced the same InChI
# for the same input row. The comparison has to be row-wise: Series.isin() tests
# membership against the *whole* reference column, so it would report True for a
# row whose InChI merely occurs somewhere else in the CAmolconvert output.
# Comparing both other toolkits against CAmolconvert is sufficient, because
# equality is transitive. Rows with a missing value compare as False.
df_all['All_InChIs_Match?'] = (
    df_all['OpenBabel_3.1.1'].eq(df_all['CAmolconvert_19.27.0'])
    & df_all['RDKit_2020.03.3'].eq(df_all['CAmolconvert_19.27.0'])
)

# Display table and highlight rows that are False (i.e., InChIs that do not match)
def highlight_style(row):
    """Return one CSS background rule per cell of ``row`` for use with ``Styler.apply``.

    The last value of the row is treated as the match flag: when it compares
    equal to 0 (i.e. False) every cell is shaded 'lightsalmon'; otherwise every
    cell gets a 'white' background.
    """
    flag_is_false = row.values[-1] == 0  # False == 0 in Python/numpy
    shade = 'lightsalmon' if flag_is_false else 'white'
    return ['background-color: {}'.format(shade) for _ in row.values]

# Render the full table; axis=1 hands each row to highlight_style, which shades rows whose last column is False
df_all.style.apply(highlight_style, axis=1)

Unnamed: 0_level_0,SMILES,CAmolconvert_19.27.0,OpenBabel_3.1.1,RDKit_2020.03.3,All_InChIs_Match?
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,[H-],InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,True
1,[H-1],InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,True
2,[H1-],InChI=1S//,ERROR,ERROR,False
3,[H+],InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,True
4,[H+1],InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,True
5,[H1+],InChI=1S//,ERROR,ERROR,False
6,[He],InChI=1S/He,InChI=1S/He,InChI=1S/He,True
7,[Li+],InChI=1S/Li/q+1,InChI=1S/Li/q+1,InChI=1S/Li/q+1,True
8,[Li+1],InChI=1S/Li/q+1,InChI=1S/Li/q+1,InChI=1S/Li/q+1,True
9,[Li1+],InChI=1S/Li/q+1/i1-6,ERROR,ERROR,False


In [18]:
# Export the comparison table as a tab-separated file, keeping the index column.
# The relative file name means it is written to the current working directory.
df_all.to_csv('oxidation_states_InChI_Comparison.tsv', sep ='\t', index=True)

In [19]:
# Pairwise view: compare only Open Babel against RDKit, next to the reference SMILES
df_rdkit_OB = pd.concat([df1, df3, df4], axis=1)

# Row-wise equality of the two InChI columns. Identical 'ERROR' strings also compare
# equal, so rows where both toolkits failed are reported as True.
df_rdkit_OB['InChIs_Match?'] = df_rdkit_OB['OpenBabel_3.1.1'].eq(df_rdkit_OB['RDKit_2020.03.3'])

# Show the table, shading any row whose match flag is False
df_rdkit_OB.style.apply(highlight_style, axis=1)


Unnamed: 0,SMILES,OpenBabel_3.1.1,RDKit_2020.03.3,InChIs_Match?
0,[H-],InChI=1S/H/q-1,InChI=1S/H/q-1,True
1,[H-1],InChI=1S/H/q-1,InChI=1S/H/q-1,True
2,[H1-],ERROR,ERROR,True
3,[H+],InChI=1S/p+1,InChI=1S/p+1,True
4,[H+1],InChI=1S/p+1,InChI=1S/p+1,True
5,[H1+],ERROR,ERROR,True
6,[He],InChI=1S/He,InChI=1S/He,True
7,[Li+],InChI=1S/Li/q+1,InChI=1S/Li/q+1,True
8,[Li+1],InChI=1S/Li/q+1,InChI=1S/Li/q+1,True
9,[Li1+],ERROR,ERROR,True
