In [9]:
import numpy as np
import pandas as pd
import os

In [10]:
def _load_first_column(path, column_name):
    """Read the first tab-separated field of ``path`` into a one-column DataFrame.

    Because ``names`` is supplied, pandas treats the file as having no header
    row, so the very first line is read as data. Only field 0 is kept and it is
    labelled ``column_name``.
    """
    return pd.read_csv(path, sep='\t', names=[column_name], usecols=[0])


# Reference SMILES (read relative to the SMILES input directory)
os.chdir('/02_smiles_input/02_oxidation_states')
smiles = 'oxidation_states.smi'
df1 = _load_first_column(smiles, 'SMILES')

# Calculated InChIs: one file per toolkit, all read relative to the InChI
# output directory. Each column is named after the toolkit and its version.
os.chdir('/03_inchi_output/02_oxidation_states_O')

CA = 'CAmolconvert_19.27.0_InChIs_oxidation_states.inchi'
df2 = _load_first_column(CA, 'CAmolconvert_19.27.0')

OB = 'OpenBabel_3.1.1_InChIs_oxidation_states.inchi'
df3 = _load_first_column(OB, 'OpenBabel_3.1.1')

rdkit = 'RDKit_2020.03.3_InChIs_oxidation_states.inchi'
df4 = _load_first_column(rdkit, 'RDKit_2020.03.3')

cdraw = 'CDraw_cscript_19.01.28_InChIs_oxidation_states.inchi'
df5 = _load_first_column(cdraw, 'CDraw_cscript_19.01.28')

In [11]:
# Place the per-toolkit InChI columns side by side (rows are aligned on the index).
df = pd.concat([df2, df3, df4, df5], axis=1)

# Add index name
df.index.name = 'index'

# Add a new column comparing the InChIs.
# Every column is compared element-wise with the first column; a row is True
# only when all columns agree with it, i.e. all toolkits produced the same
# string for that row.
df['All_InChIs_Match?'] = df.eq(df.iloc[:, 0], axis=0).all(axis=1)

# Add the SMILES column for reference.
df_final = pd.concat([df1, df], axis=1)

# The index of df1 is unnamed, and concat() only keeps an index name that all
# of its inputs share, so the name set on df is lost here. Set it again so the
# displayed and exported table carries the 'index' label.
df_final.index.name = 'index'
df_final

Unnamed: 0,SMILES,CAmolconvert_19.27.0,OpenBabel_3.1.1,RDKit_2020.03.3,CDraw_cscript_19.01.28,All_InChIs_Match?
0,[H-],InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,True
1,[H-1],InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,InChI=1S/H/q-1,True
2,[H1-],InChI=1S//,ERROR,ERROR,Bad mimetype:chemical/x-inchi,False
3,[H+],InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,True
4,[H+1],InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,InChI=1S/p+1,True
...,...,...,...,...,...,...
1542,[Hs+8],InChI=1S/Hs/q+8,InChI=1S/Hs/q+8,InChI=1S/Hs/q+8,InChI=1S/Hs/q+8,True
1543,[Hs8+],InChI=1S/Hs/q+1/i1-6,ERROR,ERROR,Bad mimetype:chemical/x-inchi,False
1544,[Cn++],InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,True
1545,[Cn+2],InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,InChI=1S/Cn/q+2,True


In [12]:
# Export to a tab-separated file: switch to the analysis directory first, then
# write the comparison table there with its row index included.
os.chdir('/04_inchi_analysis')
df_final.to_csv(
    path_or_buf='oxidation_states_InChI_Comparison.tsv',
    sep='\t',
    index=True,
)

In [None]:
# Display the table and highlight rows whose match flag is False (i.e., InChIs that do not match).
# CAUTION: not recommended for tables with more than a couple thousand rows.

def highlight_style(row):
    """Return one CSS background declaration per cell of ``row``.

    The last value of the row is used as the flag: when it compares equal to 0
    (which is the case for False) every cell gets 'lightsalmon', otherwise
    every cell gets 'white'.
    """
    cells = row.values
    shade = 'lightsalmon' if cells[-1] == 0 else 'white'  # False == 0
    return ['background-color: ' + shade for _ in cells]

# Apply highlight_style to each row (axis=1). The call is the cell's last
# expression, so the notebook renders the styled table it returns.
mismatch_styler = df_final.style
mismatch_styler.apply(func=highlight_style, axis=1)