In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
# suppress rdkit warning
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [2]:
def group_f1_b(df):
    """Micro-averaged F1 of the thresholded binary predictions in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score (for example one ``groupby`` group). Must provide a
        ``Class_b`` column castable to int and a ``Pred_b`` column castable
        to float.

    Returns
    -------
    The value of ``metrics.f1_score(..., average='micro')``.

    NOTE(review): with ``average='micro'`` on single-label predictions the
    score coincides with plain accuracy -- confirm this is the intended F1.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Scores strictly above 0.5 are called class 1, everything else class 0.
    y_hard = np.where(y_score > 0.5, 1, 0)
    return metrics.f1_score(y_true, y_hard, average='micro')

def group_kappa_b(df):
    """Cohen's kappa between ``Class_b`` and the thresholded ``Pred_b``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. Must provide a ``Class_b`` column castable to int and
        a ``Pred_b`` column castable to float.

    Returns
    -------
    The value of ``metrics.cohen_kappa_score`` for the group.
    """
    truth = np.asarray(df['Class_b'].tolist(), dtype=int)
    scores = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Hard decision: 1 when the score is strictly greater than 0.5, else 0.
    decisions = np.where(scores > 0.5, 1, 0)
    return metrics.cohen_kappa_score(truth, decisions)

def group_acc_b(df):
    """Accuracy of the thresholded binary predictions in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. Must provide a ``Class_b`` column castable to int and
        a ``Pred_b`` column castable to float.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the group.
    """
    expected = np.asarray(df['Class_b'].tolist(), dtype=int)
    raw = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Binarise at 0.5 (strictly greater counts as the positive class).
    predicted = np.where(raw > 0.5, 1, 0)
    return metrics.accuracy_score(expected, predicted)

def group_auc_b(df):
    """ROC AUC of the continuous ``Pred_b`` scores against ``Class_b``.

    Unlike the other ``group_*_b`` helpers, no 0.5 threshold is applied: the
    raw scores are handed to ``metrics.roc_auc_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. Must provide a ``Class_b`` column castable to int and
        a ``Pred_b`` column castable to float.

    Returns
    -------
    The value of ``metrics.roc_auc_score`` for the group.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    return metrics.roc_auc_score(y_true, y_score)

In [3]:
from scipy.special import softmax

def binary_cls(cls):
    """Collapse a four-level class label into a binary label.

    Labels 0 and 1 map to 0, labels 2 and 3 map to 1. Any other value
    yields ``None``.
    """
    if cls in (0, 1):
        return 0
    if cls in (2, 3):
        return 1
    return None
    
def binary_pred(pred):
    """Turn a comma-separated 4-class score string into one binary score.

    The first two values are summed into a "lower" score and the last two
    into a "greater" score; the pair is passed through ``softmax`` and the
    component belonging to the greater label is returned.

    NOTE(review): the ``Pred`` strings displayed in this notebook look like
    probabilities already. If so, the extra softmax squeezes the result into
    roughly [0.269, 0.731]; it keeps the ordering but is not a calibrated
    probability -- confirm this is intended.

    Parameters
    ----------
    pred : str
        Comma-separated numbers; at least four entries are required.

    Returns
    -------
    Softmax weight of the greater (classes 2 and 3) label.
    """
    values = np.asarray(pred.split(','), dtype=float)
    lower, greater = values[0] + values[1], values[2] + values[3]
    return softmax(np.array([lower, greater]))[1]

## 3DMolCSP-TL

In [4]:
# Read one tab-separated result file per (first index 0-17, fold index 0-4)
# combination, then stack them into a single frame with a fresh 0..n-1 index.
dfs = [
    pd.read_csv(
        '../results0804/molnet_chirality_cls_etkdg_csp{}-5fold_tl_{}.csv'.format(str(mb), str(i)),
        sep='\t',
        index_col=0,
    )
    for mb in range(18)
    for i in range(5)
]
df = pd.concat(dfs, ignore_index=True)

In [5]:
# Show the concatenated result table (pandas truncates the rendering).
df

Unnamed: 0,SMILES,MB,Class,Pred
0,NS(=O)(=O)c1cc2c(cc1C(F)(F)F)N[C@H](Cc1ccccc1)...,0,2,"4.572155773985287e-07,1.3657396102928487e-08,0..."
1,COc1ccc(N[C@@H](C)c2cc(C(C)C)nc(C(C)C)c2)cc1,0,2,"5.959157078905264e-08,3.0855302757260006e-09,1..."
2,CCOC(=O)C[C@H](Nc1ccc(OC)cc1)c1ccc(OC)cc1,0,1,"0.0001905889657791704,0.15719041228294373,0.84..."
3,CCCN(CCC)C(=O)[C@H](CCC(=O)O)NC(=O)c1ccccc1,0,2,"1.76505795934645e-06,3.0357352898136014e-07,0...."
4,O[C@H](Cc1nccc2ccccc12)c1ccccc1,0,2,"9.149026425347984e-08,5.1062514216937416e-09,0..."
...,...,...,...,...
87485,CS(=O)(=O)N1CC2(CCN(C(=O)[C@@H](N)COCc3ccccc3)...,17,2,"0.07357463985681534,0.0001051722647389397,0.92..."
87486,CCOC(=O)C1=C(C)Nc2ccnc(OC(C)C)c2[C@@H]1c1ccccc...,17,2,"2.6112607542927435e-07,1.1215218087556877e-08,..."
87487,O=C(Cc1ccc([N+](=O)[O-])cc1)NC[C@@H](O)c1cccnc1,17,0,"2.7142937142343726e-07,4.3322143028490245e-06,..."
87488,CC(C)(C)C(=O)N[C@H](COCc1ccccc1)C(=O)N1CCC2(CC...,17,1,"0.7480751872062683,0.0006561278132721782,0.017..."


In [6]:
# Keep the isomeric input under 'SMILES_iso'. The guard makes this cell safe to
# re-run: an unconditional rename would turn the derived 'SMILES' column into a
# second 'SMILES_iso' column on the next execution.
if 'SMILES_iso' not in df.columns:
    df = df.rename(columns={'SMILES': 'SMILES_iso'})

# Parse every isomeric SMILES once and reuse the molecule for both derived
# columns (previously each string was parsed twice).
_mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES_iso']]
_unparsed = [smi for smi, mol in zip(df['SMILES_iso'], _mols) if mol is None]
if _unparsed:
    raise ValueError('RDKit could not parse {} SMILES, e.g. {!r}'.format(len(_unparsed), _unparsed[0]))

# One character per atom, in atom order: '0' when the atom's chiral tag is
# CHI_UNSPECIFIED, '1' otherwise.
_chir_tags = [
    ''.join('0' if atom.GetChiralTag() == Chem.rdchem.ChiralType.CHI_UNSPECIFIED else '1'
            for atom in mol.GetAtoms())
    for mol in _mols
]

# SMILES written without stereo information (isomericSmiles=False).
df['SMILES'] = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in _mols]

df['MB'] = df['MB'].astype(int)
df['Class_b'] = df['Class'].apply(binary_cls)
df['Pred_b'] = df['Pred'].apply(binary_pred)
df['Chir_tag'] = _chir_tags

# Drop the temporaries so the parsed molecules do not linger in the kernel.
del _mols, _unparsed, _chir_tags

In [7]:
# Show the table with the derived SMILES, Class_b, Pred_b and Chir_tag columns.
df

Unnamed: 0,SMILES_iso,MB,Class,Pred,SMILES,Class_b,Pred_b,Chir_tag
0,NS(=O)(=O)c1cc2c(cc1C(F)(F)F)N[C@H](Cc1ccccc1)...,0,2,"4.572155773985287e-07,1.3657396102928487e-08,0...",NS(=O)(=O)c1cc2c(cc1C(F)(F)F)NC(Cc1ccccc1)NS2(...,1,0.731058,000000000000000100000000000
1,COc1ccc(N[C@@H](C)c2cc(C(C)C)nc(C(C)C)c2)cc1,0,2,"5.959157078905264e-08,3.0855302757260006e-09,1...",COc1ccc(NC(C)c2cc(C(C)C)nc(C(C)C)c2)cc1,1,0.731059,00000001000000000000000
2,CCOC(=O)C[C@H](Nc1ccc(OC)cc1)c1ccc(OC)cc1,0,1,"0.0001905889657791704,0.15719041228294373,0.84...",CCOC(=O)CC(Nc1ccc(OC)cc1)c1ccc(OC)cc1,0,0.664907,000000100000000000000000
3,CCCN(CCC)C(=O)[C@H](CCC(=O)O)NC(=O)c1ccccc1,0,2,"1.76505795934645e-06,3.0357352898136014e-07,0....",CCCN(CCC)C(=O)C(CCC(=O)O)NC(=O)c1ccccc1,1,0.731058,000000000100000000000000
4,O[C@H](Cc1nccc2ccccc12)c1ccccc1,0,2,"9.149026425347984e-08,5.1062514216937416e-09,0...",OC(Cc1nccc2ccccc12)c1ccccc1,1,0.731059,0100000000000000000
...,...,...,...,...,...,...,...,...
87485,CS(=O)(=O)N1CC2(CCN(C(=O)[C@@H](N)COCc3ccccc3)...,17,2,"0.07357463985681534,0.0001051722647389397,0.92...",CS(=O)(=O)N1CC2(CCN(C(=O)C(N)COCc3ccccc3)CC2)c...,1,0.701121,0000000000001000000000000000000
87486,CCOC(=O)C1=C(C)Nc2ccnc(OC(C)C)c2[C@@H]1c1ccccc...,17,2,"2.6112607542927435e-07,1.1215218087556877e-08,...",CCOC(=O)C1=C(C)Nc2ccnc(OC(C)C)c2C1c1ccccc1C(F)...,1,0.731058,000000000000000000010000000000
87487,O=C(Cc1ccc([N+](=O)[O-])cc1)NC[C@@H](O)c1cccnc1,17,0,"2.7142937142343726e-07,4.3322143028490245e-06,...",O=C(Cc1ccc([N+](=O)[O-])cc1)NCC(O)c1cccnc1,0,0.731057,0000000000000010000000
87488,CC(C)(C)C(=O)N[C@H](COCc1ccccc1)C(=O)N1CCC2(CC...,17,1,"0.7480751872062683,0.0006561278132721782,0.017...",CC(C)(C)C(=O)NC(COCc1ccccc1)C(=O)N1CCC2(CC1)CN...,0,0.378137,0000000100000000000000000000000000000


In [8]:
# Step 1: per (SMILES_iso, Chir_tag, MB) keep a single row -- the descending
#         sort puts the largest Class first, and keep='first' retains it.
# Step 2: restore the original row order.
# Step 3: keep only (SMILES, Chir_tag, MB) groups that contain exactly two rows
#         (presumably the two enantiomers of one compound -- TODO confirm).
df_uniq = (
    df
    .sort_values(by=['SMILES_iso', 'Chir_tag', 'MB', 'Class'], ascending=False)
    .drop_duplicates(subset=['SMILES_iso', 'Chir_tag', 'MB'], keep='first')
    .sort_index()
    .groupby(by=['SMILES', 'Chir_tag', 'MB'])
    .filter(lambda grp: len(grp) == 2)
)

In [9]:
# Average Pred_b over each (SMILES, Chir_tag, MB) group, then attach a Class_b
# label by left-merging on the same keys and keeping the first match per key.
# NOTE(review): the label is looked up in `df`, not in the de-duplicated
# `df_uniq`, so it comes from the first matching row of `df` -- confirm that
# this is the intended label source.
df_pred = (
    df_uniq
    .groupby(by=['SMILES', 'Chir_tag', 'MB'])['Pred_b']
    .mean()
    .to_frame(name='Pred_b')
    .reset_index()
    .merge(
        df[['SMILES', 'MB', 'Chir_tag', 'Class_b']],
        on=['SMILES', 'MB', 'Chir_tag'],
        how='left',
    )
    .drop_duplicates(subset=['SMILES', 'MB', 'Chir_tag'], keep='first')
    .sort_index()
)

In [10]:
# Show the per-group mean prediction together with its binary label.
df_pred

Unnamed: 0,SMILES,Chir_tag,MB,Pred_b,Class_b
0,BC(C=C)CO,010000,14,0.269028,0
2,B[PH](c1ccccc1)(c1ccccc1)C(CCO)CCCOC(=O)c1ccccc1,000000000000001000000000000000,11,0.269057,0
4,B[PH](c1ccccc1)(c1ccccc1)C(CCO)CCCOCc1ccccc1,00000000000000100000000000000,8,0.269037,0
6,B[PH](c1ccccc1)(c1ccccc1)C(CCO)c1ccc(Br)cc1,0000000000000010000000000,11,0.269304,0
8,B[PH](c1ccccc1)(c1ccccc1)C(CCO)c1ccc(Cl)cc1,0000000000000010000000000,8,0.730615,1
...,...,...,...,...,...
86449,c1coc(C2c3[nH]c4ccccc4c3CCN2Cc2cccc3ccccc23)c1,00001000000000000000000000000,11,0.731005,1
86451,c1coc(COCC2CO2)c1,00000001000,10,0.269505,0
86453,c1csc(C2CCCO2)c1,0000100000,6,0.730757,1
86455,c1csc(C2CNc3ccccc3C2)c1,000010000000000,11,0.731022,1


In [11]:
# Evaluate each metric helper once per MB group; every result is a Series
# indexed by MB.
auc, acc, kappa, f1 = (
    df_pred.groupby('MB').apply(metric_fn)
    for metric_fn in (group_auc_b, group_acc_b, group_kappa_b, group_f1_b)
)

In [12]:
# Print every metric as a titled block with one value per line (MB order).
for _title, _scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    print(_title, '\n' + '\n'.join(_scores.astype(str).tolist()), '\n')

AUC: 
0.9617251615156277
0.9622641509433962
0.9414327312434597
0.9743961352657005
0.8276269618198007
0.9671814671814671
0.8264316471487785
0.946553073869754
0.8546116677184252
0.8507207013875169
0.8446345558660269
0.8425703539942732
0.9744289044289044
0.869784145321882
0.8392757409242411
0.7686915887850467
0.8611327478681431
0.9140583554376658 

ACC: 
0.9412915851272016
0.9311475409836065
0.9258098223615465
0.9164265129682997
0.8872814948764316
0.9243856332703214
0.8844030365769496
0.9075757575757576
0.9146845915201655
0.9149056072132995
0.879129734085415
0.9022317676720516
0.9584245076586433
0.9202127659574468
0.8610271903323263
0.9202551834130781
0.8556311413454271
0.9184782608695652 

KAPPA: 
0.8644658202779645
0.8409288463926488
0.8055095188616868
0.8237313668132215
0.7097623429099077
0.8378991236134093
0.6681515298808212
0.8151298705771337
0.7408768332929367
0.7284205693296603
0.6921094677785902
0.7146270618854316
0.8990266649611015
0.7402570820370067
0.6659630066595814
0.59839614

### Mean and standard deviation across the k folds

In [13]:
# Same files as before, but every frame is tagged with the fold index it came
# from (stored as a string in the 'k-fold' column) before concatenation.
dfs = [
    pd.read_csv(
        '../results0804/molnet_chirality_cls_etkdg_csp{}-5fold_tl_{}.csv'.format(str(mb), str(i)),
        sep='\t',
        index_col=0,
    ).assign(**{'k-fold': str(i)})
    for mb in range(18)
    for i in range(5)
]
df = pd.concat(dfs, ignore_index=True)

In [14]:
# Keep the isomeric input under 'SMILES_iso'. The guard makes this cell safe to
# re-run: an unconditional rename would turn the derived 'SMILES' column into a
# second 'SMILES_iso' column on the next execution.
if 'SMILES_iso' not in df.columns:
    df = df.rename(columns={'SMILES': 'SMILES_iso'})

# Parse every isomeric SMILES once and reuse the molecule for both derived
# columns (previously each string was parsed twice).
_mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES_iso']]
_unparsed = [smi for smi, mol in zip(df['SMILES_iso'], _mols) if mol is None]
if _unparsed:
    raise ValueError('RDKit could not parse {} SMILES, e.g. {!r}'.format(len(_unparsed), _unparsed[0]))

# One character per atom, in atom order: '0' when the atom's chiral tag is
# CHI_UNSPECIFIED, '1' otherwise.
_chir_tags = [
    ''.join('0' if atom.GetChiralTag() == Chem.rdchem.ChiralType.CHI_UNSPECIFIED else '1'
            for atom in mol.GetAtoms())
    for mol in _mols
]

# SMILES written without stereo information (isomericSmiles=False).
df['SMILES'] = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in _mols]

df['MB'] = df['MB'].astype(int)
df['Class_b'] = df['Class'].apply(binary_cls)
df['Pred_b'] = df['Pred'].apply(binary_pred)
df['Chir_tag'] = _chir_tags

# Drop the temporaries so the parsed molecules do not linger in the kernel.
del _mols, _unparsed, _chir_tags

In [15]:
# Step 1: per (SMILES_iso, Chir_tag, MB) keep the row with the largest Class
#         (descending sort + keep='first'), then restore the original order.
# Step 2: keep only (SMILES, Chir_tag, MB, k-fold) groups with exactly two rows.
# NOTE(review): the de-duplication keys do not include 'k-fold' while the
# grouping keys do -- confirm that dropping duplicates across folds is intended.
df_uniq = (
    df
    .sort_values(by=['SMILES_iso', 'Chir_tag', 'MB', 'Class'], ascending=False)
    .drop_duplicates(subset=['SMILES_iso', 'Chir_tag', 'MB'], keep='first')
    .sort_index()
    .groupby(by=['SMILES', 'Chir_tag', 'MB', 'k-fold'])
    .filter(lambda grp: len(grp) == 2)
)

In [16]:
# Average Pred_b over each (SMILES, Chir_tag, MB, k-fold) group, then attach a
# Class_b label by left-merging on the same keys and keeping the first match.
# NOTE(review): the label is looked up in `df`, not in `df_uniq` -- confirm
# that this is the intended label source.
df_pred = (
    df_uniq
    .groupby(by=['SMILES', 'Chir_tag', 'MB', 'k-fold'])['Pred_b']
    .mean()
    .to_frame(name='Pred_b')
    .reset_index()
    .merge(
        df[['SMILES', 'MB', 'Chir_tag', 'Class_b', 'k-fold']],
        on=['SMILES', 'MB', 'Chir_tag', 'k-fold'],
        how='left',
    )
    .drop_duplicates(subset=['SMILES', 'MB', 'Chir_tag', 'k-fold'], keep='first')
    .sort_index()
)

In [17]:
# Show the per-group, per-fold mean prediction together with its binary label.
df_pred

Unnamed: 0,SMILES,Chir_tag,MB,k-fold,Pred_b,Class_b
0,BC(C=C)CO,010000,14,2,0.269028,0
2,B[PH](c1ccccc1)(c1ccccc1)C(CCO)CCCOC(=O)c1ccccc1,000000000000001000000000000000,11,1,0.269057,0
4,B[PH](c1ccccc1)(c1ccccc1)C(CCO)CCCOCc1ccccc1,00000000000000100000000000000,8,0,0.269037,0
6,B[PH](c1ccccc1)(c1ccccc1)C(CCO)c1ccc(Br)cc1,0000000000000010000000000,11,3,0.269304,0
8,B[PH](c1ccccc1)(c1ccccc1)C(CCO)c1ccc(Cl)cc1,0000000000000010000000000,8,2,0.730615,1
...,...,...,...,...,...,...
83459,c1coc(C2c3[nH]c4ccccc4c3CCN2Cc2cccc3ccccc23)c1,00001000000000000000000000000,11,3,0.731005,1
83461,c1coc(COCC2CO2)c1,00000001000,10,4,0.269505,0
83463,c1csc(C2CCCO2)c1,0000100000,6,4,0.730757,1
83465,c1csc(C2CNc3ccccc3C2)c1,000010000000000,11,3,0.731022,1


In [18]:
# One AUC per (MB, k-fold) pair; the unnamed result column (0) is renamed 'AUC'.
df_auc = (
    df_pred
    .groupby(by=['MB', 'k-fold'])
    .apply(group_auc_b)
    .reset_index()
    .rename(columns={0: 'AUC'})
)

# Aggregate the metric column explicitly. `df_auc` also carries the string
# column 'k-fold'; reducing the whole frame only worked while pandas silently
# dropped non-numeric columns, which is no longer the default in pandas 2.x.
# The double brackets keep a one-column frame, so each list entry is still a
# one-element list ([value]) as downstream cells expect.
auc_mean = df_auc.groupby('MB')[['AUC']].mean().values.tolist()
auc_std = df_auc.groupby('MB')[['AUC']].std().values.tolist()

In [19]:
# Per-fold AUC rows where MB equals 6.
df_auc.loc[df_auc['MB'] == 6]

Unnamed: 0,MB,k-fold,AUC
30,6,0,0.802812
31,6,1,0.842787
32,6,2,0.820673
33,6,3,0.810746
34,6,4,0.759827


In [20]:
# Per-fold AUC rows where MB equals 8.
df_auc.loc[df_auc['MB'] == 8]

Unnamed: 0,MB,k-fold,AUC
40,8,0,0.796296
41,8,1,0.857127
42,8,2,0.875394
43,8,3,0.887624
44,8,4,0.8935


In [21]:
# Per-fold AUC rows where MB equals 9.
df_auc.loc[df_auc['MB'] == 9]

Unnamed: 0,MB,k-fold,AUC
45,9,0,0.859092
46,9,1,0.826296
47,9,2,0.849275
48,9,3,0.842486
49,9,4,0.879651


In [22]:
# Per-fold AUC rows where MB equals 11.
df_auc.loc[df_auc['MB'] == 11]

Unnamed: 0,MB,k-fold,AUC
55,11,0,0.790281
56,11,1,0.845116
57,11,2,0.859869
58,11,3,0.880961
59,11,4,0.849971


In [23]:
# Per-fold AUC rows where MB equals 15.
df_auc.loc[df_auc['MB'] == 15]

Unnamed: 0,MB,k-fold,AUC
75,15,0,0.728938
76,15,1,0.738095
77,15,2,0.794699
78,15,3,0.756892
79,15,4,0.762547


In [24]:
# Fold-averaged AUC, one value per line in MB order.
print('\n'.join(str(row[0]) for row in auc_mean))

0.9621533912209749
0.9667739656319295
0.9372329793544646
0.9749472766884532
0.8428435660799947
0.967979452286647
0.8073689127973488
0.950098019898704
0.8619880074093447
0.8513600904912524
0.8538190278409996
0.8452394619890619
0.9730057603558295
0.8834638210496848
0.8433050387714832
0.7562342860025406
0.8647268410503945
0.9147537959415354


In [25]:
# Standard deviation of the AUC across folds, one value per line in MB order.
print('\n'.join(str(row[0]) for row in auc_std))

0.022877010075485878
0.013744550790776295
0.03170045493641074
0.020399938909964884
0.058339103630638746
0.012915498802939348
0.030511836315174106
0.011691824755095401
0.03927160683416901
0.019820922926568858
0.01921187510900727
0.033663499087397894
0.016644068874931604
0.023000400471895253
0.025437949189077935
0.02546540910665979
0.016564837804924543
0.06586924513523518


In [26]:
# One Cohen's kappa per (MB, k-fold) pair; the unnamed result column (0) is
# renamed 'KAPPA'.
df_kappa = (
    df_pred
    .groupby(by=['MB', 'k-fold'])
    .apply(group_kappa_b)
    .reset_index()
    .rename(columns={0: 'KAPPA'})
)

# Aggregate the metric column explicitly. `df_kappa` also carries the string
# column 'k-fold'; reducing the whole frame only worked while pandas silently
# dropped non-numeric columns, which is no longer the default in pandas 2.x.
# The double brackets keep a one-column frame, so each list entry is still a
# one-element list ([value]) as downstream cells expect.
kappa_mean = df_kappa.groupby('MB')[['KAPPA']].mean().values.tolist()
kappa_std = df_kappa.groupby('MB')[['KAPPA']].std().values.tolist()

In [27]:
# Fold-averaged kappa, one value per line in MB order.
print('\n'.join(str(row[0]) for row in kappa_mean))

0.8550321263608038
0.8379789549486679
0.7970528905280568
0.8201888021558243
0.7076805048853556
0.8391921548002234
0.646687533089314
0.8144344668018368
0.739729808380359
0.7280977061093663
0.6909101383409063
0.7139717278299995
0.8984239780498818
0.7391772632500497
0.6627964971310885
0.5891216259668177
0.6766050624927743
0.7211464264034161


In [28]:
# Standard deviation of kappa across folds, one value per line in MB order.
print('\n'.join(str(row[0]) for row in kappa_std))

0.017648762818939276
0.05373548784219434
0.043778800773836266
0.08584180668805913
0.07261861848346549
0.07657993864186581
0.07700525007359908
0.028132270034524585
0.055835950241770224
0.0224346470487834
0.04975993034741242
0.053701085368350485
0.04718065747478901
0.1243357333135869
0.05123662672762443
0.08982534790930957
0.02824768052468486
0.13600419510823344


In [29]:
# One micro-averaged F1 per (MB, k-fold) pair; the unnamed result column (0) is
# renamed 'F1'.
df_f1 = (
    df_pred
    .groupby(by=['MB', 'k-fold'])
    .apply(group_f1_b)
    .reset_index()
    .rename(columns={0: 'F1'})
)

# Aggregate the metric column explicitly. `df_f1` also carries the string
# column 'k-fold'; reducing the whole frame only worked while pandas silently
# dropped non-numeric columns, which is no longer the default in pandas 2.x.
# The double brackets keep a one-column frame, so each list entry is still a
# one-element list ([value]) as downstream cells expect.
f1_mean = df_f1.groupby('MB')[['F1']].mean().values.tolist()
f1_std = df_f1.groupby('MB')[['F1']].std().values.tolist()

In [30]:
# Fold-averaged F1, one value per line in MB order.
print('\n'.join(str(row[0]) for row in f1_mean))

0.9380309906031169
0.9311475409836065
0.9240492078186643
0.9164389233954452
0.8875188106426396
0.9244025157232704
0.8796972466928524
0.9075334502112324
0.9149583606778722
0.9149298828073544
0.8791069989105018
0.9021129434940199
0.9583850931677018
0.9202216780290836
0.8605032212540007
0.9201864234712597
0.8557448349551515
0.9183183183183183


In [31]:
# Standard deviation of F1 across folds, one value per line in MB order.
print('\n'.join(str(row[0]) for row in f1_std))

0.011612785771124111
0.02137443411541846
0.012052055854406333
0.038694095747378005
0.025050644158528184
0.039428495102140824
0.020288309208696922
0.013641966374607871
0.01487033479850488
0.005847646322625446
0.01782227554263699
0.01777232540896637
0.022562544297152494
0.0376696426777336
0.017168165105943864
0.013111293288619872
0.016265773251866938
0.04322655667374553
