In [2]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def group_f1_b(df):
    """Return the micro-averaged F1 score of the binarised predictions in ``df``.

    Reads the ``Class_b`` column as integer labels and the ``Pred_b`` column
    as float scores. A score strictly greater than 0.5 is counted as class 1,
    anything else as class 0.

    NOTE(review): scikit-learn documents that micro-averaged F1 on a
    single-label problem equals plain accuracy, so this metric duplicates
    ``group_acc_b`` -- confirm that ``average='micro'`` is intended.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Hard decision at the 0.5 cut-off (strict inequality).
    y_pred = (y_score > 0.5).astype(int)
    return metrics.f1_score(y_true, y_pred, average='micro')

def group_kappa_b(df):
    """Return Cohen's kappa between the binary labels and thresholded scores.

    Reads the ``Class_b`` column as integer labels and the ``Pred_b`` column
    as float scores. A score strictly greater than 0.5 is counted as class 1,
    anything else as class 0.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Hard decision at the 0.5 cut-off (strict inequality).
    y_pred = (y_score > 0.5).astype(int)
    return metrics.cohen_kappa_score(y_true, y_pred)

def group_acc_b(df):
    """Return the accuracy of the thresholded binary predictions in ``df``.

    Reads the ``Class_b`` column as integer labels and the ``Pred_b`` column
    as float scores. A score strictly greater than 0.5 is counted as class 1,
    anything else as class 0.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    # Hard decision at the 0.5 cut-off (strict inequality).
    y_pred = (y_score > 0.5).astype(int)
    return metrics.accuracy_score(y_true, y_pred)

def group_auc_b(df):
    """Return the ROC AUC of the binary scores in ``df``.

    Reads the ``Class_b`` column as integer labels and the ``Pred_b`` column
    as float scores. Unlike the other ``group_*_b`` helpers, the scores are
    passed to the metric as-is, without thresholding.
    """
    y_true = np.asarray(df['Class_b'].tolist(), dtype=int)
    y_score = np.asarray(df['Pred_b'].tolist(), dtype=float)
    return metrics.roc_auc_score(y_true, y_score)

In [4]:
from scipy.special import softmax

def binary_cls(cls):
    """Collapse a four-way class label into a binary one.

    Labels 0 and 1 map to 0, labels 2 and 3 map to 1, and any other value
    yields ``None``.
    """
    if cls in (0, 1):
        return 0
    if cls in (2, 3):
        return 1
    return None
    
def binary_pred(pred):
    """Reduce a comma-separated 4-way prediction string to one binary score.

    The string is parsed into floats, entries 0+1 and entries 2+3 are summed
    into two group scores, a softmax is taken over that pair, and the
    component for the second group (binary label 1) is returned.

    NOTE(review): the saved ``Pred`` values look like probabilities already.
    If so, the softmax squeezes the result into roughly (0.269, 0.731).
    Ranking and the 0.5 decision threshold are unaffected, but the value is
    not a calibrated probability -- confirm this is intended.
    """
    scores = np.array(pred.split(','), dtype=float)
    low_group = scores[0] + scores[1]
    high_group = scores[2] + scores[3]
    # Index 1 of the softmax is the share assigned to the higher label.
    return softmax(np.array([low_group, high_group]))[1]

## 1. Independent training

In [4]:
# Read every per-fold result table (18 MB indices x 5 folds, tab-separated,
# first column used as the index) into a list.
dfs = []
for mb in range(18):
    for i in range(5):
        dfs.append(pd.read_csv(
            '../results/molnet_chirality_cls_etkdg_csp{}-5fold_{}.csv'.format(mb, i),
            sep='\t', index_col=0))

# Concatenate once, after all files are loaded. Doing this inside the outer
# loop rebuilt the whole frame on every iteration for the same final result.
df = pd.concat(dfs, ignore_index=True)

Convert the four-class labels and predictions into a binary classification (classes 0/1 → 0, classes 2/3 → 1):

In [5]:
# Binarise the targets: binary_cls maps classes 0/1 -> 0 and 2/3 -> 1.
df['Class_b'] = df['Class'].apply(binary_cls)
# Binarise the predictions: binary_pred turns each comma-separated 4-way
# prediction string into a single score for binary label 1.
df['Pred_b'] = df['Pred'].apply(binary_pred)

In [6]:
# Evaluate each binary metric separately for every value of the 'MB' column.
grouped_by_mb = df.groupby('MB')
auc = grouped_by_mb.apply(group_auc_b)
acc = grouped_by_mb.apply(group_acc_b)
kappa = grouped_by_mb.apply(group_kappa_b)
f1 = grouped_by_mb.apply(group_f1_b)

In [7]:
# Dump each per-MB metric as a header followed by one value per line.
for title, scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    print(title, '\n' + '\n'.join(scores.astype(str).tolist()), '\n')

AUC: 
0.9057808857808858
0.9033212329793456
0.8952841427940758
0.8891244590010527
0.9085682606820935
0.8904499178981937
0.8124318440678137
0.9491153856386517
0.7722950509082925
0.7927883862283661
0.8420768086239215
0.7496803696794311
0.9239179025198657
0.7925037490950462
0.8691141570741012
0.8216904625928041
0.8747088973076201
0.7923774039403078 

ACC: 
0.8956896551724138
0.8615384615384616
0.8739864864864865
0.8186813186813187
0.8808988764044944
0.8827868852459017
0.8618343195266273
0.9010309278350516
0.8664465538588526
0.8818359375
0.8766233766233766
0.8482758620689655
0.9161676646706587
0.847926267281106
0.86
0.93
0.8660869565217392
0.8237410071942446 

KAPPA: 
0.7546582765250831
0.7230127018944135
0.7190295822089393
0.6324771261987634
0.7208244336654708
0.694487251330905
0.6606887108155124
0.8018628551366821
0.626491209751056
0.6424498464805966
0.6825561379230536
0.5873568716821373
0.7988176882104412
0.5744582231001263
0.6863880215277598
0.6737019293020847
0.6952900723234433
0.5265

## 2. Transfer learning

In [8]:
# Read every per-fold transfer-learning result table ('_tl_' files; 18 MB
# indices x 5 folds, tab-separated, first column used as the index).
dfs = []
for mb in range(18):
    for i in range(5):
        dfs.append(pd.read_csv(
            '../results/molnet_chirality_cls_etkdg_csp{}-5fold_tl_{}.csv'.format(mb, i),
            sep='\t', index_col=0))

# Concatenate once, after all files are loaded. Doing this inside the outer
# loop rebuilt the whole frame on every iteration for the same final result.
df = pd.concat(dfs, ignore_index=True)

In [9]:
# Binarise the targets: binary_cls maps classes 0/1 -> 0 and 2/3 -> 1.
df['Class_b'] = df['Class'].apply(binary_cls)
# Binarise the predictions: binary_pred turns each comma-separated 4-way
# prediction string into a single score for binary label 1.
df['Pred_b'] = df['Pred'].apply(binary_pred)

In [10]:
# Evaluate each binary metric separately for every value of the 'MB' column.
grouped_by_mb = df.groupby('MB')
auc = grouped_by_mb.apply(group_auc_b)
acc = grouped_by_mb.apply(group_acc_b)
kappa = grouped_by_mb.apply(group_kappa_b)
f1 = grouped_by_mb.apply(group_f1_b)

In [11]:
# Dump each per-MB metric as a header followed by one value per line.
for title, scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    print(title, '\n' + '\n'.join(scores.astype(str).tolist()), '\n')

AUC: 
0.9210753570130602
0.9056131448592486
0.8902443140231847
0.9222745675018635
0.9052515278290473
0.8986084018676626
0.8570461679887467
0.958475004655865
0.8035248213538786
0.8075753438708568
0.8378784453018445
0.7999471278990895
0.9239899080240069
0.8017578712901089
0.911137516237768
0.8398149568933673
0.8735790494991003
0.7831831038942306 

ACC: 
0.9120689655172414
0.8648351648351649
0.8881756756756757
0.8571428571428571
0.897003745318352
0.8860655737704918
0.8826923076923077
0.9206185567010309
0.8775897647544366
0.8916015625
0.8844155844155844
0.8651542649727768
0.9119760479041916
0.8682027649769585
0.8856
0.9408333333333333
0.8639130434782609
0.8273381294964028 

KAPPA: 
0.7951814319989198
0.7295527581124508
0.7482346411207845
0.7123012495318511
0.7576247922851898
0.7079972724830735
0.718306395420327
0.8408997715423818
0.6617757317894766
0.6739669193912581
0.6954538047906142
0.6443832468694796
0.7910129347405986
0.6356538177036246
0.7482543286171803
0.7394782225810002
0.69142152

### Average over the k folds

In [5]:
# Read every per-fold transfer-learning result table (18 MB indices x 5
# folds) and tag each row with the fold it came from.
dfs = []
for mb in range(18):
    for i in range(5):
        df_tmp = pd.read_csv(
            '../results/molnet_chirality_cls_etkdg_csp{}-5fold_tl_{}.csv'.format(mb, i),
            sep='\t', index_col=0)
        # The fold id is stored as a string, so it is a label, not a number.
        df_tmp['k-fold'] = str(i)
        dfs.append(df_tmp)

# Concatenate once, after all files are loaded. Doing this inside the outer
# loop rebuilt the whole frame on every iteration for the same final result.
df = pd.concat(dfs, ignore_index=True)

In [7]:
# Binarise the targets: binary_cls maps classes 0/1 -> 0 and 2/3 -> 1.
df['Class_b'] = df['Class'].apply(binary_cls)
# Binarise the predictions: binary_pred turns each comma-separated 4-way
# prediction string into a single score for binary label 1.
df['Pred_b'] = df['Pred'].apply(binary_pred)

In [8]:
# Inspect the combined frame, which now carries the added 'k-fold',
# 'Class_b' and 'Pred_b' columns.
df

Unnamed: 0,SMILES,MB,Class,Pred,k-fold,Class_b,Pred_b
0,C=CCc1ccccc1OC[C@@H](O)CNC(C)C,0,3,"0.0020866082049906254,0.014219977892935276,0.0...",0,1,0.724598
1,COc1ccc([C@H]2CCCn3nc(-c4ccc(-n5cnc(C)c5)c(OC)...,0,2,"0.0007065583486109972,0.0007496289908885956,0....",0,1,0.730486
2,COc1nc(-c2nc3n(n2)CCC[C@@H]3c2ccc(C(F)(F)F)cc2...,0,2,"0.001747628441080451,5.960829639661824e-06,0.9...",0,1,0.730368
3,C=C[C@@H](CNCc1ccc(OC)cc1)n1ccc2c([N+](=O)[O-]...,0,3,"3.287104846094735e-05,0.000531266734469682,0.0...",0,1,0.730837
4,Cc1ccsc1[C@@]1(C/C=C/OC(=O)C(C)(C)C)C(=O)N(C)c...,0,2,"0.00012049739598296583,0.0037947266828268766,0...",0,1,0.729516
...,...,...,...,...,...,...,...
76695,CN(C)CC[C@@H](c1ccc(Cl)cc1)c1ccccn1,17,3,"1.6406809777436138e-07,2.6435621691689448e-08,...",4,1,0.731059
76696,CN1C(=O)NC(=O)[C@@](C)(C2=CCCCC2)C1=O,17,2,"5.4317742979037575e-06,5.377025331654295e-07,0...",4,1,0.731056
76697,CN(C)CC[C@@H](c1ccc(Cl)cc1)c1ccccn1,17,2,"6.81766607613099e-08,5.547807990780029e-09,0.9...",4,1,0.731059
76698,O=C(c1ccccc1)[C@H](O)c1ccccc1,17,3,"5.6745166148175485e-06,1.878180592029821e-05,0...",4,1,0.731049


In [36]:
# AUC of every (MB, fold) pair; the unnamed result column (0) becomes 'AUC'.
df_auc = (
    df.groupby(by=['MB', 'k-fold'])
    .apply(group_auc_b)
    .reset_index()
    .rename(columns={0: 'AUC'})
)

# Aggregate only the metric column. 'k-fold' holds strings, which older
# pandas silently dropped from mean()/std() but pandas >= 2.0 rejects with a
# TypeError. The double brackets keep one-element rows, so downstream
# `row[0]` lookups still work.
auc_mean = df_auc.groupby('MB')[['AUC']].mean().values.tolist()
auc_std = df_auc.groupby('MB')[['AUC']].std().values.tolist()

In [40]:
# Fold-averaged AUC for each MB group, one value per line.
print(*(row[0] for row in auc_mean), sep='\n')

0.9250601391390202
0.9041683268770617
0.8896905097510794
0.9244999736547485
0.9077816747072822
0.9016841938547323
0.8585752757460312
0.9600295173500983
0.8071180588652981
0.8049177520094677
0.8411523999787441
0.8055916060120992
0.9222247254724574
0.7986084510713227
0.9122796311244061
0.8403872935505449
0.8711263769660306
0.7852354156074568


In [41]:
# Across-fold standard deviation of the AUC for each MB group, one per line.
print(*(row[0] for row in auc_std), sep='\n')

0.01681808073407305
0.016136657388257692
0.008214448373404312
0.010425738955762674
0.012695074642096599
0.015861326904136934
0.014480809115812215
0.01145580442815428
0.04044148320672553
0.018247947271667255
0.026956135427280547
0.031847288801418544
0.015725483007665363
0.026007519187623673
0.020202134844326477
0.045438569925306045
0.016272970737815425
0.012074329937914997


In [45]:
# Cohen's kappa of every (MB, fold) pair; the unnamed result column (0)
# becomes 'KAPPA'.
df_kappa = (
    df.groupby(by=['MB', 'k-fold'])
    .apply(group_kappa_b)
    .reset_index()
    .rename(columns={0: 'KAPPA'})
)

# Aggregate only the metric column. 'k-fold' holds strings, which older
# pandas silently dropped from mean()/std() but pandas >= 2.0 rejects with a
# TypeError. The double brackets keep one-element rows, so downstream
# `row[0]` lookups still work.
kappa_mean = df_kappa.groupby('MB')[['KAPPA']].mean().values.tolist()
kappa_std = df_kappa.groupby('MB')[['KAPPA']].std().values.tolist()

In [46]:
# Fold-averaged kappa for each MB group, one value per line.
print(*(row[0] for row in kappa_mean), sep='\n')

0.7937109321710841
0.7296646161692705
0.7483877998404702
0.7123817743682315
0.7574001998090558
0.706871045503451
0.7180663015089545
0.8405207328202995
0.659810124305045
0.6736078234214404
0.6952907185420049
0.6433896170433432
0.7890413680732126
0.6325636397451166
0.747183805259875
0.7379787496710118
0.6903303313021979
0.5357122656239633


In [47]:
# Across-fold standard deviation of kappa for each MB group, one per line.
print(*(row[0] for row in kappa_std), sep='\n')

0.029838111798518422
0.029876247211846754
0.022644851426632005
0.03849447606296199
0.008980164226781504
0.024893192521558114
0.022501109227369993
0.031874678875502466
0.06430171437291059
0.02225859412043273
0.014680761462882028
0.0470213040490641
0.021747170092645447
0.07031453816207404
0.04039498485248106
0.056090214128358684
0.040428319244924564
0.04393073087385678


In [48]:
# Micro-averaged F1 of every (MB, fold) pair; the unnamed result column (0)
# becomes 'F1'.
df_f1 = (
    df.groupby(by=['MB', 'k-fold'])
    .apply(group_f1_b)
    .reset_index()
    .rename(columns={0: 'F1'})
)

# Aggregate only the metric column. 'k-fold' holds strings, which older
# pandas silently dropped from mean()/std() but pandas >= 2.0 rejects with a
# TypeError. The double brackets keep one-element rows, so downstream
# `row[0]` lookups still work.
f1_mean = df_f1.groupby('MB')[['F1']].mean().values.tolist()
f1_std = df_f1.groupby('MB')[['F1']].std().values.tolist()

In [49]:
# Fold-averaged F1 for each MB group, one value per line.
print(*(row[0] for row in f1_mean), sep='\n')

0.9120689655172415
0.8648351648351648
0.8881756756756756
0.8571428571428571
0.897003745318352
0.8860655737704919
0.8826923076923077
0.9206185567010309
0.8775897647544367
0.8916015625
0.8844155844155844
0.8651542649727768
0.9119760479041916
0.8682027649769586
0.8855999999999999
0.9408333333333333
0.8639130434782608
0.8273381294964028


In [50]:
# Across-fold standard deviation of F1 for each MB group, one per line.
print(*(row[0] for row in f1_std), sep='\n')

0.010380684981717462
0.014845336358358573
0.011908484278388266
0.01942601047215788
0.007006849038902544
0.006734293739817642
0.00883868002739033
0.015824826219596647
0.018001745824903503
0.006291063831408753
0.010779133851072516
0.014369248822124267
0.0040168885823349725
0.019167404924502692
0.018242806801586242
0.011562030771259658
0.01526390407090615
0.017437697328835627


## ~~3. Input enantiomers~~

In [14]:
# Read every per-fold result table from '../results2' (8 MB indices x 5
# folds, tab-separated, first column used as the index).
dfs = []
for mb in range(8):
    for i in range(5):
        dfs.append(pd.read_csv(
            '../results2/molnet_chirality_cls_etkdg_csp{}-5fold_{}.csv'.format(mb, i),
            sep='\t', index_col=0))

# Concatenate once, after all files are loaded. Doing this inside the outer
# loop rebuilt the whole frame on every iteration for the same final result.
df = pd.concat(dfs, ignore_index=True)

In [15]:
# Binarise the targets: binary_cls maps classes 0/1 -> 0 and 2/3 -> 1.
df['Class_b'] = df['Class'].apply(binary_cls)
# Binarise the predictions: binary_pred turns each comma-separated 4-way
# prediction string into a single score for binary label 1.
df['Pred_b'] = df['Pred'].apply(binary_pred)

In [16]:
# Evaluate each binary metric separately for every value of the 'MB' column.
grouped_by_mb = df.groupby('MB')
auc = grouped_by_mb.apply(group_auc_b)
acc = grouped_by_mb.apply(group_acc_b)
kappa = grouped_by_mb.apply(group_kappa_b)
f1 = grouped_by_mb.apply(group_f1_b)

In [17]:
# Dump each per-MB metric as a header followed by one value per line.
for title, scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    print(title, '\n' + '\n'.join(scores.astype(str).tolist()), '\n')

AUC: 
0.8522865365098017
0.8487269458299501
0.8562846850551635
0.8721774449141592
0.8462670277362209
0.8469771156263091
0.8085270198739234
0.9240780056934579 

ACC: 
0.853448275862069
0.8318681318681319
0.8621621621621621
0.8010989010989011
0.8550561797752809
0.8475409836065574
0.8448224852071006
0.8871134020618556 

KAPPA: 
0.6553604384542251
0.6633069099104763
0.6854698575280131
0.5995594692236252
0.6578240271915469
0.6024694126385716
0.6241043426856498
0.7737723772909767 

F1: 
0.853448275862069
0.8318681318681319
0.8621621621621621
0.8010989010989011
0.8550561797752809
0.8475409836065574
0.8448224852071006
0.8871134020618556 

