In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import preprocessing
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [2]:
def pred2cls_df(df):
    """Turn the paired probability strings in ``df.Pred`` into class indices.

    Each ``Pred`` entry is a comma-separated list of floats that is split
    into two equally sized halves; the halves are averaged element-wise and
    the index of the largest averaged value is returned for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Pred`` column of comma-separated numeric strings.

    Returns
    -------
    numpy.ndarray
        One integer class index per row of ``df``.
    """
    rows = [np.array(entry.split(','), dtype=float) for entry in df.Pred.tolist()]
    # (n_rows, 2, n_values_per_half): axis 1 indexes the two halves of a row.
    halves = np.array(rows).reshape(len(df), 2, -1)
    averaged = halves.mean(axis=1)
    return averaged.argmax(axis=1)

def pred2prob_df(df):
    """Average the two halves of every ``df.Pred`` entry.

    Each ``Pred`` entry is a comma-separated list of floats that is split
    into two equally sized halves; the element-wise mean of the halves is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Pred`` column of comma-separated numeric strings.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), n_values_per_half)``.
    """
    rows = [np.array(entry.split(','), dtype=float) for entry in df.Pred.tolist()]
    # (n_rows, 2, n_values_per_half): axis 1 indexes the two halves of a row.
    halves = np.array(rows).reshape(len(df), 2, -1)
    return halves.mean(axis=1)

def group_f1(df):
    """Micro-averaged F1 score of the predicted classes for one group.

    Labels are read from ``df.Class`` (cast to int) and predictions come
    from ``pred2cls_df``.

    Note: for single-label multiclass targets, micro-averaged F1 is the same
    number as plain accuracy (see the scikit-learn model-evaluation guide).
    """
    y_true = np.asarray(df['Class'].tolist(), dtype=int)
    y_pred = pred2cls_df(df)
    return metrics.f1_score(y_true, y_pred, average='micro')

def group_kappa(df):
    """Cohen's kappa between ``df.Class`` and the predicted classes.

    Labels are read from ``df.Class`` (cast to int) and predictions come
    from ``pred2cls_df``.
    """
    y_true = np.asarray(df['Class'].tolist(), dtype=int)
    y_pred = pred2cls_df(df)
    return metrics.cohen_kappa_score(y_true, y_pred)

def group_acc(df):
    """Accuracy of the predicted classes against ``df.Class`` for one group.

    Labels are read from ``df.Class`` (cast to int) and predictions come
    from ``pred2cls_df``.
    """
    y_true = np.asarray(df['Class'].tolist(), dtype=int)
    y_pred = pred2cls_df(df)
    return metrics.accuracy_score(y_true, y_pred)

def group_auc(df):
    """Macro-averaged one-vs-rest ROC AUC for one group.

    Labels are read from ``df.Class`` (cast to int); scores are the averaged
    values returned by ``pred2prob_df``. The class set is fixed to
    ``[0, 1, 2, 3]``.

    Notes kept from earlier experiments in this function:
    * ``average='weighted'`` was tried as an alternative to ``'macro'``.
    * Computing the AUC class by class (LabelBinarizer + per-column
      ``roc_auc_score``) gave the same results as this call.
    * A per-class binary accuracy on one-hot labels/predictions was also
      computed; the AUC is sometimes smaller than that binary accuracy.
    """
    y_true = np.asarray(df['Class'].tolist(), dtype=int)
    y_score = pred2prob_df(df)
    return metrics.roc_auc_score(
        y_true,
        y_score,
        multi_class='ovr',
        average='macro',
        labels=[0, 1, 2, 3],
    )

## 1. Independent training

In [21]:
# Read every per-fold result file (18 'csp' indices x 5 folds, tab-separated,
# first column used as index) and stack them into one frame with a fresh index.
dfs = [
    pd.read_csv(
        '../results0804/molnet_chirality_cls_etkdg_csp{}-5fold_{}.csv'.format(str(mb), str(i)),
        sep='\t',
        index_col=0,
    )
    for mb in range(18)
    for i in range(5)
]
df = pd.concat(dfs, ignore_index=True)
    
# print('AUC:', df.groupby('MB').apply(group_auc), '\n')
# print('ACC:', df.groupby('MB').apply(group_acc), '\n')
# print('KAPPA:', df.groupby('MB').apply(group_kappa), '\n')
# print('F1:', df.groupby('MB').apply(group_f1), '\n')

In [22]:
# Keep the stereo-annotated SMILES under 'SMILES_iso'. The guard makes this
# cell safe to re-run: once the non-isomeric 'SMILES' column exists, renaming
# it again would produce two columns both called 'SMILES_iso'.
if 'SMILES_iso' not in df.columns:
    df = df.rename(columns={'SMILES': 'SMILES_iso'})

# Parse every SMILES once and reuse the molecule for both derived columns.
mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES_iso']]
bad_smiles = [smi for smi, mol in zip(df['SMILES_iso'], mols) if mol is None]
if bad_smiles:
    # MolFromSmiles returns None on unparseable input; fail with a readable
    # message instead of an opaque error from a later RDKit call.
    raise ValueError('RDKit could not parse {} SMILES, e.g. {!r}'.format(len(bad_smiles), bad_smiles[0]))

# Stereo-free SMILES: rows that differ only in stereochemistry share this key.
df['SMILES'] = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols]

df['MB'] = df['MB'].astype(int)

# One character per atom: '0' when the chiral tag is unspecified, '1' otherwise.
df['Chir_tag'] = [
    ''.join(
        '0' if atom.GetChiralTag() == Chem.ChiralType.CHI_UNSPECIFIED else '1'
        for atom in mol.GetAtoms()
    )
    for mol in mols
]

In [23]:
# Collapse repeated (SMILES_iso, Chir_tag, MB) rows. Sorting in descending
# order first means the row kept for each duplicate set is the one with the
# largest 'Class'; sort_index() then restores the original row order.
dedup_keys = ['SMILES_iso', 'Chir_tag', 'MB']
df_uniq = (
    df.sort_values(dedup_keys + ['Class'], ascending=False)
    .drop_duplicates(dedup_keys, keep='first')
    .sort_index()
)

# Keep only (SMILES, Chir_tag, MB) groups that contain exactly two rows
# (presumably the two stereoisomers of a pair -- confirm against the data).
pair_keys = ['SMILES', 'Chir_tag', 'MB']
df_uniq = df_uniq.groupby(by=pair_keys).filter(lambda grp: len(grp) == 2)

In [24]:
# Concatenate the 'Pred' strings of the rows in every (SMILES, Chir_tag, MB)
# group into one comma-separated string, one output row per group.
group_cols = ['SMILES', 'Chir_tag', 'MB']
df_pred = (
    df_uniq.groupby(by=group_cols)['Pred']
    .apply(lambda preds: ','.join(preds))
    .to_frame(name='Pred')
    .reset_index()
)

# Attach a 'Class' label to every group and keep a single row per group.
# NOTE(review): the label is taken from the first matching row of `df`, not
# from the de-duplicated `df_uniq` -- confirm that this is intended.
merge_keys = ['SMILES', 'MB', 'Chir_tag']
label_cols = df[['SMILES', 'MB', 'Chir_tag', 'Class']]
df_pred = (
    df_pred.merge(label_cols, on=merge_keys, how='left')
    .drop_duplicates(merge_keys, keep='first')
    .sort_index()
)

In [38]:
# Evaluate every metric separately for each 'MB' value.
per_mb = df_pred.groupby('MB')
auc = per_mb.apply(group_auc)
acc = per_mb.apply(group_acc)
kappa = per_mb.apply(group_kappa)
f1 = per_mb.apply(group_f1)

In [39]:
# Print each metric as a header followed by one value per line (in the order
# of the per-'MB' results), so the columns can be copied out directly.
for header, scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    body = '\n' + '\n'.join(scores.astype(str).tolist())
    print(header, body, '\n')

AUC: 
0.8645877276039586
0.9363571463847923
0.8661955168052251
0.9690625024921604
0.8357262488227997
0.8948527817547486
0.5556805409229045
0.8668558595336116
0.8270205478796306
0.8825061441620783
0.6902781645076491
0.7919143664540496
0.9204248822105785
0.7983983409862165
0.7749143905827255
0.6542733961903608
0.7639620290609388
0.9102287071962442 

ACC: 
0.8316831683168316
0.839344262295082
0.8034934497816594
0.8703170028818443
0.786618444846293
0.8487712665406427
0.6566000974184121
0.7560606060606061
0.8607725829086343
0.8867286559594252
0.7026591458501209
0.8537678588103464
0.8533916849015317
0.8138297872340425
0.7593152064451159
0.8086124401913876
0.7762660619803476
0.8315217391304348 

KAPPA: 
0.6972849083215797
0.7663566012663175
0.7061902757962372
0.8140584438781585
0.6437287979306294
0.7603244002446594
0.03328046214505864
0.6376124574554434
0.6997132858349318
0.7523997748738611
0.40963425167919343
0.6768267572424899
0.778785229711082
0.6300507259051951
0.5530859547864077
0.326530

In [6]:
# def process_prob(x): 
#     x = np.array(x.split(','), dtype=float)
#     return x

# def plot_roc_curve(mb_idx, save_fig=False, print_confusion_metrics=False):
#     lb = preprocessing.LabelBinarizer()
#     if mb_idx == 'all':
#         df_tmp = df
#     else: 
#         df_tmp = df[df['MB'] == mb_idx]
        
#     class_oh = lb.fit_transform(df_tmp['Class'])
#     pred_prob = df_tmp['Pred'].apply(process_prob)
#     pred_prob = np.array(pred_prob.tolist())

#     f, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 9), gridspec_kw={'height_ratios': [0.6, 2]})

#     sns.countplot(x=df_tmp["Class"], ax=ax1)

#     for c in range(4): 
#         fpr, tpr, thresh = metrics.roc_curve(class_oh[:, c], pred_prob[:, c])
#         auc = metrics.roc_auc_score(class_oh[:, c], pred_prob[:, c])
#         ax2.plot(fpr, tpr, label="class {} vs the rest (AUC={:.2f})".format(c, auc))

#     ax2.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
#     ax2.axis("square")
#     ax2.set_xlabel("False Positive Rate")
#     ax2.set_ylabel("True Positive Rate")
#     ax2.set_title("One-vs-Rest ROC curves (encoded csp: {})".format(mb_idx))
#     ax2.legend()

#     plt.subplots_adjust(hspace=.3)
#     if save_fig: 
#         plt.savefig('./roc_curve_{}.png'.format(str(mb_idx)), dpi=300, bbox_inches='tight')
#         print('Save!')
#     plt.show()
    
#     # confusion metrics
#     if print_confusion_metrics: 
#         pred = np.argmax(pred_prob, axis=1)
#         print('confusion metrics: \n[[tn, fp], \n[fn, tp]]\n')
#         print(metrics.multilabel_confusion_matrix(df_tmp['Class'].to_numpy(), pred))

In [7]:
# plot_roc_curve('all', save_fig=True)

In [8]:
# plot_roc_curve(2, save_fig=True)

## 2. Transfer learning

In [3]:
# Read every per-fold transfer-learning ('_tl_') result file (18 'csp' indices
# x 5 folds, tab-separated, first column used as index) and stack them into
# one frame with a fresh index.
dfs = [
    pd.read_csv(
        '../results0804/molnet_chirality_cls_etkdg_csp{}-5fold_tl_{}.csv'.format(str(mb), str(i)),
        sep='\t',
        index_col=0,
    )
    for mb in range(18)
    for i in range(5)
]
df = pd.concat(dfs, ignore_index=True)

In [4]:
# Keep the stereo-annotated SMILES under 'SMILES_iso'. The guard makes this
# cell safe to re-run: once the non-isomeric 'SMILES' column exists, renaming
# it again would produce two columns both called 'SMILES_iso'.
if 'SMILES_iso' not in df.columns:
    df = df.rename(columns={'SMILES': 'SMILES_iso'})

# Parse every SMILES once and reuse the molecule for both derived columns.
mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES_iso']]
bad_smiles = [smi for smi, mol in zip(df['SMILES_iso'], mols) if mol is None]
if bad_smiles:
    # MolFromSmiles returns None on unparseable input; fail with a readable
    # message instead of an opaque error from a later RDKit call.
    raise ValueError('RDKit could not parse {} SMILES, e.g. {!r}'.format(len(bad_smiles), bad_smiles[0]))

# Stereo-free SMILES: rows that differ only in stereochemistry share this key.
df['SMILES'] = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols]

df['MB'] = df['MB'].astype(int)

# One character per atom: '0' when the chiral tag is unspecified, '1' otherwise.
df['Chir_tag'] = [
    ''.join(
        '0' if atom.GetChiralTag() == Chem.ChiralType.CHI_UNSPECIFIED else '1'
        for atom in mol.GetAtoms()
    )
    for mol in mols
]

In [5]:
# Collapse repeated (SMILES_iso, Chir_tag, MB) rows. Sorting in descending
# order first means the row kept for each duplicate set is the one with the
# largest 'Class'; sort_index() then restores the original row order.
dedup_keys = ['SMILES_iso', 'Chir_tag', 'MB']
df_uniq = (
    df.sort_values(dedup_keys + ['Class'], ascending=False)
    .drop_duplicates(dedup_keys, keep='first')
    .sort_index()
)

# Keep only (SMILES, Chir_tag, MB) groups that contain exactly two rows
# (presumably the two stereoisomers of a pair -- confirm against the data).
pair_keys = ['SMILES', 'Chir_tag', 'MB']
df_uniq = df_uniq.groupby(by=pair_keys).filter(lambda grp: len(grp) == 2)

In [6]:
# Concatenate the 'Pred' strings of the rows in every (SMILES, Chir_tag, MB)
# group into one comma-separated string, one output row per group.
group_cols = ['SMILES', 'Chir_tag', 'MB']
df_pred = (
    df_uniq.groupby(by=group_cols)['Pred']
    .apply(lambda preds: ','.join(preds))
    .to_frame(name='Pred')
    .reset_index()
)

# Attach a 'Class' label to every group and keep a single row per group.
# NOTE(review): the label is taken from the first matching row of `df`, not
# from the de-duplicated `df_uniq` -- confirm that this is intended.
merge_keys = ['SMILES', 'MB', 'Chir_tag']
label_cols = df[['SMILES', 'MB', 'Chir_tag', 'Class']]
df_pred = (
    df_pred.merge(label_cols, on=merge_keys, how='left')
    .drop_duplicates(merge_keys, keep='first')
    .sort_index()
)

In [7]:
# Evaluate every metric separately for each 'MB' value.
per_mb = df_pred.groupby('MB')
auc = per_mb.apply(group_auc)
acc = per_mb.apply(group_acc)
kappa = per_mb.apply(group_kappa)
f1 = per_mb.apply(group_f1)

In [8]:
# Print each metric as a header followed by one value per line (in the order
# of the per-'MB' results), so the columns can be copied out directly.
for header, scores in (('AUC:', auc), ('ACC:', acc), ('KAPPA:', kappa), ('F1:', f1)):
    body = '\n' + '\n'.join(scores.astype(str).tolist())
    print(header, body, '\n')

AUC: 
0.9172806196071179
0.9591225688589473
0.9326098564479821
0.9764650483701156
0.7943664091235056
0.9367433089400096
0.8435979712253532
0.9320889598370901
0.8492060005699567
0.8399395402643108
0.7757201632559194
0.797367226058238
0.9677893583022228
0.8353470421371757
0.7873288949151485
0.6474953162762399
0.8154299754587948
0.8978013390726056 

ACC: 
0.9001956947162426
0.8655737704918033
0.8557993730407524
0.8645533141210374
0.7956600361663653
0.888468809073724
0.8692157817827569
0.8636363636363636
0.8711774265031762
0.871513102282333
0.8130539887187752
0.8516201325987487
0.9059080962800875
0.849290780141844
0.8036253776435045
0.8883572567783095
0.7770219198790628
0.8695652173913043 

KAPPA: 
0.8216258393051477
0.8043709520978692
0.785832088699045
0.8068271998294385
0.6360026639586969
0.8239441783854827
0.729667685465256
0.7947571508906383
0.7244741806107071
0.7074739344725447
0.6294707147352286
0.6712946307634797
0.860798050563509
0.7134694463039113
0.6258434391448512
0.640599410415