In [89]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pickle
import pandas as pd
from sklearn.model_selection import KFold

import lime
from lime import lime_tabular

import warnings
warnings.filterwarnings('ignore')
import json
import pickle

## Load data

In [54]:
# 1. Load data
data_path = 'data/processed/caco_maccs_all.csv'
df = pd.read_csv(data_path)

# 2. Split data to X and y
# The first column is the target; every remaining column is a feature.
y = np.array(df.iloc[:, 0])
X = np.array(df.iloc[:, 1:])

# LIME

## RandomForestRegressor

In [56]:
def get_explanations(train_x, test_x, model):
    """Explain every row of ``test_x`` with LIME and pool the results.

    A ``LimeTabularExplainer`` (regression mode, ``random_state=42``) is built
    from ``train_x``; each row of ``test_x`` is then explained against
    ``model.predict`` with all features requested.

    Args:
        train_x: 2-D array of training features used to build the explainer.
        test_x: Iterable of feature rows to explain.
        model: Object exposing a ``predict`` method.

    Returns:
        Flat list of ``(feature_index, weight)`` tuples, concatenated over the
        rows of ``test_x`` in iteration order.
    """
    # Derive the feature count from the data instead of hard-coding 166, so
    # the function also works for fingerprints of a different width.
    n_features = np.shape(train_x)[1]
    # Features are labelled by their zero-based column position.
    feature_labels = list(range(n_features))
    explainer = lime_tabular.LimeTabularExplainer(
        train_x,
        feature_names=feature_labels,
        class_names=['permeability'],
        mode="regression",
        random_state=42,
    )
    explanations = []
    for example in test_x:
        explanation = explainer.explain_instance(
            example, model.predict, num_features=n_features
        )
        # Entry 1 of the map holds the (feature index, weight) pairs used here.
        explanations.extend(explanation.as_map()[1])
    return explanations

In [57]:
# Re-create the seeded 5-fold split and explain every held-out sample with the
# random-forest model saved for that fold.
# NOTE(review): presumably these are the folds the pickled models were trained on - confirm.
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
explanations = []
for fold_1, (train_idx, test_idx) in enumerate(first_splits.split(np.arange(len(X)))):
    train_x = X[train_idx]
    test_x = X[test_idx]
    # Targets are not passed to LIME; kept so the fold's labels stay available.
    train_y = y[train_idx]
    test_y = y[test_idx]
    # Context manager closes the model file (a bare open() leaked the handle).
    # NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
    with open(f'models/caco_maccs_rf_{fold_1+1}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    # extend() grows the list in place instead of rebuilding it on every fold.
    explanations.extend(get_explanations(train_x, test_x, model))

In [65]:
# Persist the pooled (feature index, weight) pairs from the random-forest fold loop.
pd.DataFrame(explanations).to_csv('explanations_caco_maccs_rf.csv')

In [73]:
# Keep the random-forest results under their own name; `explanations` is reused by later sections.
explanations_rf = explanations

In [74]:
# Mean LIME weight per feature for the random forest: sum each feature's
# weights over all explained samples, then divide by the number of samples.
scores_rf = np.zeros(X.shape[1])  # one slot per feature column, not a hard-coded 166
for feature_idx, weight in explanations_rf:
    scores_rf[feature_idx] += weight
scores_rf /= len(X)

In [82]:
# Zero-based index of the feature with the largest absolute mean weight.
np.argmax(abs(scores_rf))

36

In [88]:
# Feature indices ordered from largest to smallest absolute mean weight.
top_f_rf = np.flip(np.argsort(np.abs(scores_rf)))
top_f_rf

array([ 36,  67, 130,  44,  33,  51,  24,  28,  69,  46,  22, 113,  25,
         7, 145,  48,  20,  89,  52,  45, 121, 154,  68, 124,  49,  35,
       118,  18,  10, 102,  27,  77, 137,  41, 114, 132,  65, 156,  39,
        73,  14,  75,  58,  42, 128,  78, 108, 104,  90,  56,  32,  40,
        31,  96, 125,  26, 101,  98,  47, 139,  63,  70, 135,  95,  15,
        86,  72,  29,  97, 115,  84, 122,  94, 140, 112, 103, 117, 107,
       142, 143, 126,  92, 100,  54,  80,  71,  79,  74, 105,  59,  66,
        12,  50,  37,  57, 127,  55,  53,  16,  60,  87, 146,  76,  83,
       106,  82, 134, 129, 138,  81,  64,  23,  93,  88,  61,  62, 131,
       133, 148,  38, 111, 151, 123,  85,   6,   1,   2,   3,   4,   5,
         9,   8,  19,  11,  17,  21,  13, 165,  30,  34, 163, 162, 161,
       160, 159, 158, 157, 155, 153, 152, 150, 149, 147, 144, 141, 136,
       120, 119, 116, 110, 109,  99,  91, 164,  43,   0])

In [106]:
# Signed mean weights rearranged to follow the ranking in top_f_rf.
top_scores_rf = list(scores_rf[top_f_rf])
top_scores_rf

[-15.81720225187075,
 -7.298985079143284,
 -6.9426409244524,
 -6.388062800107298,
 -6.3578946336069135,
 -4.141416089624161,
 -2.7128179084553525,
 -2.6543703893985495,
 -2.2960117241988183,
 -1.6780143595679726,
 -1.572638514233279,
 -1.4530338692933777,
 -1.3822728365311205,
 -1.1715818552156267,
 -1.06881815808826,
 -0.8859142785499066,
 -0.8712550523301367,
 -0.8324460127772877,
 0.7620194548858786,
 -0.7204407800616633,
 -0.7066239613780689,
 -0.6903391995895811,
 -0.6894612818221392,
 -0.6672899522711258,
 -0.6266967420884327,
 -0.5982003890659295,
 -0.5741365722802052,
 -0.4817094139299478,
 -0.4670481482570559,
 -0.45283983304184805,
 -0.4429255603302425,
 -0.4354115032227754,
 -0.42543492993940646,
 -0.4032549968606807,
 -0.4013778427462118,
 -0.3860040481522615,
 -0.37541277059821043,
 -0.37439891915289747,
 -0.3550692592876539,
 -0.3510880943600455,
 -0.33692315932792133,
 -0.3157476214253147,
 -0.2854651047501932,
 0.27345860746025935,
 -0.27178955436280683,
 -0.26722032079

In [91]:
# Load the lookup used by ExpToDf, which reads entry [0] as the SMARTS string
# and entry [1] as the description for each key.
# NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
with open('maccs_keys_dict.pickle', 'rb') as fp:
    Maccs_keys_dict = pickle.load(fp)

In [156]:
def ExpToDf(indexes, explanations):
    """Function to convert explanation from lime to pandas dataframe with extra informations"""
    df_exp = pd.DataFrame({'Key': list(indexes), 'Influence': list(explanations)})
    df_exp['Key']+=1
    df_exp['Influence sign'] = np.where(df_exp['Influence'] < 0, "negative", np.where(df_exp['Influence'] >0, 'positive', 'neutral')) 
    df_exp = df_exp[['Key', 'Influence', 'Influence sign']]
    df_exp['Smarts'] = df_exp.apply(lambda x: Maccs_keys_dict[x.Key][0], axis=1)
    df_exp['Smarts'] = df_exp['Smarts'].str.replace('$', '$\$$')
    df_exp['Description'] = df_exp.apply(lambda x: Maccs_keys_dict[x.Key][1], axis=1)
    df_exp['Description'] = df_exp['Description'].str.replace('$', '$\$$')
    df1 = df_exp.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    df1.set_properties(**{'text-align': 'center'}).hide_index()  
    return df1

In [142]:
# Show the ranked random-forest features together with their SMARTS and descriptions.
ExpToDf(top_f_rf,top_scores_rf)

Key,Influence,Influence sign,Smarts,Description
37,-15.817202,negative,[#7]~[#6](~[#8])~[#7],NC(O)N
68,-7.298985,negative,[!#6;!#1;!H0]~[!#6;!#1;!H0],QHQH (&...)
131,-6.942641,negative,[!#6;!#1;!H0],QH > 1
45,-6.388063,negative,[#6]=[#6]~[#7],C=CN
34,-6.357895,negative,[CH2]=*,CH2=A
52,-4.141416,negative,[#7]~[#7],NN
25,-2.712818,negative,[#7]~[#6](~[#7])~[#7],NC(N)N
29,-2.65437,negative,[#15],P
70,-2.296012,negative,[!#6;!#1]~[#7]~[!#6;!#1],QNQ
47,-1.678014,negative,[#16]~*~[#7],SAN


## HistGradientBoostingRegressor

In [121]:
# Re-create the seeded 5-fold split and explain every held-out sample with the
# HistGradientBoosting model saved for that fold.
# NOTE(review): presumably these are the folds the pickled models were trained on - confirm.
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
explanations = []
for fold_1, (train_idx, test_idx) in enumerate(first_splits.split(np.arange(len(X)))):
    train_x = X[train_idx]
    test_x = X[test_idx]
    # Targets are not passed to LIME; kept so the fold's labels stay available.
    train_y = y[train_idx]
    test_y = y[test_idx]
    # Context manager closes the model file (a bare open() leaked the handle).
    # NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
    with open(f'models/caco_maccs_hist_{fold_1+1}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    # extend() grows the list in place instead of rebuilding it on every fold.
    explanations.extend(get_explanations(train_x, test_x, model))

In [122]:
# Keep the HistGradientBoosting results under their own name; `explanations` is reused by later sections.
explanations_hist = explanations

In [123]:
# Persist the pooled (feature index, weight) pairs for the HistGradientBoosting models.
pd.DataFrame(explanations_hist).to_csv('explanations_caco_maccs_hist.csv')

In [124]:
# Mean LIME weight per feature for HistGradientBoosting: sum each feature's
# weights over all explained samples, then divide by the number of samples.
scores_hist = np.zeros(X.shape[1])  # one slot per feature column, not a hard-coded 166
for feature_idx, weight in explanations_hist:
    scores_hist[feature_idx] += weight
scores_hist /= len(X)

In [125]:
# Feature indices ordered from largest to smallest absolute mean weight.
top_f_hist = np.flip(np.argsort(np.abs(scores_hist)))
top_f_hist

array([ 67,  36, 130,  51,  44,  42,  46,  28, 154,  33,  58,  70,  89,
       113, 132,  23,  69,  52,  10, 145, 114,  77,  83,  48, 128,  49,
       125,  16,  78, 108, 129, 121, 124, 138,  97,  41,  14,  37,   7,
       139,  68, 156,  96,  56,  55,  35,  94,  29, 104, 101, 115, 137,
        71, 102,  84, 118, 112,  65,  31, 135, 107, 127, 126,  98,  95,
        40,  74,  45,  27,  82, 105,  90,  86,  88, 122,  47,  22,  63,
        38, 140,  18,  73, 117,  15, 131,  20,  75,  26,  61,  39,  53,
        80,  24, 134,  12, 103,  50,  81,  62,  66,  76,  25,  87,  72,
        64, 133,  32,  92,  59,  85, 100,  54, 151,  93, 148,  79,  60,
        57, 123, 142, 106, 143, 111, 146,   5,  17,  19,  21,   4,   3,
         2,   6,   1,   9,  11,  13,   8, 165,  30,  34, 163, 162, 161,
       160, 159, 158, 157, 155, 153, 152, 150, 149, 147, 144, 141, 136,
       120, 119, 116, 110, 109,  99,  91, 164,  43,   0])

In [126]:
# Signed mean weights rearranged to follow the ranking in top_f_hist.
top_scores_hist = list(scores_hist[top_f_hist])
top_scores_hist

[-8.5664482702949,
 -6.754716372014182,
 -4.715661670544067,
 -3.8376063325615717,
 -3.541059756632159,
 2.494216182606052,
 -2.3583657087919163,
 -1.9076252236720528,
 -1.657809921269998,
 -1.6405601754688064,
 1.6074974051838355,
 1.4871586380533037,
 -1.4715425643240008,
 -1.1786846137482134,
 -1.0498201651269052,
 1.043147968891804,
 -1.0252317745237853,
 0.9906165600135396,
 -0.9334830757010698,
 -0.9202494433953771,
 -0.9154643159839883,
 0.8828403382242607,
 0.866346469899764,
 -0.8441514795733022,
 -0.8412262269639361,
 -0.8284964233965265,
 -0.7675998316276526,
 0.763524441268017,
 -0.6384129109958021,
 -0.6206343055578439,
 0.5799054505437692,
 -0.5577053284032201,
 -0.5333783191263315,
 0.46856904257009735,
 0.4681278171225081,
 -0.4644723616737188,
 -0.4550250453258556,
 0.44107173161996993,
 -0.4268586929831499,
 0.37776288753499593,
 -0.367331956332718,
 -0.36038032215865,
 -0.35005734554164764,
 -0.33862611454737934,
 0.33022549803013296,
 -0.3093880327755754,
 0.3053854

In [143]:
# Show the ranked HistGradientBoosting features together with their SMARTS and descriptions.
ExpToDf(top_f_hist,top_scores_hist)

Key,Influence,Influence sign,Smarts,Description
68,-8.566448,negative,[!#6;!#1;!H0]~[!#6;!#1;!H0],QHQH (&...)
37,-6.754716,negative,[#7]~[#6](~[#8])~[#7],NC(O)N
131,-4.715662,negative,[!#6;!#1;!H0],QH > 1
52,-3.837606,negative,[#7]~[#7],NN
45,-3.54106,negative,[#6]=[#6]~[#7],C=CN
43,2.494216,positive,[!#6;!#1;!H0]~*~[!#6;!#1;!H0],QHAQH
47,-2.358366,negative,[#16]~*~[#7],SAN
29,-1.907625,negative,[#15],P
155,-1.65781,negative,*!@[CH2]!@*,A!CH2!A
34,-1.64056,negative,[CH2]=*,CH2=A


## LGBM

In [144]:
# Re-create the seeded 5-fold split and explain every held-out sample with the
# LGBM model saved for that fold.
# NOTE(review): presumably these are the folds the pickled models were trained on - confirm.
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
explanations = []
for fold_1, (train_idx, test_idx) in enumerate(first_splits.split(np.arange(len(X)))):
    train_x = X[train_idx]
    test_x = X[test_idx]
    # Targets are not passed to LIME; kept so the fold's labels stay available.
    train_y = y[train_idx]
    test_y = y[test_idx]
    # Context manager closes the model file (a bare open() leaked the handle).
    # NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
    with open(f'models/caco_maccs_lgbm_{fold_1+1}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    # extend() grows the list in place instead of rebuilding it on every fold.
    explanations.extend(get_explanations(train_x, test_x, model))

In [145]:
# Keep the LGBM results under their own name; `explanations` is reused by later sections.
explanations_lgbm = explanations

In [146]:
# Persist the pooled (feature index, weight) pairs for the LGBM models.
pd.DataFrame(explanations_lgbm).to_csv('explanations_caco_maccs_lgbm.csv')

In [150]:
# Mean LIME weight per feature for LGBM: sum each feature's weights over all
# explained samples, then divide by the number of samples.
scores_lgbm = np.zeros(X.shape[1])  # one slot per feature column, not a hard-coded 166
for feature_idx, weight in explanations_lgbm:
    scores_lgbm[feature_idx] += weight
scores_lgbm /= len(X)

In [151]:
# Feature indices ordered from largest to smallest absolute mean weight.
top_f_lgbm = np.flip(np.argsort(np.abs(scores_lgbm)))
top_f_lgbm

array([ 67,  44,  33,  58,  51,  42,  36, 132, 130,  77,  22,  69, 125,
       113, 154, 128,  20,  83,  46,  70, 129,  71,  10,  49, 118,  89,
        52,  24,  23, 138, 112, 108,  75,  63,  25, 114, 101,  41, 145,
        37, 127,  45, 156,  96, 115,  56,  35, 121, 135,  14,  95,  88,
        94,  74,  73, 122, 102,  15,  31,  78, 139,   7,  61, 133, 137,
       104,  84,  32,  86, 117, 140,  82,  57,  97,  27,  80,  18,  59,
        81,  53, 105, 126, 107,  87,  48,  92,  93,  50, 103,  60,  90,
        72, 142,  62,  28,  55, 106,  39,  79,  66,  54,  65,  98, 146,
        38,  64, 131, 123, 134, 124, 151,  26, 148,  85, 100, 143,  40,
        29,  47,  12,  68,  76,  16, 111,   4,  19,   1,   2,   3,  17,
         5,   6,  13,   8,   9,  11,  21, 165,  30,  34, 163, 162, 161,
       160, 159, 158, 157, 155, 153, 152, 150, 149, 147, 144, 141, 136,
       120, 119, 116, 110, 109,  99,  91, 164,  43,   0])

In [152]:
# Signed mean weights rearranged to follow the ranking in top_f_lgbm.
top_scores_lgbm = list(scores_lgbm[top_f_lgbm])
top_scores_lgbm

[-8.46015459393616,
 -2.8887903984265946,
 -2.7876433931078415,
 2.335614166800177,
 -2.2064506786449622,
 2.156197335916426,
 -1.8191722202389589,
 -1.7946409319896064,
 -1.6945615205755118,
 1.397518125780343,
 1.3972409713830436,
 -1.331053857146559,
 -1.2958035258404488,
 -1.2690350601747458,
 -1.23195558150398,
 -1.1910363265558515,
 1.171027013637234,
 1.1591522574559794,
 -1.1423857772920152,
 1.0332445709638447,
 0.9640595212932974,
 0.9615789614870025,
 -0.945748013430341,
 -0.930173414922441,
 -0.9146727002137925,
 -0.8581839864331502,
 0.8452108716362317,
 -0.8435050977276746,
 0.7938843820915427,
 0.7554965062895885,
 0.6858844575503265,
 -0.6582025720759817,
 -0.6574742587402628,
 0.6403723434838197,
 -0.6328747097927306,
 -0.5431443178317928,
 0.5272965400934762,
 -0.5034303703318859,
 -0.498358677452105,
 0.4946546437270913,
 0.437808398824318,
 -0.4137313112586508,
 -0.4015834374192522,
 -0.3841047133544693,
 0.37078765606797387,
 -0.3650508706036536,
 -0.33781933806496

In [153]:
# Show the ranked LGBM features together with their SMARTS and descriptions.
ExpToDf(top_f_lgbm,top_scores_lgbm)

Key,Influence,Influence sign,Smarts,Description
68,-8.460155,negative,[!#6;!#1;!H0]~[!#6;!#1;!H0],QHQH (&...)
45,-2.88879,negative,[#6]=[#6]~[#7],C=CN
34,-2.787643,negative,[CH2]=*,CH2=A
59,2.335614,positive,[#16]!:*:*,Snot%A%A
52,-2.206451,negative,[#7]~[#7],NN
43,2.156197,positive,[!#6;!#1;!H0]~*~[!#6;!#1;!H0],QHAQH
37,-1.819172,negative,[#7]~[#6](~[#8])~[#7],NC(O)N
133,-1.794641,negative,*@*!@[#7],A$\$$A!N
131,-1.694562,negative,[!#6;!#1;!H0],QH > 1
78,1.397518,positive,[#6]=[#7],C=N


## SVR

In [157]:
# Re-create the seeded 5-fold split and explain every held-out sample with the
# SVR model saved for that fold.
# NOTE(review): presumably these are the folds the pickled models were trained on - confirm.
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
explanations = []
for fold_1, (train_idx, test_idx) in enumerate(first_splits.split(np.arange(len(X)))):
    train_x = X[train_idx]
    test_x = X[test_idx]
    # Targets are not passed to LIME; kept so the fold's labels stay available.
    train_y = y[train_idx]
    test_y = y[test_idx]
    # Context manager closes the model file (a bare open() leaked the handle).
    # NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
    with open(f'models/caco_maccs_svr_{fold_1+1}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    # extend() grows the list in place instead of rebuilding it on every fold.
    explanations.extend(get_explanations(train_x, test_x, model))

# Keep the SVR results under their own name; `explanations` is reused by later sections.
explanations_svr = explanations

In [158]:
# Persist the pooled (feature index, weight) pairs for the SVR models.
pd.DataFrame(explanations_svr).to_csv('explanations_caco_maccs_svr.csv')

In [159]:
# Mean LIME weight per feature for SVR: sum each feature's weights over all
# explained samples, then divide by the number of samples.
scores_svr = np.zeros(X.shape[1])  # one slot per feature column, not a hard-coded 166
for feature_idx, weight in explanations_svr:
    scores_svr[feature_idx] += weight
scores_svr /= len(X)

In [160]:
# Feature indices ordered from largest to smallest absolute mean weight.
top_f_svr = np.flip(np.argsort(np.abs(scores_svr)))
top_f_svr

array([ 67,  28, 130,  70,  42,  51,  77,  58, 113,  23,  10,  46,  83,
        52,  47,  63,  45, 121,  27,  69, 122,  41,  54,  36,   7,  16,
       129,  89, 125, 154, 112,  44, 138, 132,  49,  56,  15,  35,  33,
        53,  94,  14, 108, 137, 128,  57, 127,  59,  97,  72, 114, 100,
       145,  39,  88,  95,  32, 143, 146,  84,  29,  71, 124,  24,  31,
        55,  62, 135,  22,  79,  65,  78, 115,  73,  25, 103, 102, 139,
        40,  75, 104,  93, 156, 117, 123,  90,  26,  12,  92,  38,  98,
        68, 118,  96,  74,  76, 142, 126, 107,  48, 101, 106,  61,  20,
       148,  66,  87,  80,  81, 140,  60,  37, 133, 151,  64,  82, 134,
       105,  86,  18, 131,  85,  50, 111,  11,   8,  19,   6,  13,   5,
         4,  21,   3,   2,  17,   1,   9, 165,  30,  34, 163, 162, 161,
       160, 159, 158, 157, 155, 153, 152, 150, 149, 147, 144, 141, 136,
       120, 119, 116, 110, 109,  99,  91, 164,  43,   0])

In [161]:
# Signed mean weights rearranged to follow the ranking in top_f_svr.
top_scores_svr = list(scores_svr[top_f_svr])
top_scores_svr

[-2.207108484100218,
 -2.0271772148581078,
 -1.7530608835148178,
 1.6842925386575485,
 1.6016696366775807,
 -1.5604469494209445,
 1.4971339629693128,
 1.4235419037049772,
 -1.3159712286953094,
 1.1819231454008774,
 -1.1017077976174365,
 -1.093887373853639,
 1.0859163138626724,
 1.0579351776769634,
 -0.999515005115362,
 0.964194332952931,
 -0.9066120486259894,
 -0.8962127516853685,
 -0.8722730259980918,
 -0.825187890180051,
 0.8157137954067587,
 -0.7988297309089282,
 0.7547471177866223,
 -0.7355953808228122,
 -0.7167745072142513,
 0.662068508795294,
 0.6449964448989084,
 -0.634776536460675,
 -0.6341945356601612,
 -0.5945412696228551,
 0.5917396723206835,
 -0.5781196071682593,
 0.5733425490312697,
 -0.5710950264287937,
 0.5577923082082037,
 -0.5417798375907711,
 0.5353866520150915,
 -0.5147635181492767,
 -0.4868636020558718,
 0.46052139676209525,
 0.4420171206712084,
 -0.424803445867705,
 -0.4200377785318793,
 -0.3813488599308219,
 -0.3798466425071826,
 -0.3631466229568537,
 0.3630721642

In [163]:
# Show the ranked SVR features together with their SMARTS and descriptions.
ExpToDf(top_f_svr,top_scores_svr)

Key,Influence,Influence sign,Smarts,Description
68,-2.207108,negative,[!#6;!#1;!H0]~[!#6;!#1;!H0],QHQH (&...)
29,-2.027177,negative,[#15],P
131,-1.753061,negative,[!#6;!#1;!H0],QH > 1
71,1.684293,positive,[#7]~[#8],NO
43,1.60167,positive,[!#6;!#1;!H0]~*~[!#6;!#1;!H0],QHAQH
52,-1.560447,negative,[#7]~[#7],NN
78,1.497134,positive,[#6]=[#7],C=N
59,1.423542,positive,[#16]!:*:*,Snot%A%A
114,-1.315971,negative,[CH3]~[CH2]~*,CH3CH2A
24,1.181923,positive,[#7]-[#8],N-O


## XGBoost

In [164]:
# Re-create the seeded 5-fold split and explain every held-out sample with the
# XGBoost model saved for that fold.
# NOTE(review): presumably these are the folds the pickled models were trained on - confirm.
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
explanations = []
for fold_1, (train_idx, test_idx) in enumerate(first_splits.split(np.arange(len(X)))):
    train_x = X[train_idx]
    test_x = X[test_idx]
    # Targets are not passed to LIME; kept so the fold's labels stay available.
    train_y = y[train_idx]
    test_y = y[test_idx]
    # Context manager closes the model file (a bare open() leaked the handle).
    # NOTE(review): pickle.load can execute arbitrary code - load trusted files only.
    with open(f'models/caco_maccs_xgboost_{fold_1+1}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    # extend() grows the list in place instead of rebuilding it on every fold.
    explanations.extend(get_explanations(train_x, test_x, model))

# Keep the XGBoost results under their own name.
explanations_xgboost = explanations

In [165]:
# Persist the pooled (feature index, weight) pairs for the XGBoost models.
pd.DataFrame(explanations_xgboost).to_csv('explanations_caco_maccs_xgboost.csv')

In [166]:
# Mean LIME weight per feature for XGBoost: sum each feature's weights over
# all explained samples, then divide by the number of samples.
scores_xgboost = np.zeros(X.shape[1])  # one slot per feature column, not a hard-coded 166
for feature_idx, weight in explanations_xgboost:
    scores_xgboost[feature_idx] += weight
scores_xgboost /= len(X)

In [167]:
# Feature indices ordered from largest to smallest absolute mean weight.
top_f_xgboost = np.flip(np.argsort(np.abs(scores_xgboost)))
top_f_xgboost

array([ 67,  77,  36,  42,  46,  51, 130, 132, 154, 118,  58,  89, 114,
       125,  22, 122,  70,  35, 128,  83,  44, 113,  24, 129,  94,  52,
        71,  41, 138, 145, 156,  78, 108, 112,  15,  90, 102, 117,  56,
        74, 146,  37,  98, 135, 101,  97, 127, 124,  79,  95, 133,  96,
        86, 107,  68,  23,  63, 115,  69,  48,  49, 139,  57,  84,  31,
        81,  82,  73,  10, 134,  88, 106, 104,  87,  92,  59,  61,  75,
       126,  38,  40, 140,  29,  60,  50, 121,  72,  64, 142, 105, 137,
        93,   7,  80,  53, 151,  32, 131, 103,  25,  26,  54,  55,  76,
        27, 143,  65, 123,  12,  28,  47, 111,  45,  62, 100,  18,  14,
       148,  85,  16,  39,  20,  33,  66,  17,   6,   5,   8,  13,  19,
         4,   3,   2,   9,   1,  21,  11, 165,  30,  34, 163, 162, 161,
       160, 159, 158, 157, 155, 153, 152, 150, 149, 147, 144, 141, 136,
       120, 119, 116, 110, 109,  99,  91, 164,  43,   0])

In [168]:
# Signed mean weights rearranged to follow the ranking in top_f_xgboost.
top_scores_xgboost = list(scores_xgboost[top_f_xgboost])
top_scores_xgboost

[-9.46518620301002,
 6.07953036316362,
 -4.90746727360544,
 4.224333110020428,
 -3.883089916564899,
 -3.639171207775545,
 -3.408290841204925,
 -2.8006331179993884,
 -2.633928751422297,
 -2.4516939609757524,
 2.392287027840436,
 -2.025329318865311,
 -1.9034890906605284,
 -1.7020712119252925,
 1.6420505776719976,
 1.5630612596456996,
 1.4508198565234027,
 -1.4070241765526537,
 -1.393104453941095,
 1.2925792076946336,
 -1.255820616865387,
 -1.243806344922181,
 -1.235067808880989,
 1.1620699466475521,
 1.1315438982883719,
 1.0544666027952394,
 1.0169851487812869,
 -1.0087319515262667,
 0.99683466217316,
 -0.974158795963582,
 -0.9430329089740823,
 -0.8805854984811764,
 -0.8644478355166372,
 0.8271912225795466,
 0.8130150239470926,
 0.6964298540297436,
 0.690671312461755,
 -0.6669719193622231,
 -0.6554620641509789,
 0.6352514824311958,
 0.6295047972387436,
 0.5882848360347548,
 -0.571964299130814,
 -0.5525815744400346,
 0.5414064323765606,
 0.5199429877940505,
 0.5149652142766302,
 0.4944535

In [169]:
# Show the ranked XGBoost features together with their SMARTS and descriptions.
ExpToDf(top_f_xgboost,top_scores_xgboost)

Key,Influence,Influence sign,Smarts,Description
68,-9.465186,negative,[!#6;!#1;!H0]~[!#6;!#1;!H0],QHQH (&...)
78,6.07953,positive,[#6]=[#7],C=N
37,-4.907467,negative,[#7]~[#6](~[#8])~[#7],NC(O)N
43,4.224333,positive,[!#6;!#1;!H0]~*~[!#6;!#1;!H0],QHAQH
47,-3.88309,negative,[#16]~*~[#7],SAN
52,-3.639171,negative,[#7]~[#7],NN
131,-3.408291,negative,[!#6;!#1;!H0],QH > 1
133,-2.800633,negative,*@*!@[#7],A$\$$A!N
155,-2.633929,negative,*!@[CH2]!@*,A!CH2!A
119,-2.451694,negative,[#7]=*,N=A
