In [1]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
import numpy as np

from rdkit.Chem import MACCSkeys
import pickle
from exmol_our import exmol
from functools import partial
import preprocess

from sklearn.ensemble import RandomForestRegressor
import pickle
import pandas as pd
from sklearn.model_selection import KFold
import random

import warnings
warnings.filterwarnings('ignore')
import json
import pickle
import scipy.cluster.hierarchy as hc

from rdkit import Chem  
from rdkit.Chem import MolFromSmiles as smi2mol 

from rdkit.Chem import AllChem  
from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity, TanimotoSimilarity 
from rdkit.Chem import Mol  
from functions import from_smiles

import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
data_type = 'caco'
feature_type = 'klek'
model_type = 'svr'

## Load fingerprint data

In [3]:
data_path = f'../../data/processed/{data_type}_{feature_type}_all.csv'
df = pd.read_csv(data_path)

# 2. Split data to X and y 
X_fp = df.loc[:, df.columns != df.columns[0]]
y_fp = df[df.columns[0]]

X = df.loc[:, df.columns != df.columns[0]]
idxs = preprocess.remove_low_variance(X, threshold=0.01)
idxs_proper = [int(i.replace('KRFP', ''))-1 for i in idxs]

X_fp = X_fp[idxs.tolist()]

X_fp = np.array(X_fp)
y_fp = np.array(y_fp)

## Get predictions

In [4]:
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
predictions = np.zeros((len(X_fp)))
for fold_1, (train_idx,test_idx) in enumerate(first_splits.split(np.arange(len(X_fp)))):
    train_x = X_fp[train_idx]
    test_x = X_fp[test_idx]
    model = pickle.load(open(f'../../models/{data_type}_{feature_type}_{model_type}_{fold_1+1}.pkl', 'rb'))
    predictions[test_idx] = model.predict(test_x)

## Load smiles data

In [5]:
data_path = f'../../data/processed/{data_type}_smiles_all.csv'
df = pd.read_csv(data_path)
df['predictions'] = predictions
df = df[df['predictions']<=10] # get inactive examples

# 2. Split data to X and y 
X_sm = df[['smiles']]
y_sm = df[['permeability']]

In [6]:
len(X_sm)

1603

## Clustering

In [7]:
ECFP4 = [AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smiles[0]), 2) for smiles in X_sm.values]
M = np.array([BulkTanimotoSimilarity(f, ECFP4) for f in ECFP4])
M2 = 1 - M
dist_df = pd.DataFrame(M2, index = X_sm.values, columns= X_sm.values)
clustered = hc.linkage(M2, method='complete')

In [8]:
from scipy.cluster.hierarchy import cut_tree
n_clusters = 20
clusters = cut_tree(clustered, n_clusters = n_clusters).T
df_clustered = pd.DataFrame({'Column1': list(X_sm.values), 'Column2': clusters[0]})

In [9]:
from collections import Counter
c = Counter(list(df_clustered['Column2']))
sorted(c.items(), key=lambda i: i[1])

[(17, 16),
 (16, 21),
 (3, 27),
 (19, 28),
 (2, 34),
 (12, 34),
 (14, 36),
 (7, 39),
 (9, 45),
 (18, 47),
 (10, 59),
 (13, 61),
 (15, 67),
 (5, 71),
 (8, 104),
 (0, 126),
 (11, 130),
 (6, 145),
 (4, 186),
 (1, 327)]

## Get samples

In [None]:
random.seed(42)
np.random.seed(42)
n_clusters = 100
x = 1000/len(X_sm.values)
samples = []
for i in range(n_clusters):
    C_i = np.where(df_clustered.Column2 == i)[0].tolist() 
    n_i = len(C_i)
    sample_i = random.sample(C_i, round(x * n_i)) 
    samples += list(sample_i)

## Exmol

In [11]:
def GetKlekFp(smiles):
    fp = from_smiles(smiles, fingerprints=True, descriptors=False, fingerprint_type='klek').values()
    return np.array(list(fp), dtype=np.int8)[np.array(idxs_proper)]

In [12]:
def model_predict(smiles, model):
    fingerprint = GetKlekFp(smiles)
    prediction  = model.predict(fingerprint.reshape(1,-1))
    return prediction

In [13]:
def get_changed_idxs(cfs):
    orignal_smiles = cfs[0].smiles
    orignal_fs = GetKlekFp(orignal_smiles)
    counterfc_fp = [GetKlekFp(counterfc.smiles) for counterfc in cfs[1:] if counterfc.label.startswith('Increase')]
    return [(np.where((orignal_fs-cfp)==1)[0], np.where((orignal_fs-cfp)==-1)[0]) for cfp in counterfc_fp] # (-,+)

In [14]:
def get_cfs(examples, model, thd):
    changed_indx_list = []
    for exp in examples.values:
        space = exmol.sample_space(exp[0], partial(model_predict, model=model), preset="medium", num_samples=150, batched=False)
        prediction = model_predict(exp[0], model)[0]
        if prediction<0:
            radius = round(float(thd+abs(prediction)),2)
        else:
            radius = round(float(thd-prediction),2)
        cfs = exmol.rcf_explain(space, radius, nmols=4)
        changed_idxs = get_changed_idxs(cfs)
        changed_indx_list += changed_idxs
    return changed_indx_list 

In [15]:
first_splits = KFold(n_splits=5, shuffle=True, random_state=42)
changed_indx_list = []
for fold_1, (train_idx,test_idx) in enumerate(first_splits.split(np.arange(len(X_sm)))):
    test_idx = np.array(list(set(test_idx) & set(samples)))
    test_x = X_sm.iloc[test_idx]
    model = pickle.load(open(f'../../models/{data_type}_{feature_type}_{model_type}_{fold_1+1}.pkl', 'rb'))
    changed_indx_list += get_cfs(test_x, model, 10.0)

🤘Done🤘: 100%|██████████| 43.0/43 [00:00<00:00, 997.84it/s]                       
🤘Done🤘: 100%|██████████| 48.0/48 [00:00<00:00, 975.69it/s]                       
🤘Done🤘: 100%|██████████| 52.0/52 [00:00<00:00, 1063.13it/s]                      
🤘Done🤘: 100%|██████████| 38.0/38 [00:00<00:00, 1082.31it/s]                      
🤘Done🤘: 100%|██████████| 41.0/41 [00:00<00:00, 1112.23it/s]                      
🤘Done🤘: 100%|██████████| 53.0/53 [00:00<00:00, 853.33it/s]                       
🤘Done🤘: 100%|██████████| 48.0/48 [00:00<00:00, 1094.12it/s]                      
🤘Done🤘: 100%|██████████| 56.0/56 [00:00<00:00, 1070.00it/s]                      
🤘Done🤘: 100%|██████████| 37.0/37 [00:00<00:00, 1083.10it/s]                      
🤘Done🤘: 100%|██████████| 53.0/53 [00:00<00:00, 594.01it/s]                       
🤘Done🤘: 100%|██████████| 69.0/69 [00:00<00:00, 1112.99it/s]                      
🤘Done🤘: 100%|██████████| 29.0/29 [00:00<00:00, 1128.21it/s]                      
🤘Done🤘: 100%|███

In [16]:
len(changed_indx_list)

117

#### Mean removed fingerprints

In [17]:
np.mean([len(list(c[0])) for c in changed_indx_list])

9.205128205128204

#### Mean added fingerprints

In [18]:
np.mean([len(list(c[1])) for c in changed_indx_list])

5.863247863247863

#### Most frequent removed fingerprints

In [19]:
with open('../../klek_keys_dict.pickle', 'rb') as fp:
    Klek_keys_dict = pickle.load(fp)

In [20]:
def map_to_smarts(df):
    df_mapped = df + 1
    df_mapped = df_mapped.applymap(lambda x: Klek_keys_dict[x])
    return df_mapped

In [21]:
def map_to_proper(df):
    df_mapped = df.applymap(lambda x: idxs_proper[x])
    return df_mapped

In [22]:
from collections import Counter
counts = Counter([item for sublist in [list(c[0]) for c in changed_indx_list] for item in sublist])
print(len(counts))
sorted_counts = sorted(counts.items(), key=lambda i: i[1], reverse=True)[:10]
df_sorted_counts = pd.DataFrame({'Smarts': [x[0] for x in sorted_counts], 'Counts': [x[1] for x in sorted_counts]})
df_sorted_counts = map_to_proper(df_sorted_counts)
df_sorted_counts

277


Unnamed: 0,Smarts,Counts
0,604,135
1,3914,132
2,3156,125
3,222,125
4,3660,125
5,4293,125
6,3401,101
7,3740,92
8,3781,91
9,838,72


In [23]:
df_sorted_counts['Smarts'] = map_to_smarts(df_sorted_counts[['Smarts']])
df_sorted_counts.to_csv(f'files/{data_type}/{data_type}_{feature_type}_{model_type}_most_frequent_removed_fps.csv', index=False)
df_sorted_counts

Unnamed: 0,Smarts,Counts
0,[!#1][CH2]N([!#1])[CH2][!#1],135
1,CN1CCCCC1,132
2,C1CCNCC1,125
3,[!#1][CH]1[CH2][CH2][CH2]N([!#1])[CH2]1,125
4,CCC(C)N,125
5,NCCN,125
6,CC(C)N,101
7,CCCN(C)C,92
8,CCNCC,91
9,[!#1][NH]C(=O)[!#1],72


#### Most frequent added fingerprints

In [24]:
counts = Counter([item for sublist in [list(c[1]) for c in changed_indx_list] for item in sublist])
print(len(counts))
sorted_counts = sorted(counts.items(), key=lambda i: i[1], reverse=True)[:10]
df_sorted_counts = pd.DataFrame({'Smarts': [x[0] for x in sorted_counts], 'Counts': [x[1] for x in sorted_counts]})
df_sorted_counts = map_to_proper(df_sorted_counts)
df_sorted_counts

241


Unnamed: 0,Smarts,Counts
0,2985,92
1,3057,72
2,3439,72
3,3442,66
4,3906,44
5,3368,32
6,676,32
7,125,24
8,3395,24
9,3767,24


In [25]:
df_sorted_counts['Smarts'] = map_to_smarts(df_sorted_counts[['Smarts']])
df_sorted_counts.to_csv(f'files/{data_type}/{data_type}_{feature_type}_{model_type}_most_frequent_added_fps.csv', index=False)
df_sorted_counts

Unnamed: 0,Smarts,Counts
0,C=C,92
1,C1CC1,72
2,CC=C,72
3,CC=N,66
4,CN=C,44
5,CC(C)C,32
6,[!#1][NH][!#1],32
7,[!#1][CH]=[CH][!#1],24
8,CC(C)CN,24
9,CCN=C,24
