In [1]:
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from mol2vec.features import mol2alt_sentence, sentences2vec
from gensim.models import word2vec
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed



In [2]:
def load_split(csv_path, smi_col, label_col, splits, seed=None):
    """Load a labelled SMILES table and split it randomly into train/valid/test.

    Rows with a missing label, and rows whose SMILES string cannot be parsed
    (``MolFromSmiles`` returns ``None``), are dropped before the split so that
    every returned molecule is usable and stays aligned with its label.

    Args:
        csv_path: Path of the CSV file to read.
        smi_col: Name of the column holding the SMILES strings.
        label_col: Name of the column holding the labels.
        splits: Fractions for (train, valid, test); they must sum to 1.
        seed: Optional seed for the shuffle. With the default ``None`` the
            permutation is drawn from NumPy's global random state.

    Returns:
        ``(mols_train, labels_train), (mols_valid, labels_valid),
        (mols_test, labels_test)``; every element is a NumPy array.

    Raises:
        AssertionError: If ``splits`` does not sum to 1.
    """
    # Check the cheap argument first so a bad call fails before any parsing.
    assert np.isclose(np.sum(splits), 1.0)

    df = pd.read_csv(csv_path)
    parsed = [AllChem.MolFromSmiles(smi) for smi in df[smi_col]]
    # dtype=object keeps one molecule per slot, also for an empty table.
    mols = np.array(parsed, dtype=object)
    labels = df[label_col].to_numpy()

    # notna() also works for non-float label columns, where np.isnan raises.
    has_label = df[label_col].notna().to_numpy()
    has_mol = np.array([mol is not None for mol in parsed], dtype=bool)
    keep = has_label & has_mol
    mols, labels = mols[keep], labels[keep]

    num_data = len(mols)
    if seed is None:
        index = np.random.permutation(num_data)
    else:
        # A private generator leaves the global random state untouched.
        index = np.random.RandomState(seed).permutation(num_data)
    mols, labels = mols[index], labels[index]

    # Boundaries are truncated towards zero; the test split takes the rest.
    part1 = int(num_data * splits[0])
    part2 = int(num_data * (splits[0] + splits[1]))
    mols_train, labels_train = mols[:part1], labels[:part1]
    mols_valid, labels_valid = mols[part1:part2], labels[part1:part2]
    mols_test, labels_test = mols[part2:], labels[part2:]
    return (mols_train, labels_train), (mols_valid, labels_valid), (mols_test, labels_test)


In [3]:
# Pretrained 300-dimensional mol2vec model, fetched once and reused afterwards.
MODEL_PATH = Path('model_300dim.pkl')
MODEL_URL = 'https://github.com/samoturk/mol2vec/raw/master/examples/models/model_300dim.pkl'
if not MODEL_PATH.is_file():
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file that the is_file()
    # guard would accept on the next run.
    partial_path = MODEL_PATH.with_name(MODEL_PATH.name + '.part')
    urllib.request.urlretrieve(MODEL_URL, str(partial_path))
    partial_path.replace(MODEL_PATH)

# Label columns of tox21.csv that are evaluated, one classifier per column.
choices = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
# NOTE(security): Word2Vec.load unpickles the file, which can execute arbitrary
# code; only load a model obtained from a source you trust.
model = word2vec.Word2Vec.load(str(MODEL_PATH))
def train_eval(colname, unseen='UNK'):
    """Train an MLP on one tox21 label column and return its test ROC AUC.

    The data are split 80/10/10 by ``load_split``; molecules are turned into
    vectors with mol2vec using the module-level ``model``; the train and valid
    parts are merged for fitting and the test part is used for scoring.

    Args:
        colname: Label column of ``./tox21.csv`` to predict.
        unseen: Passed to ``sentences2vec`` as its ``unseen`` argument.

    Returns:
        ROC AUC of the predicted positive-class probability on the test part.
    """
    def embed(molecules):
        # One mol2vec "sentence" per molecule (second argument 1), then vectors.
        sentences = [mol2alt_sentence(molecule, 1) for molecule in molecules]
        return sentences2vec(sentences, model, unseen=unseen)

    (train_mols, train_Y), (valid_mols, valid_Y), (test_mols, test_Y) = load_split(
        './tox21.csv', 'smiles', colname, [0.8, 0.1, 0.1]
    )
    train_X = embed(train_mols)
    valid_X = embed(valid_mols)
    test_X = embed(test_mols)

    # Train and valid are fitted together; with validation_fraction=1/9 of this
    # merged 90% the classifier's early-stopping hold-out is 10% of all data.
    fit_X = np.concatenate([train_X, valid_X])
    fit_Y = np.concatenate([train_Y, valid_Y])

    clf = MLPClassifier(learning_rate='adaptive', early_stopping=True, validation_fraction=1/9)
    clf.fit(fit_X, fit_Y)
    positive_proba = clf.predict_proba(test_X)[:, 1]  # second column of predict_proba
    return roc_auc_score(test_Y, positive_proba)

In [4]:
# For every label column: ten independent split/train/score runs in parallel,
# then one summary line (mean and std of the AUCs) and one line of raw AUCs.
for col in choices:
    runs = (delayed(train_eval)(col) for _ in range(10))
    result = Parallel(n_jobs=10)(runs)
    summary = [col, f"mean: {np.mean(result):.4f}", f"std: {np.std(result):.4f}"]
    print('\t'.join(summary))
    # Each score is followed by a tab, including the last one.
    print("".join(f"{auc:.4f}\t" for auc in result))

NR-AR	mean: 0.7288	std: 0.0449
0.7272	0.7766	0.6775	0.7138	0.6723	0.7054	0.7060	0.7092	0.7876	0.8122	
NR-AR-LBD	mean: 0.8073	std: 0.0507
0.8813	0.7211	0.8311	0.7475	0.7692	0.7610	0.8306	0.8414	0.8619	0.8280	
NR-AhR	mean: 0.8640	std: 0.0228
0.8717	0.8409	0.8739	0.8681	0.8121	0.8698	0.8832	0.8616	0.9006	0.8576	
NR-Aromatase	mean: 0.7058	std: 0.1119
0.6693	0.7929	0.7270	0.7843	0.6397	0.8557	0.7859	0.7033	0.4320	0.6678	
NR-ER	mean: 0.6865	std: 0.0274
0.6598	0.6931	0.7230	0.6858	0.6255	0.6919	0.6712	0.7174	0.6891	0.7082	
NR-ER-LBD	mean: 0.7266	std: 0.0477
0.6910	0.6582	0.7049	0.7132	0.8131	0.7945	0.7726	0.7273	0.6965	0.6944	
NR-PPAR-gamma	mean: 0.6370	std: 0.1485
0.7292	0.7171	0.4836	0.4501	0.7975	0.7956	0.4041	0.6869	0.5134	0.7921	
SR-ARE	mean: 0.7759	std: 0.0394
0.8000	0.7737	0.7128	0.7662	0.7552	0.7587	0.8052	0.7227	0.8270	0.8381	
SR-ATAD5	mean: 0.7571	std: 0.0687
0.7251	0.8340	0.8036	0.8479	0.8249	0.7218	0.7801	0.6993	0.6227	0.7115	
SR-HSE	mean: 0.6617	std: 0.0684
0.5948	0.6948	0.7431	0