In [0]:
#coding=utf-8
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, model_selection, neighbors, svm

# Fingerprint name; it is also the stem of the CSV file read below and of the
# pickle files written later in the notebook.
FINGERPRINT = "FCFP4"

# Load "<FINGERPRINT>.csv" and discard its "row ID" column in one step.
csv_path = "{0}.csv".format(FINGERPRINT)
df = pd.read_csv(csv_path).drop(columns="row ID")


In [0]:
import multiprocessing as mp
# Report how many CPUs the multiprocessing module detects.
cpu_total = mp.cpu_count()
print('CPU numbers:', cpu_total)
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel, one chunk of consecutive rows per task.

    The frame is split row-wise into chunks, ``chunk.apply(func, **kwargs)``
    is evaluated for each chunk in a worker process, and the per-chunk results
    are concatenated in the original row order.

    Because every chunk is processed independently, the result matches
    ``df.apply(func, **kwargs)`` only for operations that work row by row
    (e.g. ``axis=1``); a column-wise ``func`` sees one chunk at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Function handed to ``DataFrame.apply``. It is sent to the worker
        processes, so it must be picklable (a module-level function, not a
        lambda).
    **kwargs
        ``workers`` (int, optional) is the number of worker processes and
        defaults to ``mp.cpu_count()``. Every other keyword argument is
        forwarded unchanged to ``DataFrame.apply``.

    Returns
    -------
    The ``pd.concat`` of the per-chunk ``apply`` results.

    Raises
    ------
    ValueError
        If ``workers`` is smaller than 1.
    """
    # Default to one process per CPU instead of failing with KeyError when the
    # caller does not pass ``workers``.
    workers = kwargs.pop('workers', mp.cpu_count())
    if workers < 1:
        raise ValueError("workers must be a positive integer")

    # Never create more chunks than rows: applying ``func`` to an empty chunk
    # can return an object of a different shape and corrupt the concatenation.
    n_chunks = max(1, min(workers, len(df)))
    # Split row positions rather than the frame itself; np.array_split on a
    # DataFrame goes through a deprecated pandas code path.
    row_groups = np.array_split(np.arange(len(df)), n_chunks)
    tasks = [(df.iloc[rows], func, kwargs) for rows in row_groups]

    pool = mp.Pool(processes=workers)
    try:
        result = pool.map(_apply_df, tasks)
    finally:
        # Always release the worker processes, even when ``func`` raises.
        pool.close()
        pool.join()
    return pd.concat(list(result))

CPU numbers: 4


In [0]:
# Build a class-balanced subset for the acylation label: keep every positive
# row and draw an equally sized random sample of the negative rows.
target_col = "C-C Bond Formation (Acylation)"
df1 = df[df[target_col] == 1]
df0 = df[df[target_col] == 0].sample(n=df1.shape[0], random_state=1)
df_equal = pd.concat([df0, df1])
print(df1.shape, df0.shape, df_equal.shape)

# Shuffle so positives and negatives are interleaved. The shuffle is seeded
# like the sampling above; otherwise the row order (and every positional
# slice taken later) would change on each run.
df_equal = shuffle(df_equal, random_state=1)

(271, 1061) (271, 1061) (542, 1061)


In [0]:
# Columns from 'bitvector0' onward hold the fingerprint bits (features);
# every column before it is a class label (targets).
first_bit = list(df_equal.columns).index('bitvector0')
X = df_equal.iloc[:, first_bit:]  # fingerprint vectors
y = df_equal.iloc[:, :first_bit]  # all the classes as targets

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every score reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(433, 1024) (433, 37)
(109, 1024) (109, 37)


In [0]:
import pickle

# Persist the feature matrix and the label table to "<FINGERPRINT>_X.pickle"
# and "<FINGERPRINT>_y.pickle". The ``with`` blocks guarantee that each file
# is flushed and closed even if pickle.dump raises.
with open("{0}_X.pickle".format(FINGERPRINT), "wb") as pickle_out:
    pickle.dump(X, pickle_out)

with open("{0}_y.pickle".format(FINGERPRINT), "wb") as pickle_out:
    pickle.dump(y, pickle_out)

In [0]:
import tensorflow as tf

# Feed-forward classifier for the acylation label: a 1024-wide input, two
# 64-unit ReLU layers each followed by 20% dropout, and a 2-way softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[1024]),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])

# Open question from the author: how does this loss differ from
# binary_crossentropy?
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs on the acylation column only.
acylation_train = y_train["C-C Bond Formation (Acylation)"]
model.fit(X_train, acylation_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21903b73d30>

In [0]:
# Score the trained network on the held-out split for the acylation label.
acylation_test = y_test["C-C Bond Formation (Acylation)"]
val_loss, val_acc = model.evaluate(X_test, acylation_test)
print("Validation Loss:", val_loss, "Validation Accuracy:", val_acc)

Validation Loss: 0.5368005759125456 Validation Accuracy: 0.82568806


In [0]:
# Spot-check: class probabilities predicted for three rows (positions 10-12),
# followed by the true acylation labels of the same rows.
sample_rows = slice(10, 13)
predictions = model.predict(X.iloc[sample_rows])
print(predictions)
y["C-C Bond Formation (Acylation)"].iloc[sample_rows]

[[0.00369966 0.99630034]
 [0.13246402 0.867536  ]
 [0.9983387  0.00166126]]


20909    1
8547     1
11437    0
Name: C-C Bond Formation (Acylation), dtype: int64

In [0]:
# k-nearest-neighbours baseline (k=4) for the acylation label.
# Alternative left by the author:
#   RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
acylation = 'C-C Bond Formation (Acylation)'
clf1 = neighbors.KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train[acylation])

# score() is evaluated on the held-out split.
confidence = clf1.score(X_test, y_test[acylation])
print(confidence)

0.7522935779816514


In [0]:
# Spot-check the k-NN model: true acylation labels of rows at positions 10-12,
# then the labels predicted for the same rows.
print(df_equal['C-C Bond Formation (Acylation)'].iloc[10:13])
p = clf1.predict(X.iloc[10:13])
print(p)

20909    1
8547     1
11437    0
Name: C-C Bond Formation (Acylation), dtype: int64
[1 0 1]


In [0]:
# WARNING: the scores from this loop are not meaningful. The dataset above was
# balanced (equal sampling) for the first class only, so the data for every
# later class is highly imbalanced.
for i in list(y.columns):
    print(i)
    # SVC.fit raises ValueError when the training labels contain a single
    # class, which can happen for the rare labels in this subset. Skip those
    # columns instead of aborting the whole loop.
    if y_train[i].nunique() < 2:
        print(i, "skipped: only one class present in the training split")
        continue
    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, y_train[i])
    confidence = clf.score(X_test, y_test[i])
    print(i,confidence)

In [0]:
# Fit one 3-nearest-neighbour classifier per label column, print its score on
# the held-out split, and finally print the average score over all labels.
res = []
for label in y.columns:
    clf = neighbors.KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train[label])
    confidence = clf.score(X_test, y_test[label])
    print(label, confidence)
    res.append(confidence)
print("Average:", np.average(res))

C-C Bond Formation (Acylation) 0.9912087912087912
C-C Bond Formation (Coupling) 0.9604395604395605
C-C Bond Formation (Methylation) 0.9626373626373627
C-C Bond Formation (Olefination) 0.9912087912087912
C-N Bond Formation (N-arylation) 0.9406593406593406
C-N Bond Formation (N-methylation) 0.9230769230769231
C-O Bond Formation (Etherification) 0.9472527472527472
Functional Conversion (Hydroxy to methoxy) 0.9714285714285714
Functional Introduction (Bromination) 0.9604395604395605
Functional Introduction (Hydroxylation) 0.9736263736263736
Functional Introduction (Iodination) 0.9648351648351648
Functional Introduction (Nitration) 0.9912087912087912
Synthesis (Pyrazole) 0.9912087912087912
C-C Bond Formation (Condensation) 0.989010989010989
C-N Bond Formation (Amination) 0.9648351648351648
C-N Bond Formation (Condensation) 0.9362637362637363
C-N Bond Formation (N-acetylation) 0.9714285714285714
C-N Bond Formation (N-alkylation) 0.978021978021978
C-N Bond Formation (Urea formation) 0.96703296