In [17]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,StratifiedKFold
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import random
import re
import torch
import joblib

In [18]:
# Load the processed BTK dataset and report how many rows carry each label.
fname = './data/processed/BTK_processed_ac.csv'
df = pd.read_csv(fname)

# Boolean row masks: p_mask selects rows with label 1, n_mask rows with label 0.
label_col = df['label']
p_mask = label_col.eq(1)
n_mask = label_col.eq(0)

# Print the size of each class, label 1 first.
for class_mask in (p_mask, n_mask):
    print(class_mask.sum())

69
37


In [19]:
# Seed both Python's and NumPy's global RNGs. pandas' sample() draws from
# NumPy's global state when no random_state is passed, so the draws below are
# reproducible when the notebook is run top to bottom.
seed = 10
random.seed(seed)
np.random.seed(seed)

# Balance the classes by down-sampling each one to the size of the smaller
# class (37 rows in the recorded run) instead of hard-coding that number, so
# the cell keeps working if the input file changes.
n_per_class = int(min(p_mask.sum(), n_mask.sum()))
df_mix = pd.concat([df[p_mask].sample(n_per_class), df[n_mask].sample(n_per_class)])
# Alternative kept for reference: use every row without balancing, i.e.
# pd.concat([df[p_mask], df[n_mask]]).

In [20]:
# Shuffle the rows: drawing len(df_mix) rows without replacement is the same
# operation as sample(frac=1). Re-running this cell reshuffles again.
df_mix = df_mix.sample(n=len(df_mix))

In [21]:
# Pull the SMILES strings and their labels out of the shuffled frame as arrays.
smiles_list = df_mix['smiles'].to_numpy()
y = df_mix['label'].to_numpy()

# Display the number of label-1 rows as a sanity check on the class balance.
(y == 1).sum()

37

In [22]:
# Encode every molecule as its MACCS-keys fingerprint, one row per SMILES, in
# the same order as `smiles_list` (and therefore as `y`).
x = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None; stop with the
        # offending string instead of an opaque error inside GenMACCSKeys.
        # Raising (rather than skipping) keeps x aligned with y.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    fp = MACCSkeys.GenMACCSKeys(mol)
    fp_bits = fp.ToList()
    x.append(fp_bits)
x = np.array(x)

# Display the feature-matrix shape: (number of molecules, bits per fingerprint).
x.shape

(74, 167)

In [23]:
# 3-fold stratified cross-validation, shuffled with a fixed integer seed.
sfolder = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# Materialise the (train_indices, test_indices) pairs so that a single fold
# can later be selected by position.
splits = [fold for fold in sfolder.split(x, y)]

In [24]:
# Baseline: fit an SVC on each stratified training fold and score it on the
# matching held-out fold.
# NOTE(review): with average='micro' the two scores are identical for every
# fold in the recorded output — confirm that 'micro' is the intended average.
f1_list, precision_list = [], []
for train, test in sfolder.split(x, y):
    regr = make_pipeline(SVC(probability = True))
    regr.fit(x[train], y[train])
    pred = regr.predict(x[test])

    # f1_score receives column vectors, precision_score the flat arrays.
    y_true_col = y[test].reshape(-1,1)
    pred_col = pred.reshape(-1,1)
    f1 = f1_score(y_true_col, pred_col, average='micro')
    p_score = precision_score(y[test], pred, average='micro')

    f1_list.append(f1)
    precision_list.append(p_score)
    print(f"f1: {f1}; precision: {p_score}.")

# Summarise the per-fold scores with torch tensors (mean and std).
f1_ts, precision_ts = torch.Tensor(f1_list), torch.Tensor(precision_list)

print(f"F1 mean: {f1_ts.mean()}, std: {f1_ts.std()}.")
print(f"Precision mean: {precision_ts.mean()}, std: {precision_ts.std()}.")


f1: 0.44; precision: 0.44.
f1: 0.76; precision: 0.76.
f1: 0.625; precision: 0.625.
F1 mean: 0.6083333492279053, std: 0.16064971685409546.
Precision mean: 0.6083333492279053, std: 0.16064971685409546.


In [37]:
# Refit on a single fold (position 1 in `splits`) and persist that model.
# NOTE(review): fold 1 is the best-scoring fold in the recorded CV output, so
# the score printed here is probably optimistic for the saved model — confirm
# that picking the fold this way is intended.
train, test = splits[1]

regr = make_pipeline(SVC(probability = True))
regr.fit(x[train], y[train])
pred = regr.predict(x[test])
f1 = f1_score(y[test].reshape(-1,1), pred.reshape(-1,1), average='micro')
p_score = precision_score(y[test], pred, average='micro')
# The scores are only reported. They are deliberately NOT appended to
# f1_list / precision_list: those lists hold per-fold cross-validation results
# and would otherwise receive a duplicate entry for this fold.
print(f"f1: {f1}; precision: {p_score}.")

# Serialise the fitted pipeline; joblib.dump returns the written file names,
# which the notebook displays as the cell result.
joblib.dump(regr, "SVC.m")

f1: 0.76; precision: 0.76.


['SVC.m']

# 加入额外的数据

In [25]:
# Extra data: one CSV per class, each read for its 'smiles' and 'label' columns.
df_positive = pd.read_csv('./data/positive.csv')
df_negtive = pd.read_csv('./data/negtive.csv')

# The label arrays taken from the files here are reassigned by the
# featurisation cell that follows, which builds constant 1 / 0 labels.
smiles_positive, y_positive = (df_positive[col].values for col in ('smiles', 'label'))
smiles_negtive, y_negtive = (df_negtive[col].values for col in ('smiles', 'label'))

In [26]:
def _maccs_fingerprints(smiles_iterable):
    """Return an array holding one MACCS-keys bit list per SMILES string.

    Args:
        smiles_iterable: iterable of SMILES strings, processed in order.

    Returns:
        numpy array with one row per input string, in input order.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    rows = []
    for smiles in smiles_iterable:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit returns None for an unparsable SMILES; name the string
            # instead of failing with an opaque error inside GenMACCSKeys.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        rows.append(MACCSkeys.GenMACCSKeys(mol).ToList())
    return np.array(rows)


# Positive samples: fingerprints plus a constant label of 1 per molecule
# (this replaces any label array assigned to y_positive earlier).
x_positive = _maccs_fingerprints(smiles_positive)
y_positive = np.array([1] * len(x_positive))

# Negative samples: fingerprints plus a constant label of 0 per molecule.
x_negtive = _maccs_fingerprints(smiles_negtive)
y_negtive = np.array([0] * len(x_negtive))

In [27]:
# Row-index pools from which random subsets of the extra positive and
# negative fingerprints are drawn later.
idx_positive = list(range(len(x_positive)))
idx_negtive = list(range(len(x_negtive)))

In [28]:
# Report how many extra positive and negative molecules are available.
for idx_pool in (idx_positive, idx_negtive):
    print(len(idx_pool))

1010
901


In [11]:
# Learning-curve experiment: add `sample_num` extra positives and the same
# number of extra negatives to every training fold, then record the mean and
# std of the per-fold scores for each size.
# NOTE(review): the recorded output of this cell lists 10 means while
# sample_list has 20 entries, and its execution count is out of order — the
# stored output looks stale; re-run before relying on it.
sample_list = list(range(10, 201, 10))

f1_score_mean = []
f1_score_std = []

for sample_num in sample_list:
    # Re-seed for every size so that each draw starts from the same RNG state.
    random.seed(100)

    idx_p = random.sample(idx_positive, sample_num)
    idx_n = random.sample(idx_negtive, sample_num)

    x_extra = np.concatenate([x_positive[idx_p],x_negtive[idx_n]])
    y_extra = np.concatenate([y_positive[idx_p],y_negtive[idx_n]])

    # Shuffle features and labels of the extra rows with one shared permutation.
    idx_s = list(range(len(y_extra)))
    random.shuffle(idx_s)
    x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

    f1_list, precision_list = [], []
    for train, test in sfolder.split(x,y):
        # Only the training part is augmented; scoring uses the original fold.
        x_fit = np.concatenate([x[train], x_extra])
        y_fit = np.concatenate([y[train], y_extra])

        regr = make_pipeline(SVC(probability = True))
        regr.fit(x_fit, y_fit)
        pred = regr.predict(x[test])

        f1 = f1_score(y[test].reshape(-1,1), pred.reshape(-1,1), average='micro')
        p_score = precision_score(y[test], pred, average='micro')

        f1_list.append(f1)
        precision_list.append(p_score)

    f1_ts, precision_ts = torch.Tensor(f1_list), torch.Tensor(precision_list)

    f1_score_mean.append(round(f1_ts.mean().item(),4))
    f1_score_std.append(round(f1_ts.std().item(),4))

# Display the mean score per extra-sample size.
f1_score_mean

[0.6519, 0.6614, 0.6802, 0.6802, 0.6709, 0.6995, 0.7085, 0.7082, 0.7455, 0.736]

In [29]:
# Learning-curve experiment on one fixed fold: for every extra-sample size,
# repeat the random draw with 100 different seeds and record the mean and std
# of the resulting scores.
sample_list = list(range(10, 201, 10))

f1_score_mean = []
f1_score_std = []

# Fold used for every run in this cell.
idx = 1
train, test = splits[idx]

for sample_num in sample_list:
    f1_list, precision_list = [], []
    # Note: this loop rebinds the notebook-level name `seed`.
    for seed in range(50,150):

        random.seed(seed)

        idx_p = random.sample(idx_positive, sample_num)
        idx_n = random.sample(idx_negtive, sample_num)

        x_extra = np.concatenate([x_positive[idx_p],x_negtive[idx_n]])
        y_extra = np.concatenate([y_positive[idx_p],y_negtive[idx_n]])

        # Shuffle features and labels of the extra rows with one permutation.
        idx_s = list(range(len(y_extra)))
        random.shuffle(idx_s)
        x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

        # Only the training part is augmented; scoring uses the original fold.
        x_fit = np.concatenate([x[train], x_extra])
        y_fit = np.concatenate([y[train], y_extra])

        regr = make_pipeline(SVC(probability = True))
        regr.fit(x_fit, y_fit)
        pred = regr.predict(x[test])

        f1 = f1_score(y[test].reshape(-1,1), pred.reshape(-1,1), average='micro')
        p_score = precision_score(y[test], pred, average='micro')

        f1_list.append(f1)
        precision_list.append(p_score)

    f1_ts, precision_ts = torch.Tensor(f1_list), torch.Tensor(precision_list)

    f1_score_mean.append(round(f1_ts.mean().item(),4))
    f1_score_std.append(round(f1_ts.std().item(),4))

print(f1_score_mean)
print(f1_score_std)

[0.7552, 0.7616, 0.7736, 0.7756, 0.7784, 0.7828, 0.7772, 0.7828, 0.7792, 0.7868, 0.794, 0.7932, 0.7916, 0.7928, 0.798, 0.8012, 0.7996, 0.798, 0.7984, 0.8008]
[0.0378, 0.0465, 0.0374, 0.0465, 0.0419, 0.0419, 0.0387, 0.0407, 0.0442, 0.0433, 0.0366, 0.036, 0.0347, 0.0387, 0.0383, 0.0375, 0.0392, 0.0395, 0.0331, 0.0368]


In [30]:
# Build one row per extra-sample size: (size, mean score, std of the score).
# Each row takes the std belonging to its own size via f1_score_std[i];
# passing the whole f1_score_std list would store the full list in every
# row's 'std' cell.
outputs = pd.DataFrame(
    data=[
        [sample_list[i], f1_score_mean[i], f1_score_std[i]]
        for i in range(len(sample_list))
    ],
    columns=['sample', 'mean', 'std'],
)
# Write the table without the index column.
outputs.to_csv('../log/SD_BTK.csv', index=False)