In [1]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,StratifiedKFold
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import random
import re
import torch

In [2]:
# Read the processed 3CL table from CSV; later cells use its 'label' and
# 'smiles' columns.
fname = './data/processed/3CL_processed_ac.csv'
df = pd.read_csv(filepath_or_buffer=fname)

In [3]:
# Boolean row selectors: rows labelled 1 and rows labelled 0.
p_mask = df['label'].eq(1)
n_mask = df['label'].eq(0)

In [4]:
# Number of rows selected by each mask, one count per line.
print(p_mask.sum(), n_mask.sum(), sep='\n')

84
363


In [5]:
# Fix both global RNGs before drawing anything.
seed = 10
random.seed(seed)
np.random.seed(seed)

# Draw 50 rows from each class (label-1 rows first, then label-0 rows) and
# stack them. No random_state is passed to .sample(), so the draws depend on
# the NumPy global seed set just above.
df_mix = pd.concat([df[mask].sample(50) for mask in (p_mask, n_mask)])

In [6]:
# Shuffle the rows so the two classes are interleaved.
# NOTE(review): this rebinds df_mix to a permutation of itself, so running the
# cell twice shuffles twice.
df_mix = df_mix.sample(n=len(df_mix))
# (disabled) df_mix.to_csv(f'./data/3CL_enzymatic/processed/3CL_enzymatic_processed_ac.csv', index=False)

In [7]:
# Pull the SMILES strings and the labels out of the shuffled frame.
smiles_list, y = df_mix['smiles'].values, df_mix['label'].values

# Cell output: how many of the selected rows carry label 1.
(y == 1).sum()

50

In [8]:
# Build the feature matrix: one MACCS-key bit list per SMILES string, in the
# same order as `smiles_list` (and therefore aligned with `y`).
x = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None. Fail with
        # the offending string instead of passing None on to GenMACCSKeys.
        # Skipping the row is not an option: it would misalign x and y.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    fp = MACCSkeys.GenMACCSKeys(mol)
    x.append(fp.ToList())
x = np.array(x)

# Cell output: (n_molecules, n_bits)
x.shape

(100, 167)

In [9]:
# Stratified 3-fold splitter; shuffling is seeded so that every call to
# sfolder.split(...) yields the same folds.
sfolder = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=5,
)

In [10]:
# Materialise the (train_indices, test_indices) pairs so a single fold can be
# picked by position later on.
splits = [*sfolder.split(x, y)]

In [11]:
# Baseline: fit an SVC on each stratified fold of the balanced set and score
# it on the held-out part.
#
# NOTE(review): both metrics use average='micro'. On a two-class problem,
# micro-averaged F1 and micro-averaged precision both reduce to plain
# accuracy, which is why the two numbers printed per fold coincide.
f1_list, precision_list = [], []

for train, test in sfolder.split(x, y):
    x_fit, y_fit = x[train], y[train]
    y_true = y[test]

    regr = make_pipeline(SVC(probability=True))
    regr.fit(x_fit, y_fit)
    pred = regr.predict(x[test])

    f1 = f1_score(y_true.reshape(-1, 1), pred.reshape(-1, 1), average='micro')
    p_score = precision_score(y_true, pred, average='micro')

    f1_list += [f1]
    precision_list += [p_score]
    print(f"f1: {f1}; precision: {p_score}.")

    # A disabled snippet used to live here: for each fold it wrote the names
    # f"{df_mix['index'].values[idx]}.png" of the train / test rows to
    # ./data/3CL_enzymatic/train.csv and ./data/3CL_enzymatic/test.csv.

# Summarise across folds (torch is used only for mean / std here).
f1_ts = torch.Tensor(f1_list)
precision_ts = torch.Tensor(precision_list)

print(f"F1 mean: {f1_ts.mean()}, std: {f1_ts.std()}.")
print(f"Precision mean: {precision_ts.mean()}, std: {precision_ts.std()}.")


f1: 0.735294117647059; precision: 0.7352941176470589.
f1: 0.48484848484848486; precision: 0.48484848484848486.
f1: 0.5151515151515151; precision: 0.5151515151515151.
F1 mean: 0.5784313678741455, std: 0.1366894543170929.
Precision mean: 0.5784313678741455, std: 0.1366894543170929.


In [None]:
import joblib

# Serialise the current `regr` pipeline to disk.
# NOTE(review): `regr` is rebound on every iteration of the training loops in
# this notebook, so the saved model is whichever one was fitted last before
# this cell ran.
joblib.dump(regr, filename="SVC.m")

# 加入额外的数据

In [12]:
# Extra samples come from two separate CSV files; each provides a 'smiles'
# and a 'label' column.
df_positive = pd.read_csv('./data/positive.csv')
smiles_positive, y_positive = df_positive['smiles'].values, df_positive['label'].values

df_negtive = pd.read_csv('./data/negtive.csv')
smiles_negtive, y_negtive = df_negtive['smiles'].values, df_negtive['label'].values

In [13]:
def _maccs_matrix(smiles_iterable):
    """Return a 2-D array with one row of MACCS-key bits per SMILES string.

    Rows follow the order of ``smiles_iterable``.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings
            (``Chem.MolFromSmiles`` returns ``None`` in that case).
    """
    rows = []
    for smiles in smiles_iterable:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending string instead of passing None on to
            # GenMACCSKeys.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        rows.append(MACCSkeys.GenMACCSKeys(mol).ToList())
    return np.array(rows)


# Positive samples: every row gets label 1. This rebinds y_positive, so any
# value it held before this cell is discarded.
x_positive = _maccs_matrix(smiles_positive)
y_positive = np.array([1] * len(x_positive))

# Negative samples: every row gets label 0 (same remark for y_negtive).
x_negtive = _maccs_matrix(smiles_negtive)
y_negtive = np.array([0] * len(x_negtive))

In [14]:
# Row-index pools that random.sample draws from in the sweeps below.
idx_positive = list(range(len(x_positive)))
idx_negtive = list(range(len(x_negtive)))

In [None]:
# Sweep: add `sample_num` extra positives and `sample_num` extra negatives to
# the training part of every stratified fold, then score on the held-out part.
sample_list = list(range(5, 51, 5))  # 5, 10, ..., 50

f1_score_mean, f1_score_std = [], []
for sample_num in sample_list:
    f1_list, precision_list = [], []

    for train, test in sfolder.split(x, y):
        # Re-seeding inside the fold loop means every fold is augmented with
        # the same extra rows, in the same shuffled order.
        random.seed(100)

        idx_p = random.sample(idx_positive, sample_num)
        idx_n = random.sample(idx_negtive, sample_num)

        x_extra = np.concatenate([x_positive[idx_p], x_negtive[idx_n]])
        y_extra = np.concatenate([y_positive[idx_p], y_negtive[idx_n]])

        # Shuffle the extra rows with one shared permutation so features and
        # labels stay aligned.
        idx_s = list(range(len(y_extra)))
        random.shuffle(idx_s)
        x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

        x_fit = np.concatenate([x[train], x_extra])
        y_fit = np.concatenate([y[train], y_extra])

        regr = make_pipeline(SVC(probability=True))
        regr.fit(x_fit, y_fit)
        pred = regr.predict(x[test])

        f1 = f1_score(y[test].reshape(-1, 1), pred.reshape(-1, 1), average='micro')
        p_score = precision_score(y[test], pred, average='micro')

        f1_list += [f1]
        precision_list += [p_score]

    # Per-sample-size summary across the folds, rounded to 4 decimals.
    f1_ts = torch.Tensor(f1_list)
    precision_ts = torch.Tensor(precision_list)

    f1_score_mean += [round(f1_ts.mean().item(), 4)]
    f1_score_std += [round(f1_ts.std().item(), 4)]

print(f1_score_mean)
print(f1_score_std)

In [15]:
# Sweep on a single fixed fold: for each extra-sample size, repeat the random
# draw of extra rows under 100 different seeds (50..149) and summarise F1.
sample_list = list(range(10, 201, 10))  # 10, 20, ..., 200

# Fold used for every run of this sweep.
idx = 0
train, test = splits[idx]

f1_score_mean, f1_score_std = [], []
for sample_num in sample_list:
    f1_list, precision_list = [], []

    # NOTE(review): this loop rebinds the notebook-level name `seed`.
    for seed in range(50, 150):
        random.seed(seed)

        idx_p = random.sample(idx_positive, sample_num)
        idx_n = random.sample(idx_negtive, sample_num)

        x_extra = np.concatenate([x_positive[idx_p], x_negtive[idx_n]])
        y_extra = np.concatenate([y_positive[idx_p], y_negtive[idx_n]])

        # Shuffle the extra rows with one shared permutation so features and
        # labels stay aligned.
        idx_s = list(range(len(y_extra)))
        random.shuffle(idx_s)
        x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

        x_fit = np.concatenate([x[train], x_extra])
        y_fit = np.concatenate([y[train], y_extra])

        regr = make_pipeline(SVC(probability=True))
        regr.fit(x_fit, y_fit)
        pred = regr.predict(x[test])

        f1 = f1_score(y[test].reshape(-1, 1), pred.reshape(-1, 1), average='micro')
        p_score = precision_score(y[test], pred, average='micro')

        f1_list += [f1]
        precision_list += [p_score]

    # Per-sample-size summary across the 100 seeds, rounded to 4 decimals.
    f1_ts = torch.Tensor(f1_list)
    precision_ts = torch.Tensor(precision_list)

    f1_score_mean += [round(f1_ts.mean().item(), 4)]
    f1_score_std += [round(f1_ts.std().item(), 4)]

print(f1_score_mean)
print(f1_score_std)

[0.7312, 0.7553, 0.7644, 0.7821, 0.7859, 0.7862, 0.7882, 0.79, 0.7909, 0.7947, 0.8015, 0.8024, 0.7991, 0.8009, 0.8021, 0.8041, 0.8053, 0.8065, 0.8038, 0.8065]
[0.0364, 0.0364, 0.0343, 0.0369, 0.0327, 0.0347, 0.0418, 0.0372, 0.0334, 0.0321, 0.0291, 0.0281, 0.029, 0.0301, 0.0295, 0.0293, 0.0277, 0.0296, 0.0277, 0.0265]


In [16]:
# Write the sweep summary to CSV: one row per extra-sample size with the
# matching mean and standard deviation of F1.
#
# Fix: the previous version stored the *entire* f1_score_std list in every
# row's 'std' cell instead of the value belonging to that row.
# Building the frame column-wise also makes pandas raise if the three lists
# have different lengths (e.g. stale results from another cell).
outputs = pd.DataFrame(
    {
        'sample': sample_list,
        'mean': f1_score_mean,
        'std': f1_score_std,
    }
)
outputs.to_csv('../log/SD_3CL.csv', index=False)