In [18]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,StratifiedKFold
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import random
import re
import torch
import joblib

In [19]:
# Read the processed FGFR4 table; later cells use its 'smiles' and 'label' columns.
fname = './data/processed/FGFR4_processed_ac.csv'
df = pd.read_csv(filepath_or_buffer=fname)

In [20]:
# Boolean row masks for the positive (label 1) and negative (label 0) classes.
p_mask = df['label'].eq(1)
n_mask = df['label'].eq(0)
# Report the class sizes, positives first, one count per line.
print(p_mask.sum(), n_mask.sum(), sep='\n')

59
48


In [21]:
# Seed both the stdlib and the NumPy global RNGs before sampling.
seed = 10
random.seed(seed)
np.random.seed(seed)

# Draw 48 rows from each class (positives first, then negatives) and stack them.
# Unbalanced alternative: df_mix = pd.concat([df[p_mask], df[n_mask]])
positives = df[p_mask].sample(n=48)
negatives = df[n_mask].sample(n=48)
df_mix = pd.concat([positives, negatives])

In [22]:
# Shuffle the rows so the two classes are interleaved (sampling 100% of the frame).
df_mix = df_mix.sample(frac=1.0)

In [23]:
# Pull the SMILES strings and labels out of the shuffled frame.
smiles_list, y = (df_mix[col].values for col in ('smiles', 'label'))
# Number of positive samples (displayed as the cell output).
(y == 1).sum()

48

In [24]:
# Encode every molecule as its MACCS-key bit list; one row per SMILES string.
x = np.array([
    MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles)).ToList()
    for smiles in smiles_list
])
x.shape

(96, 167)

In [25]:
# Stratified 3-fold cross-validation of an SVC on the MACCS features.
# The range covers a single split seed (116); `sfolder` and `splits` are
# reused by later cells.
for s in range(116, 117):
    sfolder = StratifiedKFold(n_splits=3, shuffle=True, random_state=s)
    # Materialise the folds once; with an integer random_state the splitter
    # yields the same folds on every call.
    splits = list(sfolder.split(x, y))

    f1_list, precision_list = [], []
    for train, test in splits:
        # Single-step pipeline: the SVC is fitted on the raw bit vectors.
        regr = make_pipeline(SVC(probability=True))
        regr.fit(x[train], y[train])
        pred = regr.predict(x[test])

        # NOTE: with average='micro' on a single-label problem both metrics
        # reduce to plain accuracy, hence the identical numbers per fold.
        y_true = y[test]
        f1 = f1_score(y_true.reshape(-1, 1), pred.reshape(-1, 1), average='micro')
        p_score = precision_score(y_true, pred, average='micro')

        f1_list.append(f1)
        precision_list.append(p_score)
        print(f"f1: {f1}; precision: {p_score}.")

    # Tensors are used only for their mean()/std() helpers.
    f1_ts, precision_ts = torch.Tensor(f1_list), torch.Tensor(precision_list)

    print("=======>", s)
    print(f"F1 mean: {f1_ts.mean()}, std: {f1_ts.std()}.")
    print(f"Precision mean: {precision_ts.mean()}, std: {precision_ts.std()}.")


f1: 0.46875; precision: 0.46875.
f1: 0.5; precision: 0.5.
f1: 0.625; precision: 0.625.
F1 mean: 0.53125, std: 0.08267972618341446.
Precision mean: 0.53125, std: 0.08267972618341446.


In [None]:
# Refit the SVC on one fixed fold of the earlier stratified split, report its
# scores and persist the fitted pipeline to disk.
idx = 2
train, test = splits[idx]

regr = make_pipeline(SVC(probability=True))
regr.fit(x[train], y[train])
pred = regr.predict(x[test])

# The scores are only printed here. They are deliberately NOT appended to
# f1_list / precision_list: those lists belong to the cross-validation cell,
# and appending would duplicate this fold's score and grow the lists on
# every re-run of this cell.
f1 = f1_score(y[test].reshape(-1, 1), pred.reshape(-1, 1), average='micro')
p_score = precision_score(y[test], pred, average='micro')
print(f"f1: {f1}; precision: {p_score}.")

joblib.dump(regr, "SVC.m")

# 加入额外的数据

In [26]:
# Extra labelled compounds, read from one CSV per class; each file provides
# a 'smiles' and a 'label' column.
df_positive = pd.read_csv('./data/positive.csv')
smiles_positive, y_positive = (df_positive[col].values for col in ('smiles', 'label'))

df_negtive = pd.read_csv('./data/negtive.csv')
smiles_negtive, y_negtive = (df_negtive[col].values for col in ('smiles', 'label'))

In [27]:
def _maccs_matrix(smiles_iterable):
    """Return an array of MACCS-key bit lists, one row per SMILES string.

    Args:
        smiles_iterable: iterable of SMILES strings.

    Returns:
        numpy array built from the per-molecule bit lists.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    rows = []
    for smi in smiles_iterable:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; fail here with
            # the offending string instead of inside GenMACCSKeys.
            raise ValueError(f"Could not parse SMILES: {smi!r}")
        rows.append(MACCSkeys.GenMACCSKeys(mol).ToList())
    return np.array(rows)


# Positive samples: every row is labelled 1.
x_positive = _maccs_matrix(smiles_positive)
y_positive = np.ones(len(x_positive), dtype=int)

# Negative samples: every row is labelled 0.
x_negtive = _maccs_matrix(smiles_negtive)
y_negtive = np.zeros(len(x_negtive), dtype=int)

print(len(x_positive))
print(len(x_negtive))

624
600


In [28]:
# Row-index pools that the sampling cells draw from.
idx_positive = list(range(len(x_positive)))
idx_negtive = list(range(len(x_negtive)))

In [None]:
# Effect of adding `sample_num` extra positives and negatives to every
# training fold. Relies on `sfolder`, `x` and `y` from the earlier cells.
sample_list = list(range(5, 55, 5))  # 5, 10, ..., 50

f1_score_mean, f1_score_std = [], []
for sample_num in sample_list:
    # Same seed for every sample size.
    random.seed(101)

    # Draw the extra rows: positives first, then negatives.
    idx_p = random.sample(idx_positive, sample_num)
    idx_n = random.sample(idx_negtive, sample_num)
    x_extra = np.concatenate((x_positive[idx_p], x_negtive[idx_n]))
    y_extra = np.concatenate((y_positive[idx_p], y_negtive[idx_n]))

    # Shuffle features and labels with one shared permutation.
    idx_s = list(range(len(y_extra)))
    random.shuffle(idx_s)
    x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

    f1_list, precision_list = [], []
    for train, test in sfolder.split(x, y):
        # Train on the fold plus the extra rows; evaluate on the fold's test part only.
        x_fit = np.concatenate((x[train], x_extra))
        y_fit = np.concatenate((y[train], y_extra))
        regr = make_pipeline(SVC(probability=True))
        regr.fit(x_fit, y_fit)
        pred = regr.predict(x[test])

        f1 = f1_score(y[test].reshape(-1, 1), pred.reshape(-1, 1), average='micro')
        p_score = precision_score(y[test], pred, average='micro')
        f1_list.append(f1)
        precision_list.append(p_score)

    f1_ts = torch.Tensor(f1_list)
    precision_ts = torch.Tensor(precision_list)

    # Keep the per-size mean and standard deviation of F1, rounded to 4 decimals.
    f1_score_mean.append(round(f1_ts.mean().item(), 4))
    f1_score_std.append(round(f1_ts.std().item(), 4))

f1_score_mean

In [29]:
# For each extra-sample size, repeat the draw with seeds 0..99 on one fixed
# fold and record the mean / std of the F1 score across the repeats.
sample_list = list(range(10, 210, 10))  # 10, 20, ..., 200

f1_score_mean, f1_score_std = [], []

# Fixed train/test fold taken from the earlier stratified split.
idx = 1
train, test = splits[idx]
x_train, y_train = x[train], y[train]
x_test, y_test = x[test], y[test]

for sample_num in sample_list:
    f1_list, precision_list = [], []
    for seed in range(100):
        random.seed(seed)

        # Draw the extra rows: positives first, then negatives.
        idx_p = random.sample(idx_positive, sample_num)
        idx_n = random.sample(idx_negtive, sample_num)
        x_extra = np.concatenate((x_positive[idx_p], x_negtive[idx_n]))
        y_extra = np.concatenate((y_positive[idx_p], y_negtive[idx_n]))

        # Shuffle features and labels with one shared permutation.
        idx_s = list(range(len(y_extra)))
        random.shuffle(idx_s)
        x_extra, y_extra = x_extra[idx_s], y_extra[idx_s]

        # Train on the fold plus the extra rows; evaluate on the held-out part.
        regr = make_pipeline(SVC(probability=True))
        regr.fit(np.concatenate((x_train, x_extra)), np.concatenate((y_train, y_extra)))
        pred = regr.predict(x_test)

        f1 = f1_score(y_test.reshape(-1, 1), pred.reshape(-1, 1), average='micro')
        p_score = precision_score(y_test, pred, average='micro')
        f1_list.append(f1)
        precision_list.append(p_score)

    f1_ts = torch.Tensor(f1_list)
    precision_ts = torch.Tensor(precision_list)

    # Per-size statistics over the 100 repeats, rounded to 4 decimals.
    f1_score_mean.append(round(f1_ts.mean().item(), 4))
    f1_score_std.append(round(f1_ts.std().item(), 4))

print(f1_score_mean)
print(f1_score_std)

[0.6212, 0.6953, 0.7056, 0.7163, 0.7234, 0.7325, 0.7444, 0.7472, 0.75, 0.7553, 0.7603, 0.7656, 0.7694, 0.77, 0.7759, 0.7803, 0.7759, 0.7841, 0.7897, 0.7925]
[0.0595, 0.0548, 0.0503, 0.0498, 0.0452, 0.0467, 0.0429, 0.0433, 0.0421, 0.0407, 0.041, 0.0436, 0.0441, 0.0425, 0.0385, 0.0386, 0.0412, 0.0402, 0.0404, 0.0348]


In [30]:
# Tabulate one row per sample size: the size, the mean F1 and the F1 standard
# deviation for that size, then write the table to CSV.
# Each column is built from its own per-size list so that 'std' holds the
# single value for that row rather than the whole list of deviations.
outputs = pd.DataFrame({
    'sample': sample_list,
    'mean': f1_score_mean,
    'std': f1_score_std,
})
outputs.to_csv('../log/SD_FGFR4.csv', index=False)