In [1]:
from sklearn.ensemble import RandomForestClassifier
#import deepchem as dc
import numpy as np
import pandas as pd
import tempfile
#import chemprop
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.linear_model import LogisticRegression
import xgboost
import os
from sklearn.svm import SVC
from tensorflow.keras.callbacks import EarlyStopping
import joblib

In [2]:
def ROC_AUC(y_true, y_score):
	auc = roc_auc_score(y_true, y_score)
	return auc

In [3]:
fingerprint_list = ['MorganFP', 'RDkitFP', 'AtomPairFP', 'TorsionFP', 'AvalonFP', 
'EstateFP', 'MACCSFP', 'PharmacoErGFP', 'PharmacoPFP', 'PubChemFP', 'MHFP6', 'MAP4']
descriptor_list = ['Property', 'Constitution', 'Autocorr', 'Fragment', 'Charge', 
'Estate', 'MOE', 'Connectivity', 'Topology', 'Kappa', 'Path', 'Matrix', 'InfoContent']

In [4]:
def get_Xye_for_one_fingerprint(feature):
    #feature='AtomPairFP'
    csv_file = "Main.csv"
    df = pd.read_csv(csv_file)
    smiles_col = df.columns[0]
    values_col = df.columns[1]

    csv_feat_file = f"fp/Main_{feature}.csv"
    dff = pd.read_csv(csv_feat_file)

    print('values_col = ',values_col)
    #feats_col = df.columns[2:]

    MASK = -1
    #y = df[values_col].astype('int').fillna(MASK).values
    y = df[values_col].astype('int').fillna(MASK).values
    #X = df[feats_col].values
    X = dff.values
    n_feats = len(dff.columns)

    #read external set
    csv_file_e = "Ext.csv"
    df_e = pd.read_csv(csv_file_e)
    smiles_col = df_e.columns[0]
    values_col = df_e.columns[1]
    csv_feat_file_e = f"fp/Ext_{feature}.csv"
    df_ef = pd.read_csv(csv_feat_file_e)

    MASK = -1
    y_e = df_e[values_col].astype('int').fillna(MASK).values
    X_e = df_ef.values
    return X,y,X_e,y_e

In [5]:
def get_Xye_for_one_descriptor(feature):
    #feature='AtomPairFP'
    csv_file = "Main.csv"
    df = pd.read_csv(csv_file)
    smiles_col = df.columns[0]
    values_col = df.columns[1]

    csv_feat_file = f"fp/Main_{feature}S.csv"
    dff = pd.read_csv(csv_feat_file)

    print('values_col = ',values_col)
    #feats_col = df.columns[2:]

    MASK = -1
    #y = df[values_col].astype('int').fillna(MASK).values
    y = df[values_col].astype('int').fillna(MASK).values
    #X = df[feats_col].values
    X = dff.values
    n_feats = len(dff.columns)

    #read external set
    csv_file_e = "Ext.csv"
    df_e = pd.read_csv(csv_file_e)
    smiles_col = df_e.columns[0]
    values_col = df_e.columns[1]
    csv_feat_file_e = f"fp/Ext_{feature}S.csv"
    df_ef = pd.read_csv(csv_feat_file_e)

    MASK = -1
    y_e = df_e[values_col].astype('int').fillna(MASK).values
    X_e = df_ef.values
    return X,y,X_e,y_e

# run xgboost model

In [7]:
def run_xgb_with_save(feature,num_folds):   
    for i in range(num_folds): 
        #print(f'processing fold: {i}')
        pklf = f'./rand_MorganFP/get_split/split_indices_fold{i}.pckl'
        train_idx, valid_idx, test_idx = pd.read_pickle(pklf)
        print(len(train_idx), len(valid_idx), len(test_idx))
        #get X and Y (main set and external set)
        X_T = X[train_idx];y_T = y[train_idx]
        X_valid = X[valid_idx];y_valid = y[valid_idx]
        X_test = X[test_idx];y_test = y[test_idx] 
        xgboost_model = xgboost.XGBClassifier(gpu_id = 0,tree_method = 'gpu_hist',
  max_depth=5,learning_rate=0.05, n_estimators=3000, gamma=0.0, min_child_weight=5,
  max_delta_step=1, subsample=0.53, colsample_bytree=0.66, colsample_bylevel=1, reg_alpha=0,
  reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=2016)
        
        xgboost_model.fit(X_T, y_T)
        
        y_prob_T =  xgboost_model.predict_proba(X_T)
        train_roc_auc = ROC_AUC(y_T,y_prob_T[:, 1])
        #print('train_roc_auc = ', train_roc_auc)
        y_prob_test =  xgboost_model.predict_proba(X_test)
        test_roc_auc = ROC_AUC(y_test,y_prob_test[:, 1])
        y_prob_valid =  xgboost_model.predict_proba(X_valid)
        valid_roc_auc = ROC_AUC(y_valid,y_prob_valid[:, 1])
        #print('valid_roc_auc, test_roc_auc = ', valid_roc_auc, test_roc_auc)

        y_prob_e =  xgboost_model.predict_proba(X_e)
        e_roc_auc = ROC_AUC(y_e,y_prob_e[:, 1])
        print('e_roc_auc = ', e_roc_auc)
        if not os.path.exists('xgb'):os.mkdir('xgb')
        pd.DataFrame(y_prob_T[:,1],columns=['prob']).to_csv(f'xgb/p_train_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_valid[:, 1],columns=['prob']).to_csv(f'xgb/p_val_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_test[:,1],columns=['prob']).to_csv(f'xgb/p_test_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_e[:,1],columns=['prob']).to_csv(f'xgb/p_Ext_{feature}_fold{i}.csv',index=False) 
        ## save model
        model_file=f"xgb/model_{feature}_fold{i}.job"
        joblib.dump(xgboost_model,model_file,compress=3) 
        ## free gpu memory
        #xgboost_model = ''
        #free_memory_by_numba()
    return

In [8]:
num_folds = 5;
for feature in fingerprint_list:
    X,y,X_e,y_e = get_Xye_for_one_fingerprint(feature)
    print(f'processing feat: {feature}')
    run_xgb_with_save(feature,num_folds)

values_col =  label
processing feat: MorganFP
2769 346 347
e_roc_auc =  0.8363290026374914
2769 346 347
e_roc_auc =  0.8626062322946175
2769 346 347
e_roc_auc =  0.8580476050926378
2769 346 347
e_roc_auc =  0.8632900263749146
2769 346 347
e_roc_auc =  0.8700302823092704
values_col =  label
processing feat: RDkitFP
2769 346 347
e_roc_auc =  0.8046791052066035
2769 346 347
e_roc_auc =  0.8310377389209078
2769 346 347
e_roc_auc =  0.8193806779329882
2769 346 347
e_roc_auc =  0.7831395916772491
2769 346 347
e_roc_auc =  0.7927778320471491
values_col =  label
processing feat: AtomPairFP
2769 346 347
e_roc_auc =  0.9047735339129303
2769 346 347
e_roc_auc =  0.917244635472632
2769 346 347
e_roc_auc =  0.8967959363094657
2769 346 347
e_roc_auc =  0.9051317117645143
2769 346 347
e_roc_auc =  0.8878089283969913
values_col =  label
processing feat: TorsionFP
2769 346 347
e_roc_auc =  0.8144475920679888
2769 346 347
e_roc_auc =  0.8035394484061087
2769 346 347
e_roc_auc =  0.8317052521897692
2769 

In [10]:
num_folds = 5;
for feature in descriptor_list:
    X,y,X_e,y_e = get_Xye_for_one_descriptor(feature)
    print(f'processing feat: {feature}')
    run_xgb_with_save(feature,num_folds)

values_col =  label
processing feat: Property
2769 346 347
e_roc_auc =  0.8150337012796718
2769 346 347
e_roc_auc =  0.8109634984207612
2769 346 347
e_roc_auc =  0.8074794047735339
2769 346 347
e_roc_auc =  0.7902868678974959
2769 346 347
e_roc_auc =  0.7785972452867052
values_col =  label
processing feat: Constitution
2769 346 347
e_roc_auc =  0.8180130897723943
2769 346 347
e_roc_auc =  0.8201621568818991
2769 346 347
e_roc_auc =  0.8650320731985282
2769 346 347
e_roc_auc =  0.8588779264758556
2769 346 347
e_roc_auc =  0.7887076291882388
values_col =  label
processing feat: Autocorr
2769 346 347
e_roc_auc =  0.8937677053824362
2769 346 347
e_roc_auc =  0.8822734525088731
2769 346 347
e_roc_auc =  0.8945166227084759
2769 346 347
e_roc_auc =  0.8442089153723422
2769 346 347
e_roc_auc =  0.836166194523135
values_col =  label
processing feat: Fragment
2769 346 347
e_roc_auc =  0.7331575005698284
2769 346 347
e_roc_auc =  0.6938230601413174
2769 346 347
e_roc_auc =  0.7201328514213148
276

# run fcnn model

In [11]:
# seem the keras model can only run once inside a function, otherwise will get GPU errors.
def run_one_keras_fcnn(feature,i):   
    model = Sequential()
    model.add(Dense(800, input_shape=(n_feats,), activation='relu'),)
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    #print(model.summary)
    pklf = f'./rand_MorganFP/get_split/split_indices_fold{i}.pckl'
    train_idx, valid_idx, test_idx = pd.read_pickle(pklf)
    print(len(train_idx), len(valid_idx), len(test_idx))
    #get X and Y (main set and external set)
    X_T = X[train_idx];y_T = y[train_idx]
    X_valid = X[valid_idx];y_valid = y[valid_idx]
    X_test = X[test_idx];y_test = y[test_idx]     
    y_T_2dim=y_T[:,np.newaxis]  ### change shape from(2769,) to (2769, 1)
    y_valid_2dim = y_valid[:,np.newaxis]
    print(y_T_2dim.shape)
    #model.fit(X_T, y_T_2dim,batch_size=4096,verbose=0, epochs=100)
    ### using early stop by monitor val_loss of X_valid, y_valid_2dim
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    model.fit(X_T, y_T_2dim, batch_size=4096, verbose=0, epochs=100, 
              validation_data=(X_valid, y_valid_2dim), callbacks=[early_stopping])    
    ## prediction
    y_prob_T = model.predict(X_T) 
    train_roc_auc = ROC_AUC(y_T,y_prob_T[:, 0])
    print('train_roc_auc = ', train_roc_auc )
    y_prob_valid=model.predict(X_valid)  ## no need other parameters here
    valid_roc_auc = ROC_AUC(y_valid,y_prob_valid[:, 0])
    y_prob_test=model.predict(X_test)  
    test_roc_auc = ROC_AUC(y_test,y_prob_test[:, 0])
    print('test_roc_auc, valid_roc_auc = ', test_roc_auc, valid_roc_auc)
    y_prob_e=model.predict(X_e)  ## no need other parameters here
    e_roc_auc = ROC_AUC(y_e,y_prob_e[:, 0])
    print('e_roc_auc = ', e_roc_auc)    
    ## save predition
    if not os.path.exists('fcnn'):os.mkdir('fcnn')
    pd.DataFrame(y_prob_T[:,0],columns=['prob']).to_csv(f'fcnn/p_train_{feature}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_valid[:, 0],columns=['prob']).to_csv(f'fcnn/p_val_{feature}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_test[:,0],columns=['prob']).to_csv(f'fcnn/p_test_{feature}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_e[:,0],columns=['prob']).to_csv(f'fcnn/p_Ext_{feature}_fold{i}.csv',index=False) 
    ## save model
    model_save_name = f'fcnn/model_{feature}_fold{i}.h5'
    model.save(model_save_name)    
    model = ''
    ## empty gpu memory
    import torch
    torch.cuda.empty_cache()
    return    

In [12]:
os.environ["CUDA_VISIBLE_DEVICES"]="2"
num_folds = 5
for feature in fingerprint_list:
    csv_feat_file = f"fp/Main_{feature}.csv"
    tdf = pd.read_csv(csv_feat_file,nrows=3)
    n_feats = len(tdf.columns)
    print(f'processing feat: {feature}')
    X,y,X_e,y_e = get_Xye_for_one_fingerprint(feature)
    for i in range(num_folds): 
        run_one_keras_fcnn(feature,i)

processing feat: MorganFP
values_col =  label


2023-01-23 01:25:53.092025: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-23 01:25:54.196618: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8037 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:81:00.0, compute capability: 8.6


2769 346 347
(2769, 1)


2023-01-23 01:25:57.693648: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


train_roc_auc =  0.9946412903166532
test_roc_auc, valid_roc_auc =  0.9484313725490197 0.9276496622408573
e_roc_auc =  0.8613688906255088
2769 346 347
(2769, 1)
train_roc_auc =  0.9972713935399131
test_roc_auc, valid_roc_auc =  0.9013970588235294 0.9433477772428182
e_roc_auc =  0.8419621633942236
2769 346 347
(2769, 1)
train_roc_auc =  0.9948595314060255
test_roc_auc, valid_roc_auc =  0.9548100048100048 0.9490013910969793
e_roc_auc =  0.8425157109830355
2769 346 347
(2769, 1)
train_roc_auc =  0.9960286287858952
test_roc_auc, valid_roc_auc =  0.9070458801498127 0.9112573099415205
e_roc_auc =  0.8240858324378888
2769 346 347
(2769, 1)
train_roc_auc =  0.996082680190564
test_roc_auc, valid_roc_auc =  0.9458401231214919 0.9289052890528905
e_roc_auc =  0.8300120478004624
processing feat: RDkitFP
values_col =  label
2769 346 347
(2769, 1)
train_roc_auc =  0.9697909345240556
test_roc_auc, valid_roc_auc =  0.9517647058823528 0.9226648031679479
e_roc_auc =  0.7915079287551692
2769 346 347
(2769,

e_roc_auc =  0.5847904659568233
2769 346 347
(2769, 1)
train_roc_auc =  0.9875567780510972
test_roc_auc, valid_roc_auc =  0.9575998075998077 0.9440828696343402
e_roc_auc =  0.5798410992803882
2769 346 347
(2769, 1)
train_roc_auc =  0.9859424790482796
test_roc_auc, valid_roc_auc =  0.9023174157303371 0.9028265107212475
e_roc_auc =  0.5602552831233109
2769 346 347
(2769, 1)
train_roc_auc =  0.990748424773321
test_roc_auc, valid_roc_auc =  0.9446179612529423 0.928019680196802
e_roc_auc =  0.5357526619126697
processing feat: EstateFP
values_col =  label
2769 346 347
(2769, 1)
train_roc_auc =  0.852329993614568
test_roc_auc, valid_roc_auc =  0.857328431372549 0.8302119729792686
e_roc_auc =  0.6243691185568688
2769 346 347
(2769, 1)
train_roc_auc =  0.851098145481494
test_roc_auc, valid_roc_auc =  0.7996078431372549 0.8234098325576419
e_roc_auc =  0.6026830777245938
2769 346 347
(2769, 1)
train_roc_auc =  0.8365465701794979
test_roc_auc, valid_roc_auc =  0.8988696488696488 0.8857810015898252

In [13]:
for feature in descriptor_list:
    csv_feat_file = f"fp/Main_{feature}S.csv"
    tdf = pd.read_csv(csv_feat_file,nrows=3)
    n_feats = len(tdf.columns)
    print(f'processing feat: {feature}')
    X,y,X_e,y_e = get_Xye_for_one_descriptor(feature)
    for i in range(num_folds): 
        run_one_keras_fcnn(feature,i)

processing feat: Property
values_col =  label
2769 346 347
(2769, 1)
train_roc_auc =  0.8487169416077508
test_roc_auc, valid_roc_auc =  0.8755637254901959 0.805008152806895
e_roc_auc =  0.7255218000065123
2769 346 347
(2769, 1)
train_roc_auc =  0.8480498334066383
test_roc_auc, valid_roc_auc =  0.8148774509803922 0.8659391215963195
e_roc_auc =  0.7244472664517599
2769 346 347
(2769, 1)
train_roc_auc =  0.8416584975279476
test_roc_auc, valid_roc_auc =  0.8703463203463203 0.8493392289348172
e_roc_auc =  0.7258474162352253
2769 346 347
(2769, 1)
train_roc_auc =  0.8578686285408906
test_roc_auc, valid_roc_auc =  0.8106507490636704 0.7860136452241715
e_roc_auc =  0.7189769138093842
2769 346 347
(2769, 1)
train_roc_auc =  0.852552251421546
test_roc_auc, valid_roc_auc =  0.8416621401412276 0.8023616236162361
e_roc_auc =  0.7091107420793852
processing feat: Constitution
values_col =  label
2769 346 347
(2769, 1)
train_roc_auc =  0.9341913853324942
test_roc_auc, valid_roc_auc =  0.90654411764705

test_roc_auc, valid_roc_auc =  0.8605098605098604 0.8325467011128775
e_roc_auc =  0.6889062550877536
2769 346 347
(2769, 1)
train_roc_auc =  0.7870512013646755
test_roc_auc, valid_roc_auc =  0.7676498127340824 0.7096003898635478
e_roc_auc =  0.7521409267037868
2769 346 347
(2769, 1)
train_roc_auc =  0.7837832334409097
test_roc_auc, valid_roc_auc =  0.7808482708672823 0.7540959409594096
e_roc_auc =  0.7653609455895282
processing feat: Kappa
values_col =  label
2769 346 347
(2769, 1)
train_roc_auc =  0.7854850861114556
test_roc_auc, valid_roc_auc =  0.799485294117647 0.7470300489168413
e_roc_auc =  0.7287942431050762
2769 346 347
(2769, 1)
train_roc_auc =  0.7742539449187519
test_roc_auc, valid_roc_auc =  0.7736274509803921 0.8149842186914888
e_roc_auc =  0.7267754224870567
2769 346 347
(2769, 1)
train_roc_auc =  0.7787634866898854
test_roc_auc, valid_roc_auc =  0.8335978835978836 0.7762321144674085
e_roc_auc =  0.7092898310051774
2769 346 347
(2769, 1)
train_roc_auc =  0.781992239479732

free GPU memory

In [14]:
# free GPU memory
from numba import cuda 
device = cuda.get_current_device()
device.reset()
cuda.close()

LogisticRegression

#  Log_model.predict(X_T) will output 0,1;  while Log_model.predict_proba will output proba with 2 colums. 

In [15]:
def run_log_with_save(feature,num_folds):  
    for i in range(num_folds): 
        #print(f'processing fold: {i}')
        pklf = f'./rand_MorganFP/get_split/split_indices_fold{i}.pckl'
        train_idx, valid_idx, test_idx = pd.read_pickle(pklf)
        print(len(train_idx), len(valid_idx), len(test_idx))
        #get X and Y (main set and external set)
        X_T = X[train_idx];y_T = y[train_idx]
        X_valid = X[valid_idx];y_valid = y[valid_idx]
        X_test = X[test_idx];y_test = y[test_idx] 
        #Log_model = LogisticRegression(
        #  penalty= 'l2', C=1.0, class_weight="balanced", dual=False, fit_intercept=True,
        #  max_iter=100,  multi_class='ovr',  n_jobs=-1,
        #  random_state=None,solver='liblinear',tol=0.0001,verbose=0,warm_start=False)
        # following is the parameters from deepchem 
        Log_model = LogisticRegression( penalty='l2', C=1. / 1. , class_weight="balanced",  n_jobs=2)
        Log_model.fit(X_T, y_T)
        y_prob_T = Log_model.predict_proba(X_T)[:,1]
        train_roc_auc = ROC_AUC(y_T,y_prob_T)
        #print('train_roc_auc = ', train_roc_auc )
        y_prob_valid=Log_model.predict_proba(X_valid)[:,1]  ## no need other parameters here
        valid_roc_auc = ROC_AUC(y_valid,y_prob_valid)
        y_prob_test=Log_model.predict_proba(X_test)[:,1]
        test_roc_auc = ROC_AUC(y_test,y_prob_test)
        #print('test_roc_auc, valid_roc_auc = ', test_roc_auc, valid_roc_auc)
        y_prob_e=Log_model.predict_proba(X_e)[:,1]  ## no need other parameters here
        e_roc_auc = ROC_AUC(y_e,y_prob_e)
        print('e_roc_auc = ', e_roc_auc)
        if not os.path.exists('log'):os.mkdir('log')
        pd.DataFrame(y_prob_T,columns=['prob']).to_csv(f'log/p_train_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_valid,columns=['prob']).to_csv(f'log/p_val_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_test,columns=['prob']).to_csv(f'log/p_test_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_e,columns=['prob']).to_csv(f'log/p_Ext_{feature}_fold{i}.csv',index=False)
        model_file=f"log/model_{feature}_fold{i}.job"
        joblib.dump(Log_model,model_file,compress=3)
    return

In [16]:
num_folds = 5
for feature in fingerprint_list:
    X,y,X_e,y_e = get_Xye_for_one_fingerprint(feature)
    print(f'processing feat: {feature}')
    run_log_with_save(feature,num_folds)    
for feature in descriptor_list:    
    X,y,X_e,y_e = get_Xye_for_one_descriptor(feature)
    print(f'processing feat: {feature}')
    run_log_with_save(feature,num_folds)

values_col =  label
processing feat: MorganFP
2769 346 347
e_roc_auc =  0.7922568460812087
2769 346 347
e_roc_auc =  0.8105239165119988
2769 346 347
e_roc_auc =  0.834847448796848
2769 346 347
e_roc_auc =  0.7827162905799225
2769 346 347
e_roc_auc =  0.8247533457067499
values_col =  label
processing feat: RDkitFP
2769 346 347
e_roc_auc =  0.6560190159877567
2769 346 347
e_roc_auc =  0.6656084139233499
2769 346 347
e_roc_auc =  0.5689980788642507
2769 346 347
e_roc_auc =  0.6137377486893947
2769 346 347
e_roc_auc =  0.6269740483865717
values_col =  label
processing feat: AtomPairFP
2769 346 347
e_roc_auc =  0.8977565041841686
2769 346 347
e_roc_auc =  0.9256618149848589
2769 346 347
e_roc_auc =  0.9070040050796131
2769 346 347
e_roc_auc =  0.9207612907427306
2769 346 347
e_roc_auc =  0.8739051154309532
values_col =  label
processing feat: TorsionFP
2769 346 347
e_roc_auc =  0.6600892188466673
2769 346 347
e_roc_auc =  0.6608055745498356
2769 346 347
e_roc_auc =  0.6302464914851356
2769 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

e_roc_auc =  0.6257367067174627
values_col =  label
processing feat: PharmacoErGFP
2769 346 347
e_roc_auc =  0.8189248152127901
2769 346 347
e_roc_auc =  0.8100843346032366
2769 346 347


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

e_roc_auc =  0.7967991924717528
2769 346 347
e_roc_auc =  0.825941844941552
2769 346 347
e_roc_auc =  0.7952362345739311
values_col =  label
processing feat: PharmacoPFP
2769 346 347
e_roc_auc =  0.5556152518641528
2769 346 347
e_roc_auc =  0.5809970368923187
2769 346 347
e_roc_auc =  0.5049330858649995
2769 346 347
e_roc_auc =  0.489450034189704
2769 346 347
e_roc_auc =  0.5405717820976197
values_col =  label
processing feat: PubChemFP
2769 346 347
e_roc_auc =  0.638321773957214
2769 346 347
e_roc_auc =  0.6293347660447397
2769 346 347
e_roc_auc =  0.6205105662466218
2769 346 347
e_roc_auc =  0.6188010810458793
2769 346 347
e_roc_auc =  0.6470482888867181
values_col =  label
processing feat: MHFP6
2769 346 347
e_roc_auc =  0.8079841099280388
2769 346 347
e_roc_auc =  0.757904333952004
2769 346 347
e_roc_auc =  0.7830419068086353
2769 346 347
e_roc_auc =  0.8116310116896225
2769 346 347
e_roc_auc =  0.7400931262414119
values_col =  label
processing feat: MAP4
2769 346 347
e_roc_auc =  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

processing feat: Constitution
2769 346 347
e_roc_auc =  0.7204259060271563
2769 346 347
e_roc_auc =  0.727442935755918
2769 346 347


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

e_roc_auc =  0.731675946729185
2769 346 347
e_roc_auc =  0.7245612321318096
2769 346 347
e_roc_auc =  0.6906645827228028
values_col =  label
processing feat: Autocorr
2769 346 347
e_roc_auc =  0.8279606655595714
2769 346 347
e_roc_auc =  0.803474325160366
2769 346 347
e_roc_auc =  0.7963107681286835
2769 346 347
e_roc_auc =  0.7762853700628439
2769 346 347
e_roc_auc =  0.8291328839829377
values_col =  label
processing feat: Fragment
2769 346 347
e_roc_auc =  0.44858519748624276
2769 346 347
e_roc_auc =  0.436569958646739
2769 346 347
e_roc_auc =  0.4192960177135229
2769 346 347
e_roc_auc =  0.42401745302985894
2769 346 347
e_roc_auc =  0.428429552928918
values_col =  label
processing feat: Charge
2769 346 347
e_roc_auc =  0.7746898505421511
2769 346 347
e_roc_auc =  0.7888378756797239
2769 346 347
e_roc_auc =  0.7742014261990817
2769 346 347
e_roc_auc =  0.7703754355117058
2769 346 347
e_roc_auc =  0.7898310051772981
values_col =  label
processing feat: Estate
2769 346 347
e_roc_auc = 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

e_roc_auc =  0.7655237537038846
2769 346 347
e_roc_auc =  0.7537038846016085
values_col =  label
processing feat: Topology
2769 346 347
e_roc_auc =  0.6893946794308228
2769 346 347
e_roc_auc =  0.6729673406922602
2769 346 347
e_roc_auc =  0.6733417993552798
2769 346 347
e_roc_auc =  0.6815147666959721
2769 346 347
e_roc_auc =  0.6702484451825079
values_col =  label
processing feat: Kappa
2769 346 347
e_roc_auc =  0.723258767216958
2769 346 347


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

e_roc_auc =  0.7233890137084433
2769 346 347
e_roc_auc =  0.7162905799225032
2769 346 347
e_roc_auc =  0.7170720588714142
2769 346 347
e_roc_auc =  0.7089967763993358
values_col =  label
processing feat: Path
2769 346 347
e_roc_auc =  0.726807984109928
2769 346 347
e_roc_auc =  0.7123180619322067
2769 346 347
e_roc_auc =  0.7094526391195337
2769 346 347
e_roc_auc =  0.6928136498323076
2769 346 347
e_roc_auc =  0.7041450945915144
values_col =  label
processing feat: Matrix
2769 346 347
e_roc_auc =  0.7769040408973983
2769 346 347
e_roc_auc =  0.766370355898538
2769 346 347
e_roc_auc =  0.769870730357201
2769 346 347
e_roc_auc =  0.7413141870990849
2769 346 347
e_roc_auc =  0.7603952981016574
values_col =  label
processing feat: InfoContent
2769 346 347
e_roc_auc =  0.7708964214776464
2769 346 347
e_roc_auc =  0.7478427924847774
2769 346 347
e_roc_auc =  0.7538341310930937
2769 346 347
e_roc_auc =  0.737488196411709
2769 346 347
e_roc_auc =  0.7449610888606689


# svm

In [17]:
def run_svm_with_save(feature,num_folds):  
    for i in range(num_folds): 
        #print(f'processing fold: {i}')
        pklf = f'./rand_MorganFP/get_split/split_indices_fold{i}.pckl'
        train_idx, valid_idx, test_idx = pd.read_pickle(pklf)
        print(len(train_idx), len(valid_idx), len(test_idx))
        #get X and Y (main set and external set)
        X_T = X[train_idx];y_T = y[train_idx]
        X_valid = X[valid_idx];y_valid = y[valid_idx]
        X_test = X[test_idx];y_test = y[test_idx] 
        svm_clf = svm.SVC(C=1.0, coef0=0.0, degree=3, gamma='auto', kernel='linear', probability=True, shrinking=True)
        ## below is from deepchem
        #svm_clf = svm.SVC(C=1.0, gamma=0.05, class_weight="balanced", probability=True)  ## much slower 
        svm_clf.fit(X_T, y_T)
        #y_p_T = svm_clf.predict(X_T)  ## svm_clf.predict(X_T) is diff from svm_clf.predict_proba(X_T)
        y_prob_T = svm_clf.predict_proba(X_T)[:,1]
        train_roc_auc = ROC_AUC(y_T,y_prob_T)
        #print('train_roc_auc = ', train_roc_auc )
        y_prob_valid=svm_clf.predict_proba(X_valid)[:,1]  ## no need other parameters here
        valid_roc_auc = ROC_AUC(y_valid,y_prob_valid)
        y_prob_test=svm_clf.predict_proba(X_test)[:,1]  
        test_roc_auc = ROC_AUC(y_test,y_prob_test)
        #print('test_roc_auc, valid_roc_auc = ', test_roc_auc, valid_roc_auc)
        y_prob_e=svm_clf.predict_proba(X_e)[:,1]  ## no need other parameters here
        e_roc_auc = ROC_AUC(y_e,y_prob_e)
        print('e_roc_auc = ', e_roc_auc)
        if not os.path.exists('svm'):os.mkdir('svm')
        pd.DataFrame(y_prob_T,columns=['prob']).to_csv(f'svm/p_train_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_valid,columns=['prob']).to_csv(f'svm/p_val_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_test,columns=['prob']).to_csv(f'svm/p_test_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_e,columns=['prob']).to_csv(f'svm/p_Ext_{feature}_fold{i}.csv',index=False)
        model_file=f"svm/model_{feature}_fold{i}.job"
        joblib.dump(svm_clf,model_file,compress=3)
    return

In [18]:
from sklearn import svm
num_folds = 5;
for feature in fingerprint_list:
    X,y,X_e,y_e = get_Xye_for_one_fingerprint(feature)
    print(f'processing feat: {feature}')
    run_svm_with_save(feature,num_folds)

values_col =  label
processing feat: MorganFP
2769 346 347
e_roc_auc =  0.7301129888313633
2769 346 347
e_roc_auc =  0.8134707433818502
2769 346 347
e_roc_auc =  0.8417667936569958
2769 346 347
e_roc_auc =  0.7355670606623033
2769 346 347
e_roc_auc =  0.8167269056689784
values_col =  label
processing feat: RDkitFP
2769 346 347
e_roc_auc =  0.5739962879749927
2769 346 347
e_roc_auc =  0.5877698544495457
2769 346 347
e_roc_auc =  0.545602552831233
2769 346 347
e_roc_auc =  0.6196802448634039
2769 346 347
e_roc_auc =  0.5977337110481586
values_col =  label
processing feat: AtomPairFP
2769 346 347
e_roc_auc =  0.853716909250757
2769 346 347
e_roc_auc =  0.9062062453192666
2769 346 347
e_roc_auc =  0.8614340138712514
2769 346 347
e_roc_auc =  0.8965680049493667
2769 346 347
e_roc_auc =  0.8554752368858063
values_col =  label
processing feat: TorsionFP
2769 346 347
e_roc_auc =  0.6192081013317703
2769 346 347
e_roc_auc =  0.5924750089544464
2769 346 347
e_roc_auc =  0.6819380677932989
2769 3

In [19]:
for feature in descriptor_list:
    X,y,X_e,y_e = get_Xye_for_one_descriptor(feature)
    print(f'processing feat: {feature}')
    run_svm_with_save(feature,num_folds)

values_col =  label
processing feat: Property
2769 346 347
e_roc_auc =  0.8000879163817524
2769 346 347
e_roc_auc =  0.767461170264726
2769 346 347
e_roc_auc =  0.7773273419947251
2769 346 347
e_roc_auc =  0.7731268926443294
2769 346 347
e_roc_auc =  0.7657516850639836
values_col =  label
processing feat: Constitution
2769 346 347
e_roc_auc =  0.6987724268177524
2769 346 347
e_roc_auc =  0.7096154472338901
2769 346 347
e_roc_auc =  0.6952557715476541
2769 346 347
e_roc_auc =  0.689427241053694
2769 346 347
e_roc_auc =  0.6710299241314187
values_col =  label
processing feat: Autocorr
2769 346 347
e_roc_auc =  0.8512422259125394
2769 346 347
e_roc_auc =  0.8104425124548209
2769 346 347
e_roc_auc =  0.8316075673211553
2769 346 347
e_roc_auc =  0.8229461756373938
2769 346 347
e_roc_auc =  0.8370779199635309
values_col =  label
processing feat: Fragment
2769 346 347
e_roc_auc =  0.5707238448764287
2769 346 347
e_roc_auc =  0.5507961316792029
2769 346 347
e_roc_auc =  0.5231838754843542
2769

# rf

In [21]:
def run_rf_with_save(feature,num_folds):  
    for i in range(num_folds): 
        #print(f'processing fold: {i}')
        pklf = f'./rand_MorganFP/get_split/split_indices_fold{i}.pckl'
        train_idx, valid_idx, test_idx = pd.read_pickle(pklf)
        print(len(train_idx), len(valid_idx), len(test_idx))
        #get X and Y (main set and external set)
        X_T = X[train_idx];y_T = y[train_idx]
        X_valid = X[valid_idx];y_valid = y[valid_idx]
        X_test = X[test_idx];y_test = y[test_idx] 
        #rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=1)  ## Yang JM's paramters
        #rf_clf = RandomForestClassifier(class_weight="balanced", n_estimators=50, n_jobs=-1) # dc parameters
        rf_clf = RandomForestClassifier(class_weight="balanced", n_estimators=500, n_jobs=-1)  ## my parameters
        rf_clf.fit(X_T, y_T)

        y_prob_T = rf_clf.predict_proba(X_T)[:,1]
        train_roc_auc = ROC_AUC(y_T,y_prob_T)
        #print('train_roc_auc = ', train_roc_auc )
        y_prob_valid=rf_clf.predict_proba(X_valid)[:,1]  ## no need other parameters here
        valid_roc_auc = ROC_AUC(y_valid,y_prob_valid)
        y_prob_test=rf_clf.predict_proba(X_test)[:,1]  
        test_roc_auc = ROC_AUC(y_test,y_prob_test)
        #print('test_roc_auc, valid_roc_auc = ', test_roc_auc, valid_roc_auc)
        y_prob_e=rf_clf.predict_proba(X_e)[:,1]  ## no need other parameters here
        e_roc_auc = ROC_AUC(y_e,y_prob_e)
        print('e_roc_auc = ', e_roc_auc)

        if not os.path.exists('rf'):os.mkdir('rf')
        pd.DataFrame(y_prob_T,columns=['prob']).to_csv(f'rf/p_train_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_valid,columns=['prob']).to_csv(f'rf/p_val_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_test,columns=['prob']).to_csv(f'rf/p_test_{feature}_fold{i}.csv',index=False)
        pd.DataFrame(y_prob_e,columns=['prob']).to_csv(f'rf/p_Ext_{feature}_fold{i}.csv',index=False)
        model_file=f"rf/model_{feature}_fold{i}.job"
        joblib.dump(rf_clf,model_file,compress=3)
    return

In [22]:
num_folds = 5;
for feature in fingerprint_list:
    X,y,X_e,y_e = get_Xye_for_one_fingerprint(feature)
    print(f'processing feat: {feature}')
    run_rf_with_save(feature,num_folds)

values_col =  label
processing feat: MorganFP
2769 346 347
e_roc_auc =  0.8848946631500114
2769 346 347
e_roc_auc =  0.8912441796099118
2769 346 347
e_roc_auc =  0.887597277848328
2769 346 347
e_roc_auc =  0.8874995929797139
2769 346 347
e_roc_auc =  0.8827130344176354
values_col =  label
processing feat: RDkitFP
2769 346 347
e_roc_auc =  0.872228191853082
2769 346 347
e_roc_auc =  0.8716095210185275
2769 346 347
e_roc_auc =  0.8668718048907558
2769 346 347
e_roc_auc =  0.871137377486894
2769 346 347
e_roc_auc =  0.8534238546449155
values_col =  label
processing feat: AtomPairFP
2769 346 347
e_roc_auc =  0.9047246914786233
2769 346 347
e_roc_auc =  0.8936863013252581
2769 346 347
e_roc_auc =  0.8986845104360002
2769 346 347
e_roc_auc =  0.8925303637133275
2769 346 347
e_roc_auc =  0.8859854775161995
values_col =  label
processing feat: TorsionFP
2769 346 347
e_roc_auc =  0.8636807658493699
2769 346 347
e_roc_auc =  0.8682556738627853
2769 346 347
e_roc_auc =  0.8664647846048648
2769 34

In [23]:
for feature in descriptor_list:    
    X,y,X_e,y_e = get_Xye_for_one_descriptor(feature)
    print(f'processing feat: {feature}')
    run_rf_with_save(feature,num_folds)

values_col =  label
processing feat: Property
2769 346 347
e_roc_auc =  0.8247859073296213
2769 346 347
e_roc_auc =  0.8218879228940771
2769 346 347
e_roc_auc =  0.7949106183452184
2769 346 347
e_roc_auc =  0.8169548370290776
2769 346 347
e_roc_auc =  0.7980690957637329
values_col =  label
processing feat: Constitution
2769 346 347
e_roc_auc =  0.8087655888769496
2769 346 347
e_roc_auc =  0.8130637230959592
2769 346 347
e_roc_auc =  0.8102308619061573
2769 346 347
e_roc_auc =  0.8101657386604149
2769 346 347
e_roc_auc =  0.7985738009182377
values_col =  label
processing feat: Autocorr
2769 346 347
e_roc_auc =  0.9273224577512943
2769 346 347
e_roc_auc =  0.9271759304483735
2769 346 347
e_roc_auc =  0.9419426264205009
2769 346 347
e_roc_auc =  0.9223242486405523
2769 346 347
e_roc_auc =  0.9182052033473348
values_col =  label
processing feat: Fragment
2769 346 347
e_roc_auc =  0.6717137182117157
2769 346 347
e_roc_auc =  0.649555533847807
2769 346 347
e_roc_auc =  0.65724007684543
2769 