In [None]:
import numpy
import pandas
import tensorflow
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import SimilarityMaps
from rdkit import Chem, DataStructs
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE

# Seed NumPy's global random generator so repeated runs draw the same numbers.
# (An alternative seed, 10110, was left commented out by the author.)
seed = 12061204
numpy.random.seed(seed)

# Load the list of entries found in the input folder.
import os
path_dir = './input'
file_list = os.listdir(path_dir)
i = 0
# Independent copy of the directory listing; this is what the entry point
# iterates over.
input_files = list(file_list)
        
        
def create_deep_learning_model():
    """Build and compile the fully connected binary classifier.

    The network expects 2048 input features and stacks Dense layers of
    2048, 1024 and 100 ReLU units in front of a single sigmoid output unit.
    It is compiled with binary cross-entropy loss, the Adam optimizer and
    accuracy as the reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    # For every Dense layer the first argument is the number of output
    # neurons, ``input_dim`` is the number of input neurons and
    # ``activation`` is the activation function.
    layer_stack = (
        Dense(2048, input_dim=2048, kernel_initializer='normal', activation='relu'),
        Dense(1024, activation='relu'),
        Dense(100, activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    )
    for layer in layer_stack:
        network.add(layer)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Location of the accumulated per-input summary and of the empty template it
# starts from.
_RESULT_PATH = "./result.csv"
_RESULT_TEMPLATE_PATH = 'result_tmp.csv'
_MODEL_DIR = "./models"


def _load_summary_table():
    """Return the summary table that the next result column is added to.

    If ``./result.csv`` already exists it is read back (its first column is
    the index written by ``to_csv``), so columns written for earlier input
    files are kept. Otherwise the table starts from ``result_tmp.csv``.
    """
    if os.path.exists(_RESULT_PATH):
        return pandas.read_csv(_RESULT_PATH, index_col=0)
    return pandas.read_csv(_RESULT_TEMPLATE_PATH)


def _fingerprint_matrix(smiles_values, radius=2, n_bits=2048):
    """Turn SMILES strings into a 2-D NumPy array of Morgan fingerprint bits.

    Args:
        smiles_values: iterable of SMILES strings, one per compound.
        radius: Morgan fingerprint radius.
        n_bits: length of each bit vector; it has to equal the ``input_dim``
            of the network built by ``create_deep_learning_model``.

    Returns:
        Array with one row per compound and ``n_bits`` columns.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    rows = []
    for position, smiles in enumerate(smiles_values):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None; fail
            # with the offending entry instead of an opaque RDKit error.
            raise ValueError(
                "Cannot parse SMILES %r at row %d" % (smiles, position))
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        bits = numpy.zeros((1,))
        # RDKit resizes ``bits`` in place to hold the whole fingerprint.
        DataStructs.ConvertToNumpyArray(bit_vector, bits)
        rows.append(bits)
    return numpy.array(rows)


def main(input_name):
    """Train, cross-validate and save a classifier for one input file.

    The tab-separated file ``path_dir/input_name`` must provide a ``Smiles``
    and a ``Type`` column. Each compound is converted to a Morgan
    fingerprint, the minority class is oversampled with SMOTE, the network
    from ``create_deep_learning_model`` is evaluated with stratified 5-fold
    cross-validation, and a final model is fitted on the resampled data and
    saved to ``./models/<input_name>_model.h5``.

    A six-row column named after the input file (with ``"(input).csv"``
    removed) is added to the summary table and written to ``./result.csv``.
    The rows are: total compounds, ligands (``Type == 1``), non-ligands,
    mean cross-validated accuracy in percent, ``tpr`` and ``fpr``.
    Columns written for other input files are kept, including ones left in
    ``./result.csv`` by an earlier run.

    Args:
        input_name: file name (not path) of the input inside ``path_dir``.

    Raises:
        ValueError: if a SMILES string in the input cannot be parsed.
    """
    dataframe = pandas.read_csv(path_dir +'/'+input_name, sep="\t")
    # Loaded up front so a missing template fails before any training starts.
    summary_table = _load_summary_table()

    total_chemicals = len(dataframe)
    # Number of compounds whose Type is 1, i.e. the number of ligands.
    ligand_count = int((dataframe['Type'] == 1).sum())

    np_fps_array = _fingerprint_matrix(dataframe['Smiles'])

    # Encode the class labels as consecutive integers.
    # NOTE(review): the original comment claimed "Ligand = 0, not_ligand = 1",
    # but LabelEncoder numbers the sorted distinct values of ``Type``; which
    # code ligands receive depends on the other value(s) in that column.
    encoder = LabelEncoder()
    enc_y = encoder.fit_transform(dataframe['Type'])

    # Oversample the minority class.
    # NOTE(review): ``ratio`` / ``fit_sample`` are the spellings of older
    # imbalanced-learn releases (later renamed ``sampling_strategy`` /
    # ``fit_resample``); kept to match the environment this file targets.
    # NOTE(review): resampling before cross-validation puts synthetic samples
    # into the validation folds, which may make the scores optimistic.
    sm = SMOTE(random_state=12, ratio = 'minority')
    x_train_res, y_train_res = sm.fit_sample(np_fps_array, enc_y)

    # Evaluate the model with stratified 5-fold cross-validation.
    # ``epochs`` is the keyword Keras' fit() understands; the former
    # ``nb_epoch`` spelling was silently dropped by the wrapper, leaving each
    # fold to train for the default single epoch.
    # NOTE(review): the final model below trains for 5 epochs, so the
    # cross-validated scores describe a longer-trained network — confirm
    # which epoch count is intended.
    estimator = KerasClassifier(build_fn=create_deep_learning_model, epochs=100, batch_size=5)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    results = cross_val_score(estimator, x_train_res, y_train_res, cv=kfold)
    mean_accuracy = results.mean()*100
    print("Results: %.2f%% (%.2f%%)" % (mean_accuracy, results.std()*100))

    # Out-of-fold predictions for the confusion matrix
    # (rows: true class, columns: predicted class).
    y_pred = cross_val_predict(estimator, x_train_res, y_train_res, cv=kfold)
    conf_mat = confusion_matrix(y_train_res, y_pred)

    # Fit the model that is actually saved on all resampled data.
    model = create_deep_learning_model()
    model.fit(x_train_res, y_train_res, epochs=5, batch_size=5)
    # Saving fails when the target directory is missing, so create it first.
    os.makedirs(_MODEL_DIR, exist_ok=True)
    model.save("./models/"+input_name+'_model'+'.h5')

    # Collect the model information for the summary table.
    # NOTE(review): these formulas treat encoded class 0 as the positive
    # class. If ``Type`` only holds 0/1, ligands encode to 1 and the two
    # values are really the true-negative and false-negative rates for
    # ligands — confirm the intended orientation.
    tpr = conf_mat[0][0]/ (conf_mat[0][0] + conf_mat[0][1])
    fpr = conf_mat[1][0]/ (conf_mat[1][0] + conf_mat[1][1])
    info_list = [
        total_chemicals,
        ligand_count,
        total_chemicals - ligand_count,
        mean_accuracy,
        tpr,
        fpr,
    ]
    summary_table[input_name.replace("(input).csv","")] = info_list
    summary_table.to_csv(_RESULT_PATH, mode='w')
    

if __name__ == "__main__":
    # Run the whole batch inside a TensorFlow device scope pinned to the
    # first CPU, processing the input files one after another.
    cpu_scope = tensorflow.device('/cpu:0')
    with cpu_scope:
        for input_name in input_files:
            main(input_name)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Results: 95.70% (1.27%)
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Results: 97.93% (1.42%)
Epoch 1/1