# Fingerprint generator

In [None]:
# -*- Fingerprinter.py -*-
"""
Created Jan 2019

author: Elena Gelzintye / Timothy E H Allen
Code taken from: https://github.com/teha2/chemical_toxicology/tree/master/NeuralNetworks-March2020
"""
#%%

# Import modules

import pandas as pd 
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import MACCSkeys

import os

# Define paths and variables

'''
chemicals_path= binary activity file (.csv)
fingerprints_path= location for output (.csv)
fingerprint_length = length of generated fingerprint
fingerprint_radius = radius of generated fingerprint
'''
# Define ECFP fingerprinting procedure

def get_fingerprint(smiles):
    """Compute Morgan bit-vector fingerprints for a table of SMILES strings.

    Parameters
    ----------
    smiles : pandas.DataFrame
        Table with a 'SMILES' column. One fingerprint row is produced per
        input row, in the same order.

    Returns
    -------
    pandas.DataFrame
        One row per molecule and one integer (0/1) column per fingerprint bit.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings. Previously this
        surfaced as an opaque argument error from the fingerprint call.

    Notes
    -----
    Reads the module-level globals ``fingerprint_radius`` and
    ``fingerprint_length`` (fingerprint settings) and ``receptor`` (used only
    in the progress message); they must be set before calling.
    """
    smiles_strings = list(smiles['SMILES'])
    total = len(smiles_strings)
    bit_rows = []

    for count, smiles_string in enumerate(smiles_strings):
        if count % 1000 == 0:
            print('Now fingerprinting {} of {} for {}'.format(count, total, receptor))

        mol = Chem.MolFromSmiles(smiles_string)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # loudly with the offending row so the input file can be fixed.
            raise ValueError(
                'Could not parse SMILES at row {}: {!r}'.format(count, smiles_string)
            )

        bit_string = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol, radius=fingerprint_radius, nBits=fingerprint_length
        ).ToBitString()

        # '0101...' -> array([0, 1, 0, 1, ...])
        bit_rows.append(np.array(list(bit_string)).astype(int))

    fingerprint_df = pd.DataFrame(bit_rows)

    return fingerprint_df


# Get fingerprints
root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'

# The list of receptors to fingerprint comes from the 'Target' column of this CSV
filename = '{}Targets to calculate 5.csv'.format(root_dir)
receptor_df = pd.read_csv(filename)
receptor_ls = list(receptor_df['Target'])
print(len(receptor_ls))

# Alternative hard-coded selection:
#receptor_ls = ['AChE','ADORA2A','AR','hERG','SERT']

# Fingerprint settings; get_fingerprint() reads fingerprint_radius,
# fingerprint_length and receptor as module-level globals.
fp_ls = [2048]
fingerprint_radius = 2
download = 'aop-wiki-xml-2021-04-01'

for receptor in receptor_ls:
    receptor_name = str(receptor)
    receptor_prefix = root_dir + receptor_name + '/' + receptor_name
    combined_prefix = '/content/drive/My Drive/AOP project/Combined data/' + str(download)

    for length in fp_ls:
        fingerprint_length = length
        length_suffix = str(length) + '.csv'

        train_chemicals_path = receptor_prefix + '_train.csv'
        test_chemicals_path = combined_prefix + ' combined data.csv'
        train_fingerprints_path = receptor_prefix + '_train_fingerprints Morgan ' + length_suffix
        test_fingerprints_path = combined_prefix + ' combined data fingerprints Morgan ' + length_suffix

        #=====================================================================#
        # Training set: fingerprints plus the 'Binary Activity' label column.
        # NOTE(review): fingerprints are only (re)generated when the output
        # file ALREADY exists - confirm this is intended and not an inverted
        # "skip if already done" check.
        if os.path.isfile(train_fingerprints_path):
            print('\ngetting training fingerprints')
            smiles = pd.read_csv(train_chemicals_path)
            fingerprints = get_fingerprint(smiles)
            fingerprints = pd.concat([fingerprints, smiles['Binary Activity']], axis=1)

            # Outputs fingerprints
            fingerprints.to_csv(train_fingerprints_path, index=False)

        #=====================================================================#
        # Combined data set: fingerprints only (no activity labels are appended).
        # NOTE(review): same "only if the file exists" guard as above; the
        # paths here do not depend on the receptor, so this repeats per receptor.
        if os.path.isfile(test_fingerprints_path):
            print('\ngetting test fingerprints')
            smiles = pd.read_csv(test_chemicals_path)
            # get_fingerprint() expects the column to be called 'SMILES'
            smiles = smiles[['Smiles']].rename(columns={'Smiles': 'SMILES'})
            fingerprints = get_fingerprint(smiles)

            # Outputs fingerprints
            fingerprints.to_csv(test_fingerprints_path, index=False)

#Endgame

print("END")

#%% 

# Modeller

In [None]:
# -*- ChAIkeras.py -*-
"""
Created Oct 2019

author: Timothy E H Allen
Code taken from: https://github.com/teha2/chemical_toxicology/tree/master/NeuralNetworks-March2020
"""
#%%

# Import the usual suspects

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow import keras
from tensorflow.keras import layers
from keras import regularizers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.utils import class_weight
import random

# DEFINE INPUTS FOR MODEL TRAINING

# Get test targets
root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'

# Receptors whose test sets the trained model is evaluated on ('Target' column)
filename = '{}Targets to calculate 5.csv'.format(root_dir)
receptor_df = pd.read_csv(filename)
receptor_ls = list(receptor_df['Target'])
#receptor_ls = ['hERG']

# Result accumulators filled by the evaluation loop further down:
# list1 = receptor, list2 = train accuracy, list3 = validation accuracy,
# list4 = test accuracy
list1, list2, list3, list4 = [], [], [], []

# Receptor whose training fingerprints the model is fitted on
train_receptor = 'AChE'
filename = '{0}{1}/{1}_train_fingerprints ECFP4 10000.csv'.format(root_dir, str(train_receptor))
#input_data_training = "/content/drive/My Drive/" + receptor + "_training_fingerprint.csv"
input_data_training = filename

# filename = root_dir + str(receptor) + '/'
# filename = filename + str(receptor) + '_test_fingerprints ECFP4 10000.csv'
# input_data_test = filename

# Random seeds for the shuffle (rng_1) and the train/validation split (rng_2)
rng_1 = random.randrange(1, 1000)
rng_2 = random.randrange(1, 1000)

# Hyperparameters
validation_proportion = 0.25   # fraction of the data held out for validation
beta = 0.1                     # L2 regularisation strength
neurons = 100                  # units per hidden layer
hidden_layers = 2              # number of hidden layers (1, 2 or 3)
LR = 0.001                     # learning rate
epochs = 100                   # training epochs


print("Welcome to ChAI")
print("Dataset loading...")

# Reading The Dataset

def read_dataset(input_data, n_features=10000):
    """Load a fingerprint CSV and split it into features and encoded labels.

    The last column of the file is treated as the label; the feature matrix is
    built from the columns before it.

    Parameters
    ----------
    input_data : str
        Path (or anything ``pandas.read_csv`` accepts) of the fingerprint CSV.
    n_features : int, optional
        Maximum number of leading columns to use as features (default 10000,
        the fingerprint length used throughout this script).

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is the feature matrix as a NumPy array and ``Y``
        is the label column encoded by ``LabelEncoder``.
    """
    df = pd.read_csv(input_data)

    # Exclude the label column explicitly: with a plain [0:n_features] slice a
    # file holding fewer than n_features fingerprint bits would silently leak
    # the label into X. For files with at least n_features feature columns
    # this selects exactly the same columns as before.
    feature_columns = df.columns[:-1][:n_features]
    X = df[feature_columns].values
    print(X)

    y = df[df.columns[-1]]
    print(y)

    # Encode the dependent variable.
    # NOTE(review): the encoder is fitted per file, so a file containing only
    # one class gets that class mapped to 0 - confirm every file has both.
    encoder = LabelEncoder()
    Y = encoder.fit_transform(y)
    return (X, Y)

# Load the training fingerprints (features X, encoded labels Y)
X, Y = read_dataset(input_data_training)

# # Get one molecule from test only
# single_mol = 14
# single_test_x = test_x[single_mol:single_mol+1,:]
# # print(single_test_x.shape)
# # print(single_test_x)
# single_test_y = test_y[single_mol:single_mol+1]
# # print(single_test_y.shape)
# # print(single_test_y)

# Shuffle the dataset

X, Y = shuffle(X, Y, random_state=rng_1)

# Convert the dataset into train and validation sets

train_x, valid_x, train_y, valid_y = train_test_split(
    X, Y, test_size=validation_proportion, random_state=rng_2)

# Inspect the shape of the training and validation data

print("Dimensionality of data:")
print("Train x shape =", train_x.shape)
print("Train y shape =", train_y.shape)
print("Validation x shape =", valid_x.shape)
print("Validation y shape =", valid_y.shape)

# Balanced class weights for training.
# compute_class_weight is called with keywords (positional use is rejected by
# current scikit-learn), and its array result is turned into the
# {class_label: weight} mapping that Keras' `class_weight` argument expects.
_train_classes = np.unique(train_y)
_balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=_train_classes,
                                                      y=train_y)
class_weights = {int(label): float(weight)
                 for label, weight in zip(_train_classes, _balanced_weights)}

#Define the model in keras

print("Constructing model architecture")

# Fail fast on an unsupported depth. Previously this case only printed the
# message and then crashed on an undefined `inputs` (or, in a notebook,
# silently reused a stale graph from an earlier run).
if hidden_layers not in (1, 2, 3):
    raise ValueError("Number of hidden layers outside this model scope, please choose 1, 2 or 3")

# Input layer -> `hidden_layers` identical L2-regularised ReLU layers named
# dense_1, dense_2, ... -> 2-unit softmax output.
inputs = keras.Input(shape=(10000,), name='digits')
x = inputs
for layer_index in range(1, int(hidden_layers) + 1):
    x = layers.Dense(neurons,
                     activation='relu',
                     kernel_initializer='random_uniform',
                     bias_initializer='zeros',
                     kernel_regularizer=regularizers.l2(beta),
                     name='dense_{}'.format(layer_index))(x)
outputs = layers.Dense(2, activation='softmax', name='predictions')(x)

model = keras.Model(inputs = inputs, outputs = outputs)

# `learning_rate` is the supported name of the argument; `lr` is a deprecated alias
model.compile(optimizer=keras.optimizers.Adam(learning_rate=LR),
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

print('Commencing model training...')

# Fit on the training split. The validation split is passed along so that
# validation loss and metrics are reported at the end of every epoch.
# `history` keeps the per-epoch record of losses and metrics.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(valid_x, valid_y),
    class_weight=class_weights,
    epochs=epochs,
    batch_size=128,
)

# Score the fitted model on the data it was trained on ...
print('\n# Evaluate on training data')
train_results = model.evaluate(x=train_x, y=train_y, batch_size=128)
print('train loss, train acc:', train_results)

# ... and on the held-out validation split
print('\n# Evaluate on validation data')
validation_results = model.evaluate(x=valid_x, y=valid_y, batch_size=128)
print('validation loss, validation acc:', validation_results)

# Save the model
def get_accuracy(cm):
    """Return the overall accuracy encoded in a confusion matrix.

    Accuracy is the sum of the diagonal (correct predictions) divided by the
    sum of all entries. This works for any square matrix, including the 1x1
    matrix produced when only a single class is present.

    Parameters
    ----------
    cm : array-like
        Confusion matrix with true classes as rows and predicted classes as
        columns.

    Returns
    -------
    float
        Fraction of samples on the diagonal.
    """
    cm = np.asarray(cm)
    return np.trace(cm) / cm.sum()


root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
np.set_printoptions(precision=2)

# Training and validation predictions depend only on the fitted model, not on
# the receptor being tested, so they are computed once instead of per receptor.
pred_train_y = model.predict(train_x, verbose=1)
pred_valid_y = model.predict(valid_x, verbose=1)

# Define predicted classes using argmax
pred_train_y_binary = np.argmax(pred_train_y, axis=1)
pred_valid_y_binary = np.argmax(pred_valid_y, axis=1)

# Accuracies from the confusion matrices
train_accuracy = get_accuracy(confusion_matrix(train_y, pred_train_y_binary))
valid_accuracy = get_accuracy(confusion_matrix(valid_y, pred_valid_y_binary))

# Accuracy table location (one file per training receptor)
filename = root_dir + str(train_receptor) + '/Accuracies/'
filename = filename + '_accuracies2 ECFP4 10000.csv'

for receptor in receptor_ls:
    # Save the model
    # NOTE(review): the same trained model is saved once per test receptor
    # under that receptor's name - confirm this duplication is wanted.
    model_path = root_dir + str(train_receptor) + '/Models vs other targets/Models/' + str(receptor) + '_model.h5'
    model.save(model_path)
    print('Model saved to ' + model_path)

    # Load this receptor's test fingerprints and score the model on them
    input_data_test = root_dir + str(receptor) + '/' + str(receptor) + '_test_fingerprints ECFP4 10000.csv'
    test_x, test_y = read_dataset(input_data_test)

    pred_test_y = model.predict(test_x)
    pred_test_y_binary = np.argmax(pred_test_y, axis=1)
    test_accuracy = get_accuracy(confusion_matrix(test_y, pred_test_y_binary))

    # Append all values to lists
    list1.append(receptor)
    list2.append(train_accuracy)
    list3.append(valid_accuracy)
    list4.append(test_accuracy)

    # Get dataframe of accuracies; rewritten after every receptor so partial
    # results survive an interrupted run
    accuracy_df = pd.DataFrame({
        'Receptor': list1,
        'Train_acc': list2,
        'Valid_acc': list3,
        'Test_acc': list4,
    })
    print(accuracy_df)

    accuracy_df.to_csv(filename)



print("END")


# approx 5-10 min per model

# For calculating similarities of specified targets vs all 79 targets

In [None]:
"""
Created Jun 2021

Author: Marcus Wei How Wang
Code available at: https://github.com/Goodman-lab/AD-transferability
Please acknowledge the authors if using the code, whether partially or in full
"""

# For use with model vs other targets from Tim's code
# LATEST CODE UPDATED 20 MAR 2021
# Code for calculating applicability domain metric based on Tanimoto similarity

%%time
# Note google colab disconnects and clears data after 12 hrs of inactivity
# But code still runs in the background even if runtime is disconnected before the 12hr mark
# !pip install -I tensorflow
# !pip install -I keras
# Relevant imports
import numpy as np
import pandas as pd # uses pandas python module to view and analyse data

import time
from time import strftime, gmtime

from collections import Counter

import matplotlib.pyplot as plt

import itertools 

import os
from os import mkdir

from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw.MolDrawing import MolDrawing
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import MACCSkeys
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint

#=======================================================================================#
# READ REQUIRED FILES
# Files contain the SMILES string

# Read csv file containing targets to calculate
# CSV listing the test targets ('Target' column) to compare against
_targets_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
#filename = _targets_dir + 'Targets to calculate 5.csv'
filename = '{}Targets for optimisation.csv'.format(_targets_dir)

# Keep only the 'Target' column
target_df = pd.read_csv(filename)[['Target']]
print(target_df)

error_ls = []

# Targets whose training sets are compared against every target in target_df
train_target_ls = ['AChE']

# Only used as a label inside the output file names
runs = 3

#=======================================================================================#
# Specify filenames for main code
# Training fingerprints
def train_fp_function():
    """Return the path of the training-fingerprint CSV for ``train_target``.

    Reads the module-level global ``train_target``.
    """
    base_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    target_name = str(train_target)
    return f'{base_dir}{target_name}/{target_name}_train_fingerprints ECFP4 10000.csv'

# Folder to contain all data for test targets
# Each target is put in an individual folder
def test_target_folder_function():
    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    root_dir = root_dir + str(train_target) + '/Models vs other targets/'
    foldername = target
    test_target_folder = root_dir + foldername
    return test_target_folder

# Test fingerprints
def test_fp_function():
    test_fp_filename = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    test_fp_filename = test_fp_filename + str(target) + '/'
    test_fp_filename = test_fp_filename + str(target) + '_test_fingerprints ECFP4 10000.csv'
    return test_fp_filename

# Save file containing train,test,average similarity indicator values per target
def indicator_function():
    """Return the CSV path for one target's similarity-indicator values.

    The file name encodes the current settings. Reads the module-level
    globals ``train_target``, ``target``, ``sample_size``, ``runs``,
    ``nBits``, ``T_sim_threshold`` and ``mol_threshold``.
    """
    base_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    target_dir = f'{base_dir}{train_target!s}/Models vs other targets/{target!s}/'
    settings_tag = (
        f'NEW_{sample_size!s}data_{runs!s}_{nBits!s}bits_'
        f'{T_sim_threshold!s}T_threshold_{mol_threshold!s}mol_sim_indicator.csv'
    )
    return f'{target_dir}{target!s}_{settings_tag}'

# Create folder to contain combined indicator values
def combined_indicator_folder_function():
    """Return the 'Combined' sub-folder of the current test ``target``.

    Reads the module-level globals ``train_target`` and ``target``.
    """
    base_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    return f'{base_dir}{train_target!s}/Models vs other targets/{target!s}/Combined'

# Combined indicator values file per target for all thresholds
def combined_indicator_function():
    """Return the CSV path for one target's combined indicator values.

    The file sits in the target's 'Combined' folder and its name encodes the
    current settings. Reads the module-level globals ``train_target``,
    ``target``, ``sample_size``, ``runs``, ``nBits``, ``T_sim_threshold`` and
    ``mol_threshold``.
    """
    base_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    combined_dir = f'{base_dir}{train_target!s}/Models vs other targets/{target!s}/Combined/'
    settings_tag = (
        f'NEW_{sample_size!s}data_{runs!s}_{nBits!s}bits_'
        f'{T_sim_threshold!s}T_threshold_{mol_threshold!s}mol_sim_indicator.csv'
    )
    return f'{combined_dir}{target!s}_Combined_{settings_tag}'

# For saving all results in code
def all_results_function():
    """Return the CSV path that collects the results of all test targets.

    The file sits in the '- All results' folder of the training target. Reads
    the module-level globals ``train_target``, ``sample_size``, ``runs``,
    ``nBits``, ``T_sim_threshold`` and ``mol_threshold``.
    """
    base_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    results_dir = f'{base_dir}{train_target!s}/Models vs other targets/- All results/'
    settings_tag = (
        f'NEW_{sample_size!s}data_{runs!s}_{nBits!s}bits_'
        f'{T_sim_threshold!s}T_threshold_{mol_threshold!s}mol_sim_indicator.csv'
    )
    return f'{results_dir}all_df_{settings_tag}'

#=======================================================================================#

# Main code
# Private helpers for the main loop -------------------------------------------

def _print_current_time():
    """Print the current UTC time as HH:MM:SS."""
    print('Current time:')
    print(strftime("%H:%M:%S", gmtime()))


def _print_elapsed(template, started_at):
    """Fill `template` with the minutes and seconds elapsed since `started_at`."""
    elapsed = time.time() - started_at
    minutes = elapsed // 60
    seconds = elapsed - (minutes * 60)
    print(template.format(minutes, seconds))


def _load_shuffled_fingerprints(csv_path):
    """Read a fingerprint CSV and return its rows in random order, re-indexed."""
    fp_df = pd.read_csv(csv_path)
    fp_df = fp_df.sample(frac=1).reset_index(drop=True)
    print(fp_df.shape)
    return fp_df


def _sample_bitvectors(fp_df, n_rows, n_bits):
    """Turn the first `n_rows` rows of `fp_df` into bit vectors plus labels.

    The last column must be 'Binary Activity' (the label). At most `n_bits`
    of the columns before it are joined into a bit string per row. Only the
    sampled rows are converted, and the label column is never part of the
    fingerprint even if the file has fewer than `n_bits` feature columns.

    Returns a tuple ``(bitvectors, labels)`` of equally long lists.
    """
    sample = fp_df.iloc[0:n_rows]
    labels = sample.iloc[:, -1:]['Binary Activity'].tolist()
    bit_values = sample.iloc[:, :-1].iloc[:, 0:n_bits].values
    bitvectors = [DataStructs.CreateFromBitString(''.join(row.astype(str)))
                  for row in bit_values]
    return bitvectors, labels


def _count_supported(query_bvs, query_labels, reference_bvs, reference_labels,
                     similarity_threshold, min_neighbours, progress_name):
    """Count query molecules that have enough similar same-label neighbours.

    A query molecule counts when at least `min_neighbours` reference molecules
    share its activity label and have Tanimoto similarity >= `similarity_threshold`.
    The neighbour counter runs over the whole reference set for each query
    molecule; it used to be reset for every reference molecule, so a
    threshold above 1 could never be reached.
    """
    supported = 0
    for index, (query_bv, query_label) in enumerate(zip(query_bvs, query_labels)):
        if index % 100 == 0:
            _print_current_time()
            print('NOW CALCULATING SIMILARITIES FOR {} INDEX {}'.format(progress_name, index))

        neighbours = 0
        for reference_bv, reference_label in zip(reference_bvs, reference_labels):
            # Only same-label molecules can support each other; checking the
            # label first skips needless similarity calculations.
            if reference_label != query_label:
                continue
            if DataStructs.TanimotoSimilarity(query_bv, reference_bv) >= similarity_threshold:
                neighbours += 1
                if neighbours >= min_neighbours:
                    supported += 1
                    break
    return supported


# Settings -------------------------------------------------------------------
# These stay module-level names because the *_function() path builders above
# read them as globals.
sample_size = 500   # molecules taken from each (shuffled) dataset
T_ls = [0.2]        # Tanimoto similarity thresholds to evaluate
mol_ls = [1]        # required numbers of similar molecules to evaluate
nBits = 10000       # fingerprint length

for train_target in train_target_ls:
    # One combined frame per processed target; concatenated into all_df below
    per_target_frames = []

    # Read file 1 (training dataset)
    print ('\nReading file 1...')
    _print_current_time()
    start = time.time()

    train_fp_df = _load_shuffled_fingerprints(train_fp_function())
    # The training sample is the same for every test target, so its bit
    # vectors are built once here rather than inside the target loop.
    train_bit_ls, train_active_ls = _sample_bitvectors(train_fp_df, sample_size, nBits)

    _print_elapsed('File 1 took {} minutes and {} seconds to read', start)

    # Every target in target_df is processed, including train_target itself
    for protein, raw_target in enumerate(target_df['Target']):

        target = str(raw_target)
        print('For train_target: {}'.format(train_target))
        print('Processing target: {}'.format(protein))
        print('Processing target: {}'.format(target))

        # Create folders (including missing parent folders)
        os.makedirs(test_target_folder_function(), exist_ok=True)

        # Read file 2 (test dataset)
        print ('\nReading file 2...')
        _print_current_time()
        start = time.time()

        test_fp_df = _load_shuffled_fingerprints(test_fp_function())

        _print_elapsed('File 2 took {} minutes and {} seconds to read', start)

        #=========================================================================================#

        print('\nPROCESSING AND SAMPLING DATA FROM BOTH FILES...')
        print('\nFILES PROCESSED AND SAMPLED')
        #=========================================================================================#
        print('\nCREATING BITVECT DFs FOR BOTH TRAINING AND TEST SAMPLES...')

        test_bit_ls, test_active_ls = _sample_bitvectors(test_fp_df, sample_size, nBits)

        print('\nBITVECT DFs CREATED')
        #=========================================================================================#

        print('\nCALCULATING TRAINING TANIMOTO SIMILARITIES...')

        # Per-setting results for this target
        indicator_frames = []
        sim_ls = []
        mol_threshold_ls = []
        temp_ls = []
        train_count_ls = []
        test_count_ls = []

        for element in T_ls:

            T_sim_threshold = float(element)

            for ele in mol_ls:
                print('\nNOW CALCULATING FOR THRESHOLD: {}'.format(element))
                print('\nNOW CALCULATING FOR nMOL: {}'.format(ele))
                print('\nNOW CALCULATING FOR TARGET: {}'.format(target))

                mol_threshold = int(ele)

                # Share of training molecules supported by the test sample
                sim_count = _count_supported(train_bit_ls, train_active_ls,
                                             test_bit_ls, test_active_ls,
                                             T_sim_threshold, mol_threshold, 'TRAINING')

                print('\nTRAINING TANIMOTO SIMILARITIES CALCULATED')

                #=========================================================================================#
                print('\nCALCULATING TRAINING SIMILARITY INDICATOR...')

                train_indicator = sim_count / len(train_bit_ls) * 100

                print('\nTRAINING SIMILARITY INDICATOR CALCULATED')
                #=========================================================================================#

                print('\nCALCULATING TEST TANIMOTO SIMILARITIES...')

                # Share of test molecules supported by the training sample
                sim_count = _count_supported(test_bit_ls, test_active_ls,
                                             train_bit_ls, train_active_ls,
                                             T_sim_threshold, mol_threshold, 'TEST')

                print('\nTEST TANIMOTO SIMILARITIES CALCULATED')
                #=========================================================================================#
                print('\nCALCULATING TEST SIMILARITY INDICATOR...')

                test_indicator = sim_count / len(test_bit_ls) * 100

                print('\nTEST SIMILARITY INDICATOR CALCULATED')

                #=========================================================================================#
                print('\nCALCULATING AVERAGE SIMILARITY INDICATOR...')

                average_indicator = (train_indicator + test_indicator) / 2

                indicator_df = pd.DataFrame(
                    [[train_indicator, test_indicator, average_indicator]],
                    columns=['Train', 'Test', 'Average'])

                print(indicator_df)

                print('\nAVERAGE SIMILARITY INDICATOR CALCULATED')

                #=========================================================================================#

                print ('\nSAVING FILES...')
                start = time.time()
                _print_current_time()

                indicator_df.to_csv(indicator_function())

                _print_elapsed('Files took {} minutes and {} seconds to save', start)
                print ('\n#=========================================================================#')

                # Keep the result in memory for collation instead of reading
                # the file just written back from disk
                indicator_frames.append(indicator_df)
                sim_ls.append(T_sim_threshold)
                mol_threshold_ls.append(mol_threshold)
                temp_ls.append(target)
                train_count_ls.append(len(train_bit_ls))
                test_count_ls.append(len(test_bit_ls))

        # Collate all indicator results for this target
        print('\nCOLLATING ALL FILES...')
        combined_df = pd.concat(indicator_frames, axis = 0)
        print('\nALL FILES COLLATED')
        #===========================================================================#
        # Process df and save

        print('\nSAVING DF FOR TARGET {}'.format(protein))
        combined_df['Sim'] = sim_ls
        combined_df['No. of mol'] = mol_threshold_ls
        combined_df['Target'] = temp_ls
        combined_df['train_target count'] = train_count_ls
        combined_df['test_target count'] = test_count_ls

        print(combined_df)

        print ('\nSAVING FILES...')
        start = time.time()
        _print_current_time()

        # Create folder in google drive
        os.makedirs(combined_indicator_folder_function(), exist_ok=True)

        combined_df.to_csv(combined_indicator_function())

        per_target_frames.append(combined_df)
        all_df = pd.concat(per_target_frames, axis=0)

        _print_elapsed('Files took {} minutes and {} seconds to save', start)
        print ('\n#=========================================================================#')

        # Save all_df (combined df with all calculated targets for easy copying);
        # rewritten after every target so partial results survive an interruption
        print('\nSAVING ALL RESULTS FILE...')

        all_results_path = all_results_function()
        os.makedirs(os.path.dirname(all_results_path), exist_ok=True)
        all_df.to_csv(all_results_path)
        print('\nALL RESULTS FILE SAVED')

print('\nFINISHED')

# For calculating similarities of specified targets vs all 79 targets (with Multiprocessing)

In [None]:
"""
Created Jun 2021

Author: Marcus Wei How Wang
Code available at: https://github.com/Goodman-lab/AD-transferability
Please acknowledge the authors if using the code, whether partially or in full
"""
# For use with model vs other targets from Tim's code
# LATEST CODE UPDATED 29 Apr 2021
# Code for calculating applicability domain metric based on Tanimoto similarity

%%time
# Note google colab disconnects and clears data after 12 hrs of inactivity
# But code still runs in the background even if runtime is disconnected before the 12hr mark

# Relevant imports
import numpy as np
import pandas as pd # uses pandas python module to view and analyse data

import time
from time import strftime, gmtime

import itertools

import multiprocessing
from multiprocessing import Process, Queue
from multiprocessing import Pool

import os
from os import mkdir

from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import AllChem


#=======================================================================================#
# LOAD THE TARGET LIST
print('\nSETTING UP TARGETS...')

# Test targets: the CSV's 'Target' column names every target to compare against.
# (Alternative list kept by the author: 'Targets for optimisation.csv'.)
filename = ''.join([
    '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/',
    'Targets to calculate 5.csv',
])

target_df = pd.read_csv(filename)[['Target']]
print(target_df)

# Training targets to process in this run.
# Other selections used by the author: 'AChE', 'ADORA2A', 'AR', 'hERG' (alone or combined).
train_target_ls = ['SERT']

# Checkpoint settings handed to Checkpoint() further down:
# `state` is 'Yes' to resume from the saved all-results file of
# `checkpoint_target`, or 'No' to start from scratch.
checkpoint_target = 'SERT'
state = 'Yes'
#===================================================================================================#
# Define some functions

# Function splits a df into actives and inactives and returns two dfs
def active_inactive_split(df):
    """Split a DataFrame into its active and inactive rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose LAST column is named 'Binary Activity'. Rows where that
        column equals 1 are active; every other row is inactive.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(active_df, inactive_df)``. Both keep all columns of `df`, preserve
        the original row order and carry a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If the last column of `df` is not named 'Binary Activity'.
    """
    # Look the label up through the last column, as before, so a frame whose
    # label column is not last still fails loudly instead of being mis-split.
    labels = df.iloc[:, -1:]['Binary Activity']

    # Select rows by position with a boolean mask. The previous version
    # collected positions in a Python loop and then called
    # df.drop(df.index[[active_index]]): a nested-list lookup that recent
    # pandas rejects as multi-dimensional indexing, and one that drops by
    # label rather than by position.
    is_active = (labels == 1).to_numpy()

    # Separate actives and inactives into two DataFrames
    active_df = df[is_active].reset_index(drop=True)
    inactive_df = df[~is_active].reset_index(drop=True)

    return active_df, inactive_df

def feature_label_split(df, n_bits=10000):
    """Split a fingerprint DataFrame into bit strings and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first `n_bits` columns hold the fingerprint values and
        whose last column holds the label.
    n_bits : int, optional
        Number of leading columns that make up the fingerprint. Defaults to
        10000, the width previously hard-coded here.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fp, fp_label)``. `fp` has a single 'combined' column containing,
        per row, the fingerprint values concatenated into one string (the
        format CreateBitVect consumes). `fp_label` is the last column of `df`.
        Both share the index of `df`; `df` itself is not modified.
    """
    bit_columns = df.iloc[:, 0:n_bits]

    # Join each row's values into one string. Building a new frame (instead of
    # assigning a column onto the iloc slice) avoids pandas' chained-assignment
    # warning, and the list comprehension also copes with a frame of zero rows.
    # Values are converted row by row to keep peak memory low.
    combined = [''.join(row.astype(str)) for row in bit_columns.to_numpy()]
    fp = pd.DataFrame({'combined': combined}, index=df.index)

    # Get active states as df
    fp_label = df.iloc[:, -1:]

    return fp, fp_label

def CreateBitVect(df):
    """Turn the bit strings in df['combined'] into bit-vector objects.

    Each string is passed to DataStructs.CreateFromBitString. Note the side
    effect: the resulting 'BV' column is also added to the `df` passed in.

    Returns a DataFrame holding only the 'BV' column.
    """
    bit_vectors = df['combined'].apply(DataStructs.CreateFromBitString)
    df['BV'] = bit_vectors
    return df[['BV']]

def Calc_Similarity(bv1,bv2,label1,label2,que,process,threshold=None):
    """Count the rows of `bv1` that have a similar, same-label row in `bv2`.

    A row of `bv1` is counted once if at least one row of `bv2` carries the
    same 'Binary Activity' label and has a Tanimoto similarity greater than or
    equal to the threshold.

    Parameters
    ----------
    bv1, bv2 : pandas.DataFrame
        Frames with a 'BV' column of bit vectors (as built by CreateBitVect).
    label1, label2 : pandas.DataFrame
        Frames with a 'Binary Activity' column, row-aligned with `bv1` and
        `bv2` respectively.
    que : queue-like
        Receives the final count through ``que.put`` so a parent process can
        read it.
    process : any
        Identifier used only in the progress messages.
    threshold : float, optional
        Similarity cut-off. When omitted, the module-level `T_sim_threshold`
        is used, which is how existing callers supply it.

    Returns
    -------
    int
        The number of rows of `bv1` that found a match.
    """
    limit = T_sim_threshold if threshold is None else threshold

    # Pull the columns out of pandas once. Looking values up with
    # .iloc[i][column] inside the nested loop costs a pandas indexing call for
    # every pair of molecules.
    vectors1 = bv1['BV'].tolist()
    vectors2 = bv2['BV'].tolist()
    labels1 = label1['Binary Activity'].tolist()
    labels2 = label2['Binary Activity'].tolist()

    sim_count = 0

    for x, bv1_bv in enumerate(vectors1):
        label1_label = labels1[x]

        if x % 500 == 0:
            print ('Current time:')
            print(strftime("%H:%M:%S", gmtime()))
            print('NOW CALCULATING SIMILARITIES FOR PROCESS {} INDEX {}'.format(process,x))

        for y, bv2_bv in enumerate(vectors2):
            # Compare the labels first: the similarity only matters for
            # same-label pairs, so it is not computed for the others.
            if label1_label == labels2[y] and DataStructs.TanimotoSimilarity(bv1_bv,bv2_bv) >= limit:
                # One match is enough for this molecule
                sim_count += 1
                break

    que.put(sim_count)
    return sim_count

#===================================================================================================#
# IF starting from incomplete file/data ie. checkpoint:

def Checkpoint(checkpoint_target, state):
    """Work out where the main loop should start, optionally from saved results.

    Parameters
    ----------
    checkpoint_target : str
        Training target whose saved all-results CSV is loaded when resuming.
    state : str
        'Yes' to resume from the saved file, 'No' to start from scratch.

    Returns
    -------
    tuple
        ``(initial, all_df)``. For 'No' this is ``(0, 0)``. For 'Yes',
        `all_df` is the saved results without their last row and `initial`
        is ``len(all_df) + 1``.

    Raises
    ------
    ValueError
        If `state` is neither 'Yes' nor 'No' (this used to surface as an
        UnboundLocalError at the return statement).
    """
    # Must match the values the main loop uses when it names the results file
    nBits = 10000
    T_sim_threshold = 0.3

    if state == 'No':
        return 0, 0

    if state != 'Yes':
        raise ValueError("state must be 'Yes' or 'No', got {!r}".format(state))

    filename = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    filename = filename + str(checkpoint_target) + '/Models vs other targets/'
    filename = filename + '- All results/' + 'all_df_' + 'NEW_multialldata_'
    filename = filename + str(nBits) + 'bits_' + str(T_sim_threshold) + 'T_threshold_mol_sim_indicator.csv'

    all_df = pd.read_csv(filename)
    # 'Unnamed: 0' is the index column written by DataFrame.to_csv
    all_df = all_df.drop(['Unnamed: 0'],axis=1)

    # Drop last row since might be incomplete
    all_df = all_df.iloc[:-1]

    # NOTE(review): the + 1 presumably accounts for the training target's own
    # entry in the target list, which produces no result row. If that entry
    # comes after the resume point, this skips one target - confirm against
    # the target CSV before relying on it.
    initial = len(all_df) + 1

    return initial,all_df

#===================================================================================================#

# For checkpoint
# If not starting from checkpoint, change state to 'No'

initial, all_df = Checkpoint(checkpoint_target,state)
print('\nCheckpoint function return parameters: Initial, All_df')
print(initial,'\n', all_df)

# The checkpoint belongs to `checkpoint_target` only. Keep what was loaded so
# every other training target starts from scratch (see the top of the loop).
checkpoint_initial, checkpoint_df = initial, all_df

ml_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'


def _print_current_time():
    """Print the current time (UTC) as HH:MM:SS."""
    print ('Current time:')
    print(strftime("%H:%M:%S", gmtime()))


def _print_elapsed(template, start):
    """Print `template` filled with the minutes and seconds elapsed since `start`."""
    elapsed = time.time() - start
    minutes = elapsed // 60
    seconds = elapsed - (minutes*60)
    print(template.format(minutes,seconds))


def _read_shuffled_fingerprints(target, subset):
    """Read '<target>_<subset>_fingerprints ECFP4 10000.csv' and shuffle its rows."""
    filename = ml_dir + str(target) + '/'
    filename = filename + str(target) + '_' + subset + '_fingerprints ECFP4 10000.csv'
    fp_df = pd.read_csv(filename)
    return fp_df.sample(frac=1).reset_index(drop=True)


def _result_filename(folder, prefix, nBits, threshold):
    """Build the path of a similarity-indicator CSV inside `folder`."""
    filename = folder + prefix + 'NEW_multialldata_'
    return filename + str(nBits) + 'bits_' + str(threshold) + 'T_threshold_mol_sim_indicator.csv'


def _run_similarity_jobs(jobs):
    """Run Calc_Similarity jobs in parallel processes and return the summed counts.

    `jobs` is a list of ``(process_number, (bv1, bv2, label1, label2))``.
    """
    workers = []
    for number, job_args in jobs:
        que = Queue()
        worker = Process(target= Calc_Similarity, args= tuple(job_args) + (que, number))
        worker.start()
        workers.append((number, worker, que))

    total = 0
    for number, worker, que in workers:
        # Read the result BEFORE joining: the multiprocessing guidelines warn
        # that joining a process whose queued items have not been consumed
        # can deadlock.
        total += que.get()
        worker.join()
        print('\n#=====================================================================#')
        print('Process {} ENDED'.format(number))
        print('\n#=====================================================================#')
    return total


# Start main loop
for train_target in train_target_ls:

    # Resume only the target the checkpoint was saved for. Previously the
    # checkpoint offset and the accumulated all_df leaked into every later
    # training target, because the reset at the end of the loop was
    # overwritten here.
    if isinstance(checkpoint_df, pd.DataFrame) and train_target == checkpoint_target:
        initial, all_df, target_count = checkpoint_initial, checkpoint_df, 1
    else:
        initial, all_df, target_count = 0, 0, 0

    # Folder that receives every result for this training target
    root_dir = ml_dir + str(train_target) + '/Models vs other targets/'

    # Read file 1 (training dataset)
    print ('\nReading file 1...')
    _print_current_time()
    start = time.time()

    train_fp_df = _read_shuffled_fingerprints(train_target, 'train')
    print (train_fp_df.shape)

    # Split training data into actives and inactives
    print('\nSPLITTING TRAINING DATA...')
    train_active, train_inactive = active_inactive_split(train_fp_df)
    train_active_fp, train_active_label = feature_label_split(train_active)
    train_inactive_fp, train_inactive_label = feature_label_split(train_inactive)
    print('\nTRAINING DATA SPLIT')

    print('\nCREATING BITVECT DFs FOR TRAINING DATA...')
    train_active_bv_df = CreateBitVect(train_active_fp)
    train_inactive_bv_df = CreateBitVect(train_inactive_fp)
    print('\nTRAINING BITVECT DFs CREATED')

    _print_elapsed('File 1 took {} minutes and {} seconds to read', start)

    train_total = len(train_active_bv_df)+len(train_inactive_bv_df)

    for protein in range(initial,len(target_df)):

        target = str(target_df.loc[protein]['Target'])

        print('For train_target: {}'.format(train_target))
        print('Processing target: {}'.format(protein))
        print('Processing target: {}'.format(target))

        # Create the per-target folder (and any missing parent folder)
        target_dir = root_dir + target + '/'
        os.makedirs(target_dir, exist_ok=True)

        # A model is not compared against its own target
        if target == train_target:
            continue

        # Read file 2 (test dataset)
        print ('\nReading file 2...')
        _print_current_time()
        start = time.time()

        test_fp_df = _read_shuffled_fingerprints(target, 'test')
        print (test_fp_df.shape)

        # Split test data into actives and inactives
        test_active, test_inactive = active_inactive_split(test_fp_df)
        test_active_fp, test_active_label = feature_label_split(test_active)
        test_inactive_fp, test_inactive_label = feature_label_split(test_inactive)

        _print_elapsed('File 2 took {} minutes and {} seconds to read', start)

        #=========================================================================================#

        print('\nCREATING BITVECT DFs FOR TEST DATA...')
        test_active_bv_df = CreateBitVect(test_active_fp)
        test_inactive_bv_df = CreateBitVect(test_inactive_fp)
        print('\nTEST BITVECT DFs CREATED')

        test_total = len(test_active_bv_df)+len(test_inactive_bv_df)

        #=========================================================================================#

        # Calculate Tanimoto similarities
        # Slow process
        print('\nCALCULATING TANIMOTO SIMILARITIES...')

        # Set similarity threshold
        T_ls = [0.3]
        nBits = 10000

        # One entry per threshold. These are created once per target; they
        # used to be re-created inside the threshold loop, so with more than
        # one threshold the count columns no longer matched combined_df.
        indicator_frames = []
        sim_ls = []
        temp_ls = []
        train_count_ls = []
        test_count_ls = []

        for element in T_ls:

            # Calc_Similarity reads this module-level name inside the worker
            # processes, so it must be set before they are started.
            T_sim_threshold = float(element)

            print('\nNOW CALCULATING FOR THRESHOLD: {}'.format(element))
            print('\nNOW CALCULATING FOR TARGET: {}'.format(target))

            # Actives are matched with actives and inactives with inactives.
            # Training vs test first, then test vs training.
            start = time.time()

            sim_count = _run_similarity_jobs([
                (1, (train_active_bv_df,test_active_bv_df,train_active_label,test_active_label)),
                (2, (train_inactive_bv_df,test_inactive_bv_df,train_inactive_label,test_inactive_label)),
            ])
            sim_count2 = _run_similarity_jobs([
                (3, (test_active_bv_df,train_active_bv_df,test_active_label,train_active_label)),
                (4, (test_inactive_bv_df,train_inactive_bv_df,test_inactive_label,train_inactive_label)),
            ])

            print('\n#=====================================================================#')
            print('\n#=====================================================================#')
            print('\nTRAINING AND TEST TANIMOTO SIMILARITIES CALCULATED')
            _print_elapsed('Code took {} minutes and {} seconds', start)
            print('\n#=====================================================================#')
            print('\n#=====================================================================#')

            #=========================================================================================#
            # Indicators: percentage of molecules that found a similar,
            # same-label molecule in the other dataset
            print('\nCALCULATING TRAINING SIMILARITY INDICATOR...')
            train_indicator = sim_count / train_total * 100
            print('\nTRAINING SIMILARITY INDICATOR CALCULATED')

            print('\nCALCULATING TEST SIMILARITY INDICATOR...')
            test_indicator =  sim_count2 / test_total * 100
            print('\nTEST SIMILARITY INDICATOR CALCULATED')

            print('\nCALCULATING AVERAGE SIMILARITY INDICATOR...')
            average_indicator = (train_indicator + test_indicator) / 2

            indicator_df = pd.DataFrame([[train_indicator, test_indicator, average_indicator]],
                                        columns=['Train','Test','Average'])
            print(indicator_df)
            print('\nAVERAGE SIMILARITY INDICATOR CALCULATED')

            #=========================================================================================#

            print ('\nSAVING FILES...')
            start = time.time()
            _print_current_time()

            indicator_df.to_csv(_result_filename(target_dir, target + '_', nBits, T_sim_threshold))

            _print_elapsed('Files took {} minutes and {} seconds to save', start)
            print ('\n#=========================================================================#')

            # Record this threshold's row and its companion columns
            indicator_frames.append(indicator_df)
            sim_ls.append(T_sim_threshold)
            temp_ls.append(target)
            train_count_ls.append(train_total)
            test_count_ls.append(test_total)

        #=========================================================================================#
        # Collate all results for this target (one row per threshold). The
        # frames are combined directly instead of re-reading the CSVs just written.
        print('\nCOLLATING ALL FILES FOR TARGET {}...'.format(target))

        combined_df = pd.concat(indicator_frames,axis = 0)
        combined_df['Sim'] = sim_ls
        combined_df['Target'] = temp_ls
        combined_df['train_target count'] = train_count_ls
        combined_df['test_target count'] = test_count_ls
        print(combined_df)

        print('\nALL FILES COLLATED FOR TARGET {}'.format(target))
        #===========================================================================#
        # Process df and save

        print('\nSAVING DF FOR TARGET {}'.format(protein))
        print ('\nSAVING FILES...')
        start = time.time()
        _print_current_time()

        combined_dir = target_dir + 'Combined/'
        os.makedirs(combined_dir, exist_ok=True)
        combined_df.to_csv(_result_filename(combined_dir, target + '_Combined_', nBits, T_sim_threshold))

        if target_count == 0:
            all_df = combined_df
            target_count += 1
        else:
            all_df = pd.concat([all_df,combined_df],axis=0)

        _print_elapsed('Files took {} minutes and {} seconds to save', start)
        print ('\n#=========================================================================#')

        #===========================================================================================================================#
        # Save all_df in separate easy to access folder (combined df with all calculated targets for easy copying)
        # This is the file Checkpoint() reads when resuming.
        print('\nSAVING ALL RESULTS FILE...')
        all_results_dir = root_dir + '- All results/'
        os.makedirs(all_results_dir, exist_ok=True)
        all_df.to_csv(_result_filename(all_results_dir, 'all_df_', nBits, T_sim_threshold))
        print('\nALL RESULTS FILE SAVED')

print('\nFINISHED')



# For scrubbing datasets

In [None]:
"""
Created Jun 2021

Author: Marcus Wei How Wang
Code available at: https://github.com/Goodman-lab/AD-transferability
Please acknowledge the authors if using the code, whether partially or in full
"""
# For use with model vs other targets from Tim's code
# LATEST CODE UPDATED 20 MAR 2021
# Code for calculating applicability domain metric based on Tanimoto similarity

%%time
# Note: Google Colab disconnects and clears data after 12 hours of inactivity,
# but the code keeps running in the background if the runtime disconnects before the 12-hour mark.
# !pip install -I tensorflow
# !pip install -I keras
# Relevant imports
import numpy as np
import pandas as pd # uses pandas python module to view and analyse data

import time
from time import strftime, gmtime

from collections import Counter

import matplotlib.pyplot as plt

import itertools 

import os
from os import mkdir

from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw.MolDrawing import MolDrawing
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import MACCSkeys
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint

#=======================================================================================#
# LOAD THE TARGET LIST

# The CSV's 'Target' column names every target to process.
# (Alternative list kept by the author: 'Targets to calculate 5.csv'.)
filename = (
    '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    'Targets for optimisation.csv'
)

target_df = pd.read_csv(filename)[['Target']]
print(target_df)

error_ls = []

# Training targets to process in this run; shorten the list to run a subset.
train_target_ls = ['AChE','ADORA2A','AR','hERG','SERT']



for train_target in train_target_ls:
    sim_ls = []
    sim_ls.clear()

    mol_threshold_ls = []
    mol_threshold_ls.clear()

    temp_ls = []
    temp_ls.clear()
    target_count = 0

    # Read file 1 (training dataset)
    print ('\nReading file 1...')
    print ('Current time:')
    print(strftime("%H:%M:%S", gmtime())) 
    start = time.time()
        
    filename = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    filename = filename + str(train_target) + '/'
    filename = filename + str(train_target) + '_train_fingerprints ECFP4 10000.csv'
    train_fp_df = pd.read_csv(filename)

    train_fp_df = train_fp_df.sample(frac=1)
    train_fp_df = train_fp_df.reset_index(drop=True)

    #print (train_fp_df)
    print (train_fp_df.shape)

    train_fp = train_fp_df.iloc[:,0:10000]
    train_fp['combined'] = train_fp[train_fp.columns.tolist()].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
    train_fp = train_fp[['combined']]
    train_fp_active = train_fp_df.iloc[:,-1:]

    end = time.time()
    elapsed = end - start
    minutes = elapsed // 60
    seconds = elapsed - (minutes*60)
    print('File 1 took {} minutes and {} seconds to read'.format(minutes,seconds))

    for protein in range(0,len(target_df)):
        
        target = str(target_df.loc[protein]['Target'])
        print('For train_target: {}'.format(train_target))
        print('Processing target: {}'.format(protein))
        print('Processing target: {}'.format(target))
        
        # Create folders
        root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/'
        root_dir = root_dir + str(train_target) + '/'
        foldername = target
        if os.path.isdir(root_dir + foldername) == False:
            os.mkdir(root_dir + foldername)

        #if target != train_target:
        if True:

            # Read file 2 (test dataset)
            print ('\nReading file 2...')
            print ('Current time:')
            print(strftime("%H:%M:%S", gmtime())) 
            start = time.time()
                
            filename = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
            filename = filename + str(target) + '/'
            filename = filename + str(target) + '_test_fingerprints ECFP4 10000.csv'
            test_fp_df = pd.read_csv(filename)

            test_fp_df = test_fp_df.sample(frac=1)
            test_fp_df = test_fp_df.reset_index(drop=True)

            #print (test_fp_df)
            print (test_fp_df.shape)

            test_fp = test_fp_df.iloc[:,0:10000]
            test_fp['combined'] = test_fp[test_fp.columns.tolist()].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
            test_fp = test_fp[['combined']]
            test_fp_active = test_fp_df.iloc[:,-1:]

            end = time.time()
            elapsed = end - start
            minutes = elapsed // 60
            seconds = elapsed - (minutes*60)
            print('File 2 took {} minutes and {} seconds to read'.format(minutes,seconds))

            #=========================================================================================#

            start = time.time()
            print('\nPROCESSING AND SAMPLING DATA FROM BOTH FILES...')

            # IF SAMPLING
            # Get count of data points for both datasets
            # sample_size = 4000
            # training_sample_fp = train_fp[0:sample_size]
            # training_sample_active = train_fp_active[0:sample_size]
            # test_sample_fp = test_fp[0:sample_size]
            # test_sample_active = test_fp_active[0:sample_size]

            # IF NOT SAMPLING
            # Get count of data points for both datasets
            sample_size = 'all'
            training_sample_fp = train_fp
            training_sample_active = train_fp_active
            test_sample_fp = test_fp
            test_sample_active = test_fp_active

            train_count_ls = []
            test_count_ls = []

            print('\nFILES PROCESSED AND SAMPLED')
            #=========================================================================================#
            start = time.time()
            print('\nCREATING BITVECT DFs FOR BOTH TRAINING AND TEST SAMPLES...')
            train_bit_ls = []
            test_bit_ls = []

            train_bit_ls.clear()
            test_bit_ls.clear()

            for x in range(0,len(training_sample_fp)):
                training_bitvect = DataStructs.CreateFromBitString(training_sample_fp.iloc[x]['combined'])
                train_bit_ls.append(training_bitvect)
            for y in range(0,len(test_sample_fp)):
                test_bitvect = DataStructs.CreateFromBitString(test_sample_fp.iloc[y]['combined'])
                test_bit_ls.append(test_bitvect)

            train_bv_df = pd.DataFrame(train_bit_ls,columns = ['BV'])
            test_bv_df = pd.DataFrame(test_bit_ls,columns = ['BV'])

            print('\nBITVECT DFs CREATED')
            #=========================================================================================#

            print('\nCALCULATING TRAINING TANIMOTO SIMILARITIES...')

            # Set similarity thresholds and no. of similar molecules threshold
            T_ls = [0.2]
            mol_ls = [1]
            nBits = 10000

            for element in T_ls:
                
                T_sim_threshold = float(element)

                for ele in mol_ls:
                    print('\nNOW CALCULATING FOR THRESHOLD: {}'.format(element))
                    print('\nNOW CALCULATING FOR nMOL: {}'.format(ele))
                    print('\nNOW CALCULATING FOR TARGET: {}'.format(target))

                    sim_count = 0
                    indicator_ls = []
                    indicator_ls.clear()

                    train_extract_index = []
                    train_extract_index.clear()

                    mol_threshold = int(ele)

                    # Calculate Tanimoto similarity between both samples
                    for x in range(0,len(train_bv_df)):
                        train_bv = train_bv_df.iloc[x]['BV']
                        training_active_state = training_sample_active.iloc[x]['Binary Activity']
                        if x % 100 == 0:
                            print ('Current time:')
                            print(strftime("%H:%M:%S", gmtime()))
                            print('NOW CALCULATING SIMILARITIES FOR TRAINING INDEX {}'.format(x))


                        for y in range(0,len(test_bv_df)):
                            temp_count = 0
                            test_bv = test_bv_df.iloc[y]['BV']
                            sim = DataStructs.TanimotoSimilarity(train_bv,test_bv)
                            test_active_state = test_sample_active.iloc[y]['Binary Activity']

                            # Process sim per molecule and determine if molecule is similar to second dataset                            
                            if sim >= T_sim_threshold:
                                if test_active_state == training_active_state:
                                    temp_count += 1
                        
                                    if temp_count >= mol_threshold:
                                        sim_count += 1
                                        train_extract_index.append(x)
                                        break
                          
                    print('\nTRAINING TANIMOTO SIMILARITIES CALCULATED')

                    #=========================================================================================#
                    # Write out the training rows whose positions were collected in
                    # train_extract_index by the similarity loop above.
                    print('\nCREATING TRAIN EXTRACT_DF...')

                    train_extract_df = train_fp_df.iloc[train_extract_index]

                    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/scrubbed data/'
                    filename = (
                        f'{root_dir}{train_target}/'
                        f'{train_target}_training_data_{target}test_target_'
                        f'{nBits}bits_{T_sim_threshold}T_threshold_{mol_threshold}mol_sim_indicator.csv'
                    )
                    train_extract_df.to_csv(filename)

                    print('\nTRAIN EXTRACT_DF CREATED')

                    #=========================================================================================#
                    # Training indicator: share of training molecules counted in
                    # sim_count, expressed as a percentage.
                    print('\nCALCULATING TRAINING SIMILARITY INDICATOR...')

                    similar_fraction = sim_count / len(train_bv_df)
                    train_indicator = similar_fraction * 100
                    indicator_ls.append(train_indicator)

                    print('\nTRAINING SIMILARITY INDICATOR CALCULATED')
                    #=========================================================================================#

                    print('\nCALCULATING TEST TANIMOTO SIMILARITIES...')

                    # For each test molecule, count how many training molecules have a
                    # Tanimoto similarity >= T_sim_threshold AND the same binary activity.
                    # Once that count reaches mol_threshold the test molecule is flagged
                    # as similar and its position is recorded.
                    sim_count = 0
                    test_extract_index = []

                    # Pull the columns out once; row-wise .iloc[...] access inside the
                    # nested loop is needlessly slow.
                    test_bvs = list(test_bv_df['BV'])
                    test_states = list(test_sample_active['Binary Activity'])
                    train_bvs = list(train_bv_df['BV'])
                    train_states = list(training_sample_active['Binary Activity'])

                    for x in range(0, len(test_bvs)):
                        test_bv = test_bvs[x]
                        test_active_state = test_states[x]
                        if x % 100 == 0:
                            print ('Current time:')
                            print(strftime("%H:%M:%S", gmtime()))
                            # This loop walks the test set (the message previously said TRAINING).
                            print('NOW CALCULATING SIMILARITIES FOR TEST INDEX {}'.format(x))

                        # The match counter must be reset once per test molecule.
                        # Resetting it inside the inner loop (as before) meant it could
                        # never exceed 1, so any mol_threshold >= 2 was unreachable.
                        # NOTE(review): the training-side loop earlier in this cell resets
                        # temp_count inside its inner loop in the same way and needs the
                        # same change for the two indicators to be comparable.
                        temp_count = 0
                        for y in range(0, len(train_bvs)):
                            train_bv = train_bvs[y]
                            training_active_state = train_states[y]
                            sim = DataStructs.TanimotoSimilarity(train_bv, test_bv)

                            # Only sufficiently similar neighbours with matching activity count.
                            if sim >= T_sim_threshold and test_active_state == training_active_state:
                                temp_count += 1

                                if temp_count >= mol_threshold:
                                    sim_count += 1
                                    test_extract_index.append(x)
                                    # Enough neighbours found; no need to scan the rest.
                                    break

                    print('\nTEST TANIMOTO SIMILARITIES CALCULATED')

                    #=========================================================================================#
                    # Write out the test rows whose positions were collected in
                    # test_extract_index.
                    print('\nCREATING TEST EXTRACT_DF...')

                    test_extract_df = test_fp_df.iloc[test_extract_index]

                    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/scrubbed data/'
                    filename = (
                        f'{root_dir}{train_target}/'
                        f'{train_target}_test_data_{target}test_target_'
                        f'{nBits}bits_{T_sim_threshold}T_threshold_{mol_threshold}mol_sim_indicator.csv'
                    )
                    test_extract_df.to_csv(filename)

                    print('\nTEST EXTRACT_DF CREATED')

                    #=========================================================================================#
                    # Test indicator: share of test molecules counted in sim_count, as a percentage.
                    print('\nCALCULATING TEST SIMILARITY INDICATOR...')

                    similar_fraction = sim_count / len(test_bv_df)
                    test_indicator = similar_fraction * 100
                    indicator_ls.append(test_indicator)

                    print('\nTEST SIMILARITY INDICATOR CALCULATED')

                    #=========================================================================================#
                    # Average of the train and test indicators, then a one-row summary table.
                    print('\nCALCULATING AVERAGE SIMILARITY INDICATOR...')

                    average_indicator = (train_indicator + test_indicator) / 2
                    indicator_ls.append(average_indicator)

                    indicator_df = pd.DataFrame(indicator_ls).transpose()
                    indicator_df.columns = ['Train','Test','Average']

                    print(indicator_df)

                    print('\nAVERAGE SIMILARITY INDICATOR CALCULATED')

                    #=========================================================================================#
                    # Save the summary for this (T_sim_threshold, mol_threshold) pair and time the write.
                    print ('\nSAVING FILES...')
                    start = time.time()
                    print ('Current time:')
                    print(strftime("%H:%M:%S", gmtime()))

                    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/'
                    root_dir = f'{root_dir}{train_target}/'

                    filename = (
                        f'{root_dir}{target}/'
                        f'{target}_NEW_{sample_size}data_'
                        f'{nBits}bits_{T_sim_threshold}T_threshold_{mol_threshold}mol_sim_indicator.csv'
                    )
                    indicator_df.to_csv(filename)

                    end = time.time()
                    elapsed = end - start
                    minutes = elapsed // 60
                    seconds = elapsed - (minutes*60)
                    print(f'Files took {minutes} minutes and {seconds} seconds to save')
                    print ('\n#=========================================================================#')

                    # Record the sample sizes used for this parameter combination.
                    train_count_ls.append(len(training_sample_fp))
                    test_count_ls.append(len(test_sample_fp))

            # Collate all files/results
            # Re-read every per-parameter indicator file written above and stack them
            # into one table, remembering which thresholds/target each row belongs to.
            print('\nCOLLATING ALL FILES...')

            sim_ls = []
            mol_threshold_ls = []
            temp_ls = []
            collated_frames = []

            for element in T_ls:
                T_sim_threshold = float(element)

                for ele in mol_ls:
                    mol_threshold = int(ele)

                    # Read file
                    print ('\nReading file ...')
                    print ('Current time:')
                    print(strftime("%H:%M:%S", gmtime()))
                    start = time.time()

                    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/'
                    root_dir = root_dir + str(train_target) + '/'

                    filename = root_dir + str(target) + '/'
                    filename = filename + str(target) + '_' + 'NEW_' + str(sample_size) + 'data_'
                    filename = filename + str(nBits) + 'bits_' + str(T_sim_threshold) + 'T_threshold_' + str(mol_threshold) + 'mol_sim_indicator.csv'
                    file1 = pd.read_csv(filename)
                    # Drop the index column that to_csv() wrote alongside the data.
                    file1mod = file1.drop(['Unnamed: 0'], axis=1)

                    end = time.time()
                    elapsed = end - start
                    minutes = elapsed // 60
                    seconds = elapsed - (minutes*60)
                    print('File 1 took {} minutes and {} seconds to read'.format(minutes,seconds))

                    collated_frames.append(file1mod)
                    sim_ls.append(T_sim_threshold)
                    mol_threshold_ls.append(mol_threshold)
                    temp_ls.append(target)

            # Concatenate once instead of growing the frame inside the loop. With no
            # input files this raises immediately rather than silently reusing a
            # combined_df left over from a previous target.
            combined_df = pd.concat(collated_frames, axis=0)

            print('\nALL FILES COLLATED')
            #===========================================================================#
            # Process df and save

            print('\nSAVING DF FOR TARGET {}'.format(protein))
            combined_df['Sim'] = sim_ls
            combined_df['No. of mol'] = mol_threshold_ls
            combined_df['Target'] = temp_ls
            combined_df['train_target count'] = train_count_ls
            combined_df['test_target count'] = test_count_ls

            print(combined_df)

            print ('\nSAVING FILES...')
            start = time.time()
            print ('Current time:')
            print(strftime("%H:%M:%S", gmtime()))

            # Create folder in google drive
            root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/'
            root_dir = root_dir + str(train_target) + '/'

            # Create folders 
            foldername = str(target) + '/Combined'
            if os.path.isdir(root_dir + foldername) == False:
                os.mkdir(root_dir + foldername)

            filename = root_dir + str(target) + '/Combined/'
            filename = filename + str(target) + '_Combined_' + 'NEW_' + str(sample_size) + 'data_'
            filename = filename + str(nBits) + 'bits_' + str(T_sim_threshold) + 'T_threshold_' + str(mol_threshold) + 'mol_sim_indicator.csv'
            combined_df.to_csv(filename)
            
            if target_count == 0:

                #print('FIRST LINE')
                all_df = combined_df
                target_count += 1
            else:
                #print('NEXT LINES')
                all_df = pd.concat([all_df,combined_df],axis=0)

            end = time.time()
            elapsed = end - start
            minutes = elapsed // 60
            seconds = elapsed - (minutes*60)
            print('Files took {} minutes and {} seconds to save'.format(minutes,seconds))
            print ('\n#=========================================================================#')

            # Save all_df (combined df with all calculated targets for easy copying)
            print('\nSAVING ALL RESULTS FILE...')
            root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/- Scrubbing datasets/' + str(train_target) + '/'

            # Create folders 
            foldername = '- All results'
            if os.path.isdir(root_dir + foldername) == False:
                os.mkdir(root_dir + foldername)
 
            filename = root_dir + '- All results/' + 'all_df_' + 'NEW_' + str(sample_size) + 'data_'
            filename = filename + str(nBits) + 'bits_' + str(T_sim_threshold) + 'T_threshold_' + str(mol_threshold) + 'mol_sim_indicator.csv'
            all_df.to_csv(filename)
            print('\nALL RESULTS FILE SAVED')

print('\nFINISHED')



In [None]:
# -*- ChAIkeras.py -*-
"""
Created Oct 2019

author: Timothy E H Allen
"""
#%%

# Import the usual suspects

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow import keras
from tensorflow.keras import layers
from keras import regularizers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.utils import class_weight
import random

# DEFINE INPUTS FOR MODEL TRAINING

for runs in range(1,26):
    # Receptors whose test sets the trained model will be scored against.
    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    filename = f'{root_dir}Targets for optimisation.csv'
    receptor_df = pd.read_csv(filename)
    receptor_ls = list(receptor_df['Target'])
    #receptor_ls = ['AChE']

    # Result accumulators, filled once per test receptor further down:
    # list1 = receptor name, list2 = train accuracy,
    # list3 = validation accuracy, list4 = test accuracy.
    list1, list2, list3, list4 = [], [], [], []

    # Receptor whose (raw) training sample is used to fit the model.
    train_receptor = 'AChE'
    root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
    # Alternative input (scrubbed data):
    # filename = root_dir + '- Scrubbing datasets/scrubbed training and test datasets/'
    # filename = filename + str(receptor) + 'scrubbedtrainsample_1.csv'

    filename = f'{root_dir}- Scrubbing datasets/raw training and test sets/{train_receptor}trainsample_1.csv'
    input_data_training = filename

    # Fresh random seeds for each run: one for shuffling, one for the train/validation split.
    rng_1 = random.randrange(1,1000)
    rng_2 = random.randrange(1,1000)

    # Hyperparameters
    validation_proportion = 0.25   # fraction of the data held out for validation
    beta = 0.1                     # L2 regularisation strength
    neurons = 100                  # units per hidden layer
    hidden_layers = 2              # number of hidden layers (1, 2 or 3)
    LR = 0.001                     # learning rate
    epochs = 100                   # training epochs


    print("Welcome to ChAI")
    print("Dataset loading...")

    # Reading The Dataset

    def read_dataset(input_data):
        """Load a fingerprint CSV and return (features, encoded_labels).

        The file is printed after loading. Columns 1..10000 (by position) are
        returned as the feature matrix; the last column is taken as the label
        and passed through a LabelEncoder.
        """
        frame = pd.read_csv(input_data)
        print (frame)

        # Positional slice: skips column 0 and takes up to 10000 feature columns.
        feature_columns = frame.columns[1:10001]
        features = frame[feature_columns].values

        # Last column holds the class label; encode it to integer codes.
        raw_labels = frame[frame.columns[-1]]
        encoded_labels = LabelEncoder().fit(raw_labels).transform(raw_labels)

        return (features, encoded_labels)

    # Load the training fingerprints and integer-encoded labels.
    X, Y = read_dataset(input_data_training)

    # Shuffle the dataset
    X, Y = shuffle(X, Y, random_state=rng_1)

    # Convert the dataset into train and validation sets
    train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=validation_proportion, random_state=rng_2)

    # Balanced class weights to counter label imbalance.
    # - compute_class_weight is called with keywords: recent scikit-learn
    #   releases reject the positional form.
    # - Keras documents `class_weight` as a dict mapping class index -> weight,
    #   so the returned array is converted instead of being passed through raw.
    present_classes = np.unique(train_y)
    balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                         classes=present_classes,
                                                         y=train_y)
    class_weights = {int(label): float(weight)
                     for label, weight in zip(present_classes, balanced_weights)}

    #Define the model in keras

    print("Constructing model architecture")

    # Fail fast on an unsupported depth. Previously this only printed a message
    # and then either crashed on undefined `inputs`/`outputs` or silently reused
    # the graph left over from an earlier run.
    if hidden_layers not in (1, 2, 3):
        raise ValueError("Number of hidden layers outside this model scope, please choose 1, 2 or 3")

    # Fully connected network: 10000-wide input -> `hidden_layers` identical
    # ReLU layers (named dense_1, dense_2, ...) -> 2-way softmax.
    inputs = keras.Input(shape=(10000,), name='digits')
    x = inputs
    for layer_number in range(1, hidden_layers + 1):
        x = layers.Dense(neurons,
                         activation='relu',
                         kernel_initializer='random_uniform',
                         bias_initializer='zeros',
                         kernel_regularizer=regularizers.l2(beta),
                         name='dense_' + str(layer_number))(x)
    outputs = layers.Dense(2, activation='softmax', name='predictions')(x)

    model = keras.Model(inputs = inputs, outputs = outputs)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=LR),
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    print('Commencing model training...')
    history = model.fit(train_x, train_y,
                        batch_size=128,
                        epochs=epochs,
                        class_weight=class_weights,
                        # We pass some validation for
                        # monitoring validation loss and metrics
                        # at the end of each epoch
                        validation_data=(valid_x, valid_y))

    # The returned "history" object holds a record
    # of the loss values and metric values during training

    # Evaluate the model on the training and validation data
    train_results = model.evaluate(train_x, train_y, batch_size=128)
    validation_results = model.evaluate(valid_x, valid_y, batch_size=128)

    # Save the model

    # `count` stays 0 only for the first receptor, so train/validation
    # predictions are computed once and reused for the remaining receptors.
    count = 0
    for receptor in receptor_ls:
        # NOTE(review): the same fitted model is written once per test receptor,
        # under that receptor's name - confirm this is intended.
        model_path = f'{root_dir}- Scrubbing datasets/Models/{receptor}run{runs}_model.h5'
        model.save(model_path)
        print(f'Model saved to {model_path}')

        root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
        # Alternative input (scrubbed data):
        # filename = root_dir + '- Scrubbing datasets/scrubbed training and test datasets/'
        # filename = filename + str(receptor) + 'scrubbedtestsample_1.csv'

        # Raw test sample for the current receptor.
        input_data_test = f'{root_dir}- Scrubbing datasets/raw training and test sets/{receptor}testsample_1.csv'
        filename = input_data_test
        test_x, test_y = read_dataset(input_data_test)

        # Class probabilities -> predicted class index (argmax over the softmax outputs).
        if count == 0:
            pred_valid_y = model.predict(valid_x, verbose=1)
            pred_train_y = model.predict(train_x, verbose=1)
            pred_train_y_binary = pred_train_y.argmax(axis=1)
            pred_valid_y_binary = pred_valid_y.argmax(axis=1)

        pred_test_y = model.predict(test_x)
        pred_test_y_binary = pred_test_y.argmax(axis=1)

        # Calculate and display confusion matricies
        def get_accuracy(cm):
            """Return the overall accuracy for a square confusion matrix.

            Accuracy is the sum of the diagonal cells (correct predictions)
            divided by the sum of all cells. Unlike the earlier hard-coded
            2x2 version this also handles a 1x1 matrix (produced when only a
            single class occurs) and matrices with more than two classes.
            For a 2x2 matrix the result is unchanged.

            cm: square matrix indexable as cm[i][j] (nested lists or a numpy array).
            """
            correct = sum(cm[i][i] for i in range(len(cm)))
            total = sum(cell for row in cm for cell in row)

            return correct / total

        def manual_cm(true_y, pred_y):
            """Tally a binary confusion matrix by hand.

            Walks both label sequences by position and returns
            np.array([[TP, FP], [FN, TN]]), where 1 is the positive class and
            0 the negative class. Pairs containing any other value are ignored.
            """
            hits = 0           # true 1, predicted 1
            false_alarms = 0   # true 0, predicted 1
            misses = 0         # true 1, predicted 0
            rejections = 0     # true 0, predicted 0

            for position in range(len(true_y)):
                actual = true_y[position]
                if actual == 1:
                    predicted = pred_y[position]
                    if predicted == 1:
                        hits += 1
                    elif predicted == 0:
                        misses += 1
                elif actual == 0:
                    predicted = pred_y[position]
                    if predicted == 1:
                        false_alarms += 1
                    elif predicted == 0:
                        rejections += 1

            return np.array([[hits, false_alarms], [misses, rejections]])


        np.set_printoptions(precision=2)

        # Train/validation accuracies depend only on the model, so they are
        # computed for the first receptor and reused afterwards.
        if count == 0:
            cm = confusion_matrix(train_y, pred_train_y_binary)
            train_accuracy = get_accuracy(cm)

            cm = confusion_matrix(valid_y, pred_valid_y_binary)
            valid_accuracy = get_accuracy(cm)

            count += 1

        # Test accuracy is specific to the current receptor's test sample.
        cm = confusion_matrix(test_y, pred_test_y_binary)
        test_accuracy = get_accuracy(cm)

        # Append all values to lists
        list1.append(receptor)
        list2.append(train_accuracy)
        list3.append(valid_accuracy)
        list4.append(test_accuracy)

        # Rebuild the accuracy table from everything gathered so far in this run.
        accuracy_df = pd.DataFrame({
            'Receptor': list1,
            'Train_acc': list2,
            'Valid_acc': list3,
            'Test_acc': list4,
        })
        print(accuracy_df)

        # Overwrite this run's accuracy file after every receptor, so partial
        # results are on disk if the run stops early.
        root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
        filename = f'{root_dir}/- Scrubbing datasets/Accuracies/{train_receptor}_newaccuracies{runs}ECFP4 10000.csv'
        accuracy_df.to_csv(filename)

        print("END")

# Combine the per-run accuracy files of all runs into a single table.
root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
root_dir = root_dir + '/- Scrubbing datasets/Accuracies/'

# Read each run's file (dropping the index column written by to_csv) and
# concatenate once at the end, rather than growing the frame run by run.
run_frames = []
for runs in range(1,26):
    filename = root_dir + str(train_receptor) + '_newaccuracies' + str(runs) + 'ECFP4 10000.csv'
    run_frame = pd.read_csv(filename)
    run_frames.append(run_frame.drop(['Unnamed: 0'],axis=1))

combined = pd.concat(run_frames, ignore_index=True)

# Save combined file once, after all runs are loaded (it used to be printed
# and rewritten on every loop iteration).
print(combined)
filename = root_dir + str(train_receptor) + '_newaccuracies combined ECFP4 10000.csv'
combined.to_csv(filename)

# approx 5-10 min per model

## Count positive and negative labels in dataset

In [None]:
"""
Created Jun 2021

Author: Marcus Wei How Wang
Code available at: https://github.com/Goodman-lab/AD-transferability
Please acknowledge the authors if using the code, whether partially or in full
"""
# Import the usual suspects
import numpy as np
import pandas as pd


#====================================================================================#
receptor = 'SERT'

# --- Training set -------------------------------------------------------------
root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
filename = f'{root_dir}{receptor}/{receptor}_train_fingerprints ECFP4 10000.csv'
train_df = pd.read_csv(filename)

# The last column holds the label; tally the 1s and 0s.
labels = train_df.iloc[:,-1]
positive = sum(1 for value in labels if value == 1)
negative = sum(1 for value in labels if value == 0)

print('\nTraining dataset stats')
print(f'Positives: {positive}')
print(f'Negatives: {negative}')
print(f'Total: {len(labels)}')
# Differs from Total if any label is neither 0 nor 1.
print(f'Check Total: {positive + negative}')

# --- Test set -----------------------------------------------------------------
root_dir = '/content/drive/My Drive/Applicability domains/Similarities/Similarity indicator/Machine learning/'
filename = f'{root_dir}{receptor}/{receptor}_test_fingerprints ECFP4 10000.csv'
test_df = pd.read_csv(filename)

# The last column holds the label; tally the 1s and 0s.
labels = test_df.iloc[:,-1]
positive = sum(1 for value in labels if value == 1)
negative = sum(1 for value in labels if value == 0)

print('\nTest dataset stats')
print(f'Positives: {positive}')
print(f'Negatives: {negative}')
print(f'Total: {len(labels)}')
# Differs from Total if any label is neither 0 nor 1.
print(f'Check Total: {positive + negative}')