# Random Splitting for MMPs

In [None]:
import pandas as pd
import numpy as np
import random

from rdkit.Chem import PandasTools
from rdkit import Chem

# load code from the "Optuna_AZ" package that comes in handy here
%run splitter.py

In [None]:
# Input CSV files are read from `my_path`; the train/test CSV files are
# written under `outpath`. Both are used as plain string prefixes (file names
# are appended with `+`), so the trailing slash is required.
my_path, outpath = '../data-initial/', '../data/'

In [None]:
# Key:   label of a data set, used as the base name of the output files.
# Value: stem of the matching input file names found under `my_path`.
name_dict = dict(
    logD='ST000_logD',
    Solubility='ST000_Solubility',
    Permeability='ST000_Permeability',
    Clearance='ST000_Clearance_hMics',
)

In [None]:
def splitting(df, outpath, name, obsCol="SMILES", respCol="VALUE", fraction=0.2):
    """Split ``df`` randomly into a training and a test set and save both as CSV.

    Parameters
    ----------
    df
        Data set handed to ``Splitter`` as its ``dataframe`` argument.
    outpath : str
        Prefix of the output files. It is joined to ``name`` by plain string
        concatenation, so a directory has to end with a path separator.
    name : str
        Base name of the two output files.
    obsCol, respCol : str
        Column names forwarded unchanged to ``Splitter``.
    fraction : float
        Forwarded to ``Splitter.split_randomly``.
        NOTE(review): presumably the share of rows held out as test set -
        confirm against splitter.py.

    The sizes of both subsets are printed, then ``<outpath><name>_train.csv``
    and ``<outpath><name>_test.csv`` are written without the index column.
    Nothing is returned.
    """
    # the splitter is always set up in regression mode
    splitter = Splitter(dataframe=df, obsCol=obsCol, respCol=respCol, mode="regression")
    train_set, test_set = splitter.split_randomly(fraction=fraction)

    # report the size of each subset
    for label, subset in (("Train (random):", train_set), ("Test (random):", test_set)):
        print(label, len(subset))

    # write both subsets next to each other, sharing the same file prefix
    target_prefix = outpath + name
    for subset, suffix in ((train_set, '_train.csv'), (test_set, '_test.csv')):
        subset.to_csv(target_prefix + suffix, index=False)


In [None]:
# For every entry of `name_dict`, read the four input files that belong to it
# and write a random train/test split of each one (output suffixes
# _set1 ... _set4). df_1 ... df_4 stay bound to the frames read last.
for name, file_stem in name_dict.items():
    print('\n\n--->: ', name)

    # all input files of one entry share this prefix; only two columns are read
    source_prefix = my_path + file_stem
    wanted_columns = ['SMILES', 'VALUE']

    # data set 1
    print('\n---all cpds')
    df_1 = pd.read_csv(source_prefix + '_naa_allData.csv', usecols=wanted_columns)
    splitting(df_1, outpath, name + '_set1')

    # data set 2
    print('\n---all additive cpds')
    df_2 = pd.read_csv(source_prefix + '_naa_additive.csv', usecols=wanted_columns)
    splitting(df_2, outpath, name + '_set2')

    # data set 3
    print('\n---all mmp cpds')
    df_3 = pd.read_csv(source_prefix + '_mmp_all.csv', usecols=wanted_columns)
    splitting(df_3, outpath, name + '_set3')

    # data set 4
    print('\n---all NA mmp cpds')
    df_4 = pd.read_csv(source_prefix + '_mmp_NA.csv', usecols=wanted_columns)
    splitting(df_4, outpath, name + '_set4')
    