# Pool-downsample-split

This code pools (merges) the train and test datasets, takes a 10% stratified sample of the pooled data, and splits the downsampled dataset into new train and test datasets using an 80-20 split.

In [None]:
import os
import pandas as pd

# Load code for Splitting.
%run splitter.py

In [None]:
def pool_sets(train_set_path, test_set_path):
    """Read the train and test CSV files and stack them into one dataframe.

    Args:
        train_set_path: Location of the train CSV (anything ``pd.read_csv``
            accepts).
        test_set_path: Location of the test CSV (anything ``pd.read_csv``
            accepts).

    Returns:
        A dataframe with the train rows followed by the test rows and a
        fresh ``0..n-1`` index.
    """
    frames = [pd.read_csv(train_set_path), pd.read_csv(test_set_path)]
    # Each file is read with its own 0-based index. Renumber while pooling so
    # row labels are unique; otherwise label-based selection on the pooled
    # frame can match one row from each file.
    return pd.concat(frames, ignore_index=True)

def sample_stratified(df_all, frac):
    """Draw a stratified sample from ``df_all``.

    Builds a ``Splitter`` over the ``SMILES`` / ``VALUE`` columns in
    regression mode, asks it for a stratified split with ``fraction=frac``
    and returns the second of the two parts it produces.

    NOTE(review): presumably the second part is the one holding roughly
    ``frac`` of the rows -- confirm against ``splitter.py``.
    """
    splitter = Splitter(
        dataframe=df_all,
        obsCol="SMILES",
        respCol="VALUE",
        mode="regression",
    )
    _remainder, subset = splitter.split_stratified(fraction=frac)
    return subset

def split_stratified(df, frac):
    """Split ``df`` with the ``Splitter``'s stratified strategy.

    Builds a ``Splitter`` over the ``SMILES`` / ``VALUE`` columns in
    regression mode and returns the result of its
    ``split_stratified(fraction=frac)`` call unchanged. The caller in this
    file unpacks that result as a ``(train, test)`` pair.
    """
    splitter = Splitter(
        dataframe=df,
        obsCol="SMILES",
        respCol="VALUE",
        mode="regression",
    )
    parts = splitter.split_stratified(fraction=frac)
    return parts

def pool_sample_split(train_set_path, test_set_path, setname, output_dir="../downsampled-10-percent"):
    """Pool a train/test pair, downsample it, and write a new train/test split.

    The two input CSVs are pooled, a stratified sample is taken with
    ``frac=0.1``, and that sample is split again with ``frac=0.2``. Three
    files are written into ``output_dir``:

    * ``<setname>_train.csv`` and ``<setname>_test.csv`` -- the two parts of
      the final split, written without the index column;
    * ``<setname>_splitting.log`` -- row counts of the pooled, sampled, train
      and test frames.

    Args:
        train_set_path: Path of the original train CSV.
        test_set_path: Path of the original test CSV.
        setname: Prefix used for the three output file names.
        output_dir: Directory receiving the outputs; created if missing.
    """
    pooled = pool_sets(train_set_path, test_set_path)
    sampled = sample_stratified(pooled, frac=0.1)
    train_set, test_set = split_stratified(sampled, frac=0.2)

    # Neither DataFrame.to_csv nor open() creates missing directories, so make
    # sure the destination exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    train_set.to_csv(path_or_buf=os.path.join(output_dir, f"{setname}_train.csv"), index=False)
    test_set.to_csv(path_or_buf=os.path.join(output_dir, f"{setname}_test.csv"), index=False)

    log_path = os.path.join(output_dir, f"{setname}_splitting.log")
    with open(log_path, 'w', encoding="utf-8") as f:
        f.write(f"Obs. pooled: {len(pooled)}\n")
        f.write(f"Obs. sampled stratified (10%): {len(sampled)}\n")
        f.write(f"Obs. train (10%): {len(train_set)}\n")
        f.write(f"Obs. test (10%): {len(test_set)}\n")

In [None]:
# Run downsampling and splitting for every endpoint and each of its four sets.
# Inputs are read from ../data/<endpoint>_set<k>_train.csv and
# ../data/<endpoint>_set<k>_test.csv; outputs are named after <endpoint>_set<k>.

for endpoint in ("Clearance", "Permeability", "Solubility", "logD"):
    for set_number in range(1, 5):
        setname = f"{endpoint}_set{set_number}"
        pool_sample_split(
            f"../data/{setname}_train.csv",
            f"../data/{setname}_test.csv",
            setname,
        )