## 1. Libraries 📖

In [1]:
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.model_selection import train_test_split
np.random.seed([3,1415])

## 2. Scaffold split

Based on https://chainer-chemistry.readthedocs.io/en/v0.4.0/_modules/chainer_chemistry/dataset/splitters/scaffold_splitter.html

### 2.1 Example

In [2]:
def generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold of a molecule as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.
    include_chirality : bool, default False
        Forwarded to ``MurckoScaffoldSmiles`` as ``includeChirality``.

    Returns
    -------
    str
        The scaffold SMILES, or ``''`` when RDKit yields a falsy molecule
        (e.g. the input could not be parsed).
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        # No usable molecule: fall back to the empty scaffold.
        return ''
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=parsed, includeChirality=include_chirality
    )

In [4]:
def get_index(df, frac_valid=0.1, frac_test=0.2, include_chirality=True, seed=123,
              scaffold_fn=None):
    """Compute a scaffold-based train / validation / test split.

    Rows are grouped by the scaffold of their SMILES string; whole groups are
    then assigned, in a seeded random order, to validation, test or train so
    that no scaffold is shared between splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``smiles`` column.
    frac_valid, frac_test : float
        Upper bounds (as a fraction of ``len(df)``) for the validation and
        test splits. Groups that fit in neither go to train.
    include_chirality : bool
        Passed as the second argument to ``scaffold_fn``.
    seed : int
        Seed of the local ``numpy.random.RandomState`` used to order groups.
    scaffold_fn : callable, optional
        ``scaffold_fn(smiles, include_chirality) -> hashable`` grouping key.
        Defaults to ``generate_scaffold``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_index, valid_index, test_index)`` as integer arrays of row
        *positions* (suitable for ``.iloc``).
    """
    if scaffold_fn is None:
        scaffold_fn = generate_scaffold

    # Collect the row positions belonging to each scaffold.
    groups = defaultdict(list)
    for position, smiles in enumerate(df["smiles"]):
        groups[scaffold_fn(smiles, include_chirality)].append(position)
    scaffold_groups = list(groups.values())

    # Shuffle the *order* of the groups through an index permutation. Passing
    # the ragged list of groups to ``permutation`` directly would force NumPy
    # to build an array from unequal-length lists, which raises on recent
    # NumPy versions.
    rng = np.random.RandomState(seed)
    group_order = rng.permutation(len(scaffold_groups))

    n_total_valid = int(np.floor(frac_valid * len(df)))
    n_total_test = int(np.floor(frac_test * len(df)))

    train_index, valid_index, test_index = [], [], []
    for group_id in group_order:
        members = scaffold_groups[group_id]
        # Greedy fill: validation first, then test, everything else is train.
        if len(valid_index) + len(members) <= n_total_valid:
            valid_index.extend(members)
        elif len(test_index) + len(members) <= n_total_test:
            test_index.extend(members)
        else:
            train_index.extend(members)

    # Explicit integer dtype keeps empty splits usable as positional indexers.
    return (
        np.array(train_index, dtype=int),
        np.array(valid_index, dtype=int),
        np.array(test_index, dtype=int),
    )

In [None]:
# Load the cardiotoxicity SMILES table and derive the scaffold-split row positions.
df_smiles = pd.read_csv("data/processed/cardio_smiles_all.csv")
train_index, valid_index, test_index = get_index(df=df_smiles)

In [8]:
# Do not truncate columns when rendering the wide fingerprint frames.
pd.set_option("display.max_columns", None)
data_path = "data/processed/cardio_maccs_all.csv"
df_fp = pd.read_csv(filepath_or_buffer=data_path)
# Preview the first five rows.
df_fp.head(n=5)

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
0,6.19044,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,0
1,5.180456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2,5.79588,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
3,5.173925,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
4,5.180456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [11]:
# The split positions were computed on df_smiles, so they are only meaningful
# for df_fp if both tables describe the same molecules in the same row order.
assert len(df_fp) == len(df_smiles), "fingerprint and SMILES tables differ in length"
assert np.allclose(
    df_fp["pIC50"].to_numpy(), df_smiles["pIC50"].to_numpy(), equal_nan=True
), "fingerprint and SMILES tables are not row-aligned (pIC50 mismatch)"

train_df = df_fp.iloc[train_index]
valid_df = df_fp.iloc[valid_index]
test_df = df_fp.iloc[test_index]

# Every row must land in exactly one split.
assert len(train_df) + len(valid_df) + len(test_df) == len(df_fp)

In [12]:
# Display the scaffold-split training rows of the fingerprint table.
train_df

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
8590,5.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
10156,4.522879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,1,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2499,4.886057,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,1,1,1,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0
767,5.251812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
1799,5.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8924,6.699992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,0,0,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,1,1,0,1,1,1,1,0,1,1,0,1,1,0,1,0,1,1,1,1,1,1,0
2062,3.782516,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,1,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0
7209,4.522879,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,1,0,1,1,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,1,1,0
7531,5.619789,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [13]:
# Summary statistics of pIC50 in the scaffold-split training set.
train_df['pIC50'].describe()

count    7443.000000
mean        5.236329
std         0.970969
min         0.800000
25%         4.522879
50%         5.000000
75%         5.677781
max         9.585027
Name: pIC50, dtype: float64

In [14]:
# Summary statistics of pIC50 in the scaffold-split test set.
test_df['pIC50'].describe()

count    2126.000000
mean        5.218856
std         0.947098
min         1.000000
25%         4.522879
50%         5.000000
75%         5.691440
max         9.853872
Name: pIC50, dtype: float64

In [15]:
# Summary statistics of pIC50 in the scaffold-split validation set.
valid_df['pIC50'].describe()

count    1063.000000
mean        5.254179
std         0.814707
min         3.477556
25%         4.556743
50%         5.086186
75%         5.708872
max         8.769551
Name: pIC50, dtype: float64

In [11]:
# Persisting the scaffold split is opt-in: set the flag to True to (re)write
# the CSV files. Keeping the export as real code (instead of a string literal)
# lets linters and refactoring tools see it.
SAVE_SCAFFOLD_FP_SPLIT = False
if SAVE_SCAFFOLD_FP_SPLIT:
    train_df.to_csv('data/train/train.csv', index=False)
    test_df.to_csv('data/train/test.csv', index=False)
    valid_df.to_csv('data/train/valid.csv', index=False)

In [17]:
# Apply the same scaffold-split positions to the SMILES table.
train_df_s, valid_df_s, test_df_s = (
    df_smiles.iloc[positions]
    for positions in (train_index, valid_index, test_index)
)

In [14]:
# Persisting the scaffold-split SMILES tables is opt-in: set the flag to True
# to (re)write the CSV files.
SAVE_SCAFFOLD_SMILES_SPLIT = False
if SAVE_SCAFFOLD_SMILES_SPLIT:
    # NOTE(review): the disabled code exported a frame named `df`, which is not
    # defined in the visible notebook; df_smiles is presumably what was meant
    # — confirm before enabling.
    df_smiles.to_csv('data/train/smiles_processed_target.csv', index=False)
    train_df_s.to_csv('data/train/train_smiles.csv', index=False)
    test_df_s.to_csv('data/train/test_smiles.csv', index=False)
    valid_df_s.to_csv('data/train/valid_smiles.csv', index=False)

In [18]:
# Display the full SMILES table that the scaffold split was computed on.
df_smiles

Unnamed: 0,smiles,pIC50
0,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,6.190440
1,Cc1nc2ccccc2c(=O)n1-c1ccc(OCCCN2CCCCCC2)cc1,5.180456
2,Cc1cccc2c(=O)n(-c3ccc(OCCCN4CCCC4)cc3)c(C)nc12,5.795880
3,Cc1ccc2c(=O)n(-c3ccc(OCCCN4CCCC4)cc3)c(C)nc2c1,5.173925
4,Cc1cccc2nc(C)n(-c3ccc(OCCCN4CCCC4)cc3)c(=O)c12,5.180456
...,...,...
10627,CC1OC2(CCN(CCc3ccccc3F)CC2)CN(c2ncccc2C(F)(F)F...,5.490000
10628,Cn1nccc1C1CCCC1Oc1cc(F)c(S(=O)(=O)Nc2nccs2)cc1F,4.500000
10629,OC(c1cccnc1-c1cccc(Cl)c1)C(c1cccnc1)c1cccnc1,5.000000
10630,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,4.522879


## 3. Random split

In [2]:
def train_validate_test_split(df, train_percent=.7, validate_percent=.1, seed=None):
    """Randomly partition the rows of ``df`` into train / validation / test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are to be split; only its length is used.
    train_percent, validate_percent : float
        Fractions of the rows assigned to train and validation. The remainder
        forms the test split.
    seed : int or None
        Seed for a local ``numpy.random.RandomState``; ``None`` draws fresh
        entropy. The global NumPy random state is left untouched.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, validate, test)`` arrays of row *positions*, intended for
        ``.iloc``. For a frame with the default 0..n-1 index these coincide
        with the index labels.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    # Small tolerance so that sums such as 0.9 + 0.1 are not rejected by rounding.
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    m = len(df.index)
    # A local generator gives a reproducible permutation without reseeding
    # the process-wide np.random state as a side effect.
    rng = np.random.RandomState(seed)
    # Permute positions rather than index labels: the result is consumed
    # positionally, so it must stay valid for frames with a non-default index.
    perm = rng.permutation(m)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    return perm[:train_end], perm[train_end:validate_end], perm[validate_end:]

In [3]:
def get_dfs(df,train_idx, val_idx, test_idx):
    """Slice ``df`` positionally into train, validation and test frames.

    Each index argument is passed to ``df.iloc``; the three resulting frames
    are returned as a ``(train, validate, test)`` tuple.
    """
    return tuple(
        df.iloc[positions] for positions in (train_idx, val_idx, test_idx)
    )

### 3.1. Cardiotoxicity

In [4]:
# Load the four cardiotoxicity tables: MACCS, PubChem and Klekota-Roth
# fingerprints plus the SMILES table.
df_cardio_maccs, df_cardio_pubchem, df_cardio_klek, df_cardio_smiles = map(
    pd.read_csv,
    (
        "data/processed/cardio_maccs_all.csv",
        "data/processed/cardio_pubchem_all.csv",
        "data/processed/cardio_klek_all.csv",
        "data/processed/cardio_smiles_all.csv",
    ),
)

In [5]:
# Draw one random 70 / 10 / 20 split; it is reused for every cardio table.
cardio_train_idx, cardio_val_idx, cardio_test_idx = train_validate_test_split(
    df_cardio_maccs,
    train_percent=0.7,
    validate_percent=0.1,
    seed=1234,
)

In [6]:
# Inspect the training indices returned by the random split.
cardio_train_idx

array([ 1648,  2269, 10620, ...,  4328,  8487,  3425])

In [7]:
# The split was drawn from df_cardio_maccs; applying it positionally to the
# other tables is only valid if they all have the same number of rows.
assert len({
    len(df_cardio_maccs),
    len(df_cardio_pubchem),
    len(df_cardio_klek),
    len(df_cardio_smiles),
}) == 1, "cardio tables differ in row count; a shared positional split would misalign them"

cardio_maccs_train, cardio_maccs_valid, cardio_maccs_test = get_dfs(
    df_cardio_maccs, cardio_train_idx, cardio_val_idx, cardio_test_idx)
cardio_pubchem_train, cardio_pubchem_valid, cardio_pubchem_test = get_dfs(
    df_cardio_pubchem, cardio_train_idx, cardio_val_idx, cardio_test_idx)
cardio_klek_train, cardio_klek_valid, cardio_klek_test = get_dfs(
    df_cardio_klek, cardio_train_idx, cardio_val_idx, cardio_test_idx)
cardio_smiles_train, cardio_smiles_valid, cardio_smiles_test = get_dfs(
    df_cardio_smiles, cardio_train_idx, cardio_val_idx, cardio_test_idx)

In [8]:
# Display the randomly split MACCS training frame.
cardio_maccs_train

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
1648,5.000000,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2269,4.537602,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,0
10620,4.522879,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
5217,7.869666,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
9435,4.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4914,5.408935,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
10563,5.481486,0,0,0,0,0,0,0,1,0,...,1,1,1,0,1,0,1,1,1,0
4328,5.241088,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
8487,4.795880,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0


In [9]:
# Display the randomly split MACCS test frame.
cardio_maccs_test

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
5035,4.397940,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1976,5.823909,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
5215,4.906578,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
8787,5.346787,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
2207,4.481486,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,5.366532,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,1,1,0
7540,4.958607,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
7221,7.522879,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1318,5.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0


In [10]:
# Display the randomly split MACCS validation frame.
cardio_maccs_valid

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
3309,5.000000,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
2054,5.318759,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
9809,4.769551,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1248,5.366532,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1288,6.142065,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,4.886057,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4105,5.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4274,4.823909,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
10475,5.031517,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [11]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing the twelve split files.
Path('data/train/cardio').mkdir(parents=True, exist_ok=True)

# MACCS fingerprints
cardio_maccs_train.to_csv('data/train/cardio/cardio_maccs_train.csv', index=False)
cardio_maccs_valid.to_csv('data/train/cardio/cardio_maccs_valid.csv', index=False)
cardio_maccs_test.to_csv('data/train/cardio/cardio_maccs_test.csv', index=False)

# PubChem fingerprints
cardio_pubchem_train.to_csv('data/train/cardio/cardio_pubchem_train.csv', index=False)
cardio_pubchem_valid.to_csv('data/train/cardio/cardio_pubchem_valid.csv', index=False)
cardio_pubchem_test.to_csv('data/train/cardio/cardio_pubchem_test.csv', index=False)

# Klekota-Roth fingerprints
cardio_klek_train.to_csv('data/train/cardio/cardio_klek_train.csv', index=False)
cardio_klek_valid.to_csv('data/train/cardio/cardio_klek_valid.csv', index=False)
cardio_klek_test.to_csv('data/train/cardio/cardio_klek_test.csv', index=False)

# SMILES strings
cardio_smiles_train.to_csv('data/train/cardio/cardio_smiles_train.csv', index=False)
cardio_smiles_valid.to_csv('data/train/cardio/cardio_smiles_valid.csv', index=False)
cardio_smiles_test.to_csv('data/train/cardio/cardio_smiles_test.csv', index=False)

### 3.2. Hepatotoxicity

In [74]:
# Hepatotoxicity, ALT endpoint: three fingerprint tables plus SMILES.
df_hepato_alt_maccs, df_hepato_alt_pubchem, df_hepato_alt_klek, df_hepato_alt_smiles = map(
    pd.read_csv,
    (
        "data/processed/hepato_alt_maccs_all.csv",
        "data/processed/hepato_alt_pubchem_all.csv",
        "data/processed/hepato_alt_klek_all.csv",
        "data/processed/hepato_alt_smiles_all.csv",
    ),
)

# Hepatotoxicity, pTD50 endpoint: same four representations.
df_hepato_td_maccs, df_hepato_td_pubchem, df_hepato_td_klek, df_hepato_td_smiles = map(
    pd.read_csv,
    (
        "data/processed/hepato_ptd50_maccs_all.csv",
        "data/processed/hepato_ptd50_pubchem_all.csv",
        "data/processed/hepato_ptd50_klek_all.csv",
        "data/processed/hepato_ptd50_smiles_all.csv",
    ),
)

In [75]:
# One random 70 / 10 / 20 split per hepatotoxicity endpoint, same seed for both.
hepato_alt_train_idx, hepato_alt_val_idx, hepato_alt_test_idx = train_validate_test_split(
    df_hepato_alt_maccs,
    train_percent=0.7,
    validate_percent=0.1,
    seed=1234,
)
hepato_td_train_idx, hepato_td_val_idx, hepato_td_test_idx = train_validate_test_split(
    df_hepato_td_maccs,
    train_percent=0.7,
    validate_percent=0.1,
    seed=1234,
)

In [76]:
# Inspect the training indices drawn for the ALT endpoint.
hepato_alt_train_idx

array([63, 36, 54, 62, 78, 85, 55, 57, 44, 92, 29, 40, 33, 61, 39, 59,  1,
       56, 71,  9, 79, 27, 66, 72, 96, 48, 35, 74,  4, 64, 10, 82, 91, 94,
       70,  7, 75, 21, 18, 68, 84, 22,  6,  8, 41, 16, 45, 20, 25, 51, 77,
       31, 90,  5, 81, 32, 52, 13, 89, 17, 28, 46, 86, 42, 60, 14, 65, 12])

In [77]:
# Each endpoint's split was drawn from its MACCS table; the other tables of the
# same endpoint must have the same number of rows for the positions to line up.
assert len({
    len(df_hepato_alt_maccs),
    len(df_hepato_alt_pubchem),
    len(df_hepato_alt_klek),
    len(df_hepato_alt_smiles),
}) == 1, "hepato ALT tables differ in row count; a shared positional split would misalign them"
assert len({
    len(df_hepato_td_maccs),
    len(df_hepato_td_pubchem),
    len(df_hepato_td_klek),
    len(df_hepato_td_smiles),
}) == 1, "hepato pTD50 tables differ in row count; a shared positional split would misalign them"

hepato_alt_maccs_train, hepato_alt_maccs_valid, hepato_alt_maccs_test = get_dfs(
    df_hepato_alt_maccs, hepato_alt_train_idx, hepato_alt_val_idx, hepato_alt_test_idx)
hepato_alt_pubchem_train, hepato_alt_pubchem_valid, hepato_alt_pubchem_test = get_dfs(
    df_hepato_alt_pubchem, hepato_alt_train_idx, hepato_alt_val_idx, hepato_alt_test_idx)
hepato_alt_klek_train, hepato_alt_klek_valid, hepato_alt_klek_test = get_dfs(
    df_hepato_alt_klek, hepato_alt_train_idx, hepato_alt_val_idx, hepato_alt_test_idx)
hepato_alt_smiles_train, hepato_alt_smiles_valid, hepato_alt_smiles_test = get_dfs(
    df_hepato_alt_smiles, hepato_alt_train_idx, hepato_alt_val_idx, hepato_alt_test_idx)

hepato_td_maccs_train, hepato_td_maccs_valid, hepato_td_maccs_test = get_dfs(
    df_hepato_td_maccs, hepato_td_train_idx, hepato_td_val_idx, hepato_td_test_idx)
hepato_td_pubchem_train, hepato_td_pubchem_valid, hepato_td_pubchem_test = get_dfs(
    df_hepato_td_pubchem, hepato_td_train_idx, hepato_td_val_idx, hepato_td_test_idx)
hepato_td_klek_train, hepato_td_klek_valid, hepato_td_klek_test = get_dfs(
    df_hepato_td_klek, hepato_td_train_idx, hepato_td_val_idx, hepato_td_test_idx)
hepato_td_smiles_train, hepato_td_smiles_valid, hepato_td_smiles_test = get_dfs(
    df_hepato_td_smiles, hepato_td_train_idx, hepato_td_val_idx, hepato_td_test_idx)

In [78]:
# Display the ALT MACCS training frame.
hepato_alt_maccs_train

Unnamed: 0,ALT,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
63,36.60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
36,1.51,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
54,74.90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,1,1,0,1,0,1,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
62,21.60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
78,30.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,61.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,0,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0
60,37.80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0
14,273.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0
65,31.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,0,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0


In [79]:
# Display the ALT MACCS test frame.
hepato_alt_maccs_test

Unnamed: 0,ALT,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
93,38.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0
50,20.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,1,1,0,0,0,1,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,1,1,1,0,1,1,0,0,0,1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,1,1,1,1,0,1,1,0,1,1,1,1,1,0
97,60.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,0
73,41.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,1,1,1,1,0,0,1,1,0,1,1,1,0,0,1,0,1,1,0,1,1,1,1,0
80,45.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0
69,57.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,0,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0
58,33.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0
88,20.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,0
87,49.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,0
43,44.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0


In [80]:
# Display the ALT MACCS validation frame.
hepato_alt_maccs_valid

Unnamed: 0,ALT,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
19,57.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0,1,1,1,1,0,1,1,0,0,1,0,1,1,1,0,1,0,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,1,1,1,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,0
2,2.92,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
3,3.16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,1,1,1,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0
0,32.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0
11,28.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0
67,50.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0
95,53.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,1,1,1,0
34,30.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
37,1.89,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,1,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,1,1,1,0,0,0,1,1,1,0,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0


In [81]:
# Write every hepato ALT split (maccs, pubchem, klek, smiles x train/valid/test) to CSV.
# index=False keeps the DataFrame index out of the saved files.
for _split_df, _csv_path in (
    (hepato_alt_maccs_train, 'data/train/hepato/hepato_alt_maccs_train.csv'),
    (hepato_alt_maccs_valid, 'data/train/hepato/hepato_alt_maccs_valid.csv'),
    (hepato_alt_maccs_test, 'data/train/hepato/hepato_alt_maccs_test.csv'),
    (hepato_alt_pubchem_train, 'data/train/hepato/hepato_alt_pubchem_train.csv'),
    (hepato_alt_pubchem_valid, 'data/train/hepato/hepato_alt_pubchem_valid.csv'),
    (hepato_alt_pubchem_test, 'data/train/hepato/hepato_alt_pubchem_test.csv'),
    (hepato_alt_klek_train, 'data/train/hepato/hepato_alt_klek_train.csv'),
    (hepato_alt_klek_valid, 'data/train/hepato/hepato_alt_klek_valid.csv'),
    (hepato_alt_klek_test, 'data/train/hepato/hepato_alt_klek_test.csv'),
    (hepato_alt_smiles_train, 'data/train/hepato/hepato_alt_smiles_train.csv'),
    (hepato_alt_smiles_valid, 'data/train/hepato/hepato_alt_smiles_valid.csv'),
    (hepato_alt_smiles_test, 'data/train/hepato/hepato_alt_smiles_test.csv'),
):
    _split_df.to_csv(_csv_path, index=False)

In [82]:
# Write every hepato TD split (maccs, pubchem, klek, smiles x train/valid/test) to CSV.
# index=False keeps the DataFrame index out of the saved files.
for _split_df, _csv_path in (
    (hepato_td_maccs_train, 'data/train/hepato/hepato_td_maccs_train.csv'),
    (hepato_td_maccs_valid, 'data/train/hepato/hepato_td_maccs_valid.csv'),
    (hepato_td_maccs_test, 'data/train/hepato/hepato_td_maccs_test.csv'),
    (hepato_td_pubchem_train, 'data/train/hepato/hepato_td_pubchem_train.csv'),
    (hepato_td_pubchem_valid, 'data/train/hepato/hepato_td_pubchem_valid.csv'),
    (hepato_td_pubchem_test, 'data/train/hepato/hepato_td_pubchem_test.csv'),
    (hepato_td_klek_train, 'data/train/hepato/hepato_td_klek_train.csv'),
    (hepato_td_klek_valid, 'data/train/hepato/hepato_td_klek_valid.csv'),
    (hepato_td_klek_test, 'data/train/hepato/hepato_td_klek_test.csv'),
    (hepato_td_smiles_train, 'data/train/hepato/hepato_td_smiles_train.csv'),
    (hepato_td_smiles_valid, 'data/train/hepato/hepato_td_smiles_valid.csv'),
    (hepato_td_smiles_test, 'data/train/hepato/hepato_td_smiles_test.csv'),
):
    _split_df.to_csv(_csv_path, index=False)

### 3.3. Genotoxicity

In [83]:
# Load the four genotoxicity tables (maccs, pubchem, klek, smiles) in one pass;
# the files are read in the order listed and bound to the matching names.
(df_geno_maccs, df_geno_pubchem, df_geno_klek, df_geno_smiles) = map(
    pd.read_csv,
    (
        "data/processed/geno_maccs_all.csv",
        "data/processed/geno_pubchem_all.csv",
        "data/processed/geno_klek_all.csv",
        "data/processed/geno_smiles_all.csv",
    ),
)

In [84]:
# Derive train/valid/test indices from the MACCS table with a fixed seed.
# NOTE(review): these indices are reused for the pubchem/klek/smiles tables below,
# which presumes all four files share row order and length - confirm.
(geno_train_idx, geno_val_idx, geno_test_idx) = train_validate_test_split(
    df_geno_maccs,
    train_percent=.7,
    validate_percent=.1,
    seed=1234,
)

In [85]:
# Show the training indices returned by train_validate_test_split as a quick sanity check.
geno_train_idx

array([ 7461, 24836,  3667, ..., 12429, 24258, 11822])

In [86]:
# Apply the shared index arrays to each genotoxicity table (maccs, pubchem, klek, smiles);
# each get_dfs result is unpacked as (train, valid, test).
(
    (geno_maccs_train, geno_maccs_valid, geno_maccs_test),
    (geno_pubchem_train, geno_pubchem_valid, geno_pubchem_test),
    (geno_klek_train, geno_klek_valid, geno_klek_test),
    (geno_smiles_train, geno_smiles_valid, geno_smiles_test),
) = (
    get_dfs(table, geno_train_idx, geno_val_idx, geno_test_idx)
    for table in (df_geno_maccs, df_geno_pubchem, df_geno_klek, df_geno_smiles)
)

In [87]:
# Preview the MACCS training split returned by get_dfs.
geno_maccs_train

Unnamed: 0,genotoxicity,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
7461,5.350003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0,1,0,1,1,0,1,1,1,1,1,1,1,0
24836,4.786389,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,1,1,0,1,0,0,1,0,1,1,1,1,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0
3667,5.386391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0
24382,4.750000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,1,1,0
8071,4.486391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,4.636213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,0,0,1,1,1,1,1,1,0,1,0,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
22682,5.086213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,1,1,1,1,0,1,1,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,1,1,1,0,1,0,1,1,1,0,1,1,0
12429,4.486391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
24258,4.736390,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,1,0,1,0,1,0,0,0,0,1,1,1,1,0,1,0,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0


In [88]:
# Preview the MACCS test split returned by get_dfs.
geno_maccs_test

Unnamed: 0,genotoxicity,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
3497,6.536256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,0,0,0,1,1,0,1,0,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
4691,4.686213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,1,0,1,1,0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,1,1,0
10584,5.186392,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,1,0
9536,4.636390,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1,1,1,0,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0
22511,4.686213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1,0,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,5.736388,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,1,1,1,1,0,1,1,1,0,0,1,0,1,1,1,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0
17048,4.636213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0
23924,6.286174,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
23605,4.486391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,0


In [89]:
# Preview the MACCS validation split returned by get_dfs.
geno_maccs_valid

Unnamed: 0,genotoxicity,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
12922,4.936389,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,1,0,1,1,1,1,0
537,4.736212,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,1,0,1,1,1,0,0,1,0,0,0,0,0,1,0,1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0
10678,4.786389,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,1,1,1,0,0,0,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0
21777,5.436388,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,1,1,0,0,1,1,0,1,0,0,0,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,1,1,1,0,0,0,1,1,1,0,1,1,0,0,1,0,1,1,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,1,1,0,0,1,1,0,0,1,1,1,1,1,0,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0
1203,4.636213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,1,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,4.550000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,1,1,0,1,0,1,1,1,0,1,1,0
11837,4.949999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,0
5298,5.586214,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0
6119,5.586399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [90]:
# Write every genotoxicity split (maccs, pubchem, klek, smiles x train/valid/test) to CSV.
# index=False keeps the DataFrame index out of the saved files.
for _split_df, _csv_path in (
    (geno_maccs_train, 'data/train/geno/geno_maccs_train.csv'),
    (geno_maccs_valid, 'data/train/geno/geno_maccs_valid.csv'),
    (geno_maccs_test, 'data/train/geno/geno_maccs_test.csv'),
    (geno_pubchem_train, 'data/train/geno/geno_pubchem_train.csv'),
    (geno_pubchem_valid, 'data/train/geno/geno_pubchem_valid.csv'),
    (geno_pubchem_test, 'data/train/geno/geno_pubchem_test.csv'),
    (geno_klek_train, 'data/train/geno/geno_klek_train.csv'),
    (geno_klek_valid, 'data/train/geno/geno_klek_valid.csv'),
    (geno_klek_test, 'data/train/geno/geno_klek_test.csv'),
    (geno_smiles_train, 'data/train/geno/geno_smiles_train.csv'),
    (geno_smiles_valid, 'data/train/geno/geno_smiles_valid.csv'),
    (geno_smiles_test, 'data/train/geno/geno_smiles_test.csv'),
):
    _split_df.to_csv(_csv_path, index=False)

### 3.4. Caco-2 permeability

In [12]:
# Load the four Caco-2 tables (maccs, pubchem, klek, smiles) in one pass;
# the files are read in the order listed and bound to the matching names.
(df_caco_maccs, df_caco_pubchem, df_caco_klek, df_caco_smiles) = map(
    pd.read_csv,
    (
        "data/processed/caco_maccs_all.csv",
        "data/processed/caco_pubchem_all.csv",
        "data/processed/caco_klek_all.csv",
        "data/processed/caco_smiles_all.csv",
    ),
)

In [13]:
# Derive train/valid/test indices from the MACCS table with a fixed seed.
# NOTE(review): these indices are reused for the pubchem/klek/smiles tables below,
# which presumes all four files share row order and length - confirm.
(caco_train_idx, caco_val_idx, caco_test_idx) = train_validate_test_split(
    df_caco_maccs,
    train_percent=.7,
    validate_percent=.1,
    seed=1234,
)

In [14]:
# Show the training indices returned by train_validate_test_split as a quick sanity check.
caco_train_idx

array([2609, 1898, 1196, ..., 1213,   80, 1295])

In [15]:
# Apply the shared index arrays to each Caco-2 table (maccs, pubchem, klek, smiles);
# each get_dfs result is unpacked as (train, valid, test).
(
    (caco_maccs_train, caco_maccs_valid, caco_maccs_test),
    (caco_pubchem_train, caco_pubchem_valid, caco_pubchem_test),
    (caco_klek_train, caco_klek_valid, caco_klek_test),
    (caco_smiles_train, caco_smiles_valid, caco_smiles_test),
) = (
    get_dfs(table, caco_train_idx, caco_val_idx, caco_test_idx)
    for table in (df_caco_maccs, df_caco_pubchem, df_caco_klek, df_caco_smiles)
)

In [16]:
# Preview the MACCS training split returned by get_dfs.
caco_maccs_train

Unnamed: 0,permeability,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
2609,3.14,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1898,5.48,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
1196,70.00,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
83,0.10,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
1510,1.47,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,1.91,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3260,17.00,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1213,0.10,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
80,5.40,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [17]:
# Preview the MACCS test split returned by get_dfs.
caco_maccs_test

Unnamed: 0,permeability,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
1194,22.000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2251,0.960,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
1274,0.700,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,0
1427,271.000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1840,10.000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3276,2.100,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3125,0.053,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1318,10.250,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
723,17.000,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [18]:
# Preview the MACCS validation split returned by get_dfs.
caco_maccs_valid

Unnamed: 0,permeability,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
2430,0.076,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
206,0.500,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2345,27.000,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,1,0
1581,3.400,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2831,33.110,0,0,0,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2539,13.000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
968,12.000,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
2144,0.800,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1036,18.300,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [98]:
# Write every Caco-2 split (maccs, pubchem, klek, smiles x train/valid/test) to CSV.
# index=False keeps the DataFrame index out of the saved files.
for _split_df, _csv_path in (
    (caco_maccs_train, 'data/train/caco/caco_maccs_train.csv'),
    (caco_maccs_valid, 'data/train/caco/caco_maccs_valid.csv'),
    (caco_maccs_test, 'data/train/caco/caco_maccs_test.csv'),
    (caco_pubchem_train, 'data/train/caco/caco_pubchem_train.csv'),
    (caco_pubchem_valid, 'data/train/caco/caco_pubchem_valid.csv'),
    (caco_pubchem_test, 'data/train/caco/caco_pubchem_test.csv'),
    (caco_klek_train, 'data/train/caco/caco_klek_train.csv'),
    (caco_klek_valid, 'data/train/caco/caco_klek_valid.csv'),
    (caco_klek_test, 'data/train/caco/caco_klek_test.csv'),
    (caco_smiles_train, 'data/train/caco/caco_smiles_train.csv'),
    (caco_smiles_valid, 'data/train/caco/caco_smiles_valid.csv'),
    (caco_smiles_test, 'data/train/caco/caco_smiles_test.csv'),
):
    _split_df.to_csv(_csv_path, index=False)

### 3.5. Plasma protein binding

In [20]:
# Load the four protein-binding tables (maccs, pubchem, klek, smiles) in one pass;
# the files are read in the order listed and bound to the matching names.
(df_protein_maccs, df_protein_pubchem, df_protein_klek, df_protein_smiles) = map(
    pd.read_csv,
    (
        "data/processed/protein_maccs_all.csv",
        "data/processed/protein_pubchem_all.csv",
        "data/processed/protein_klek_all.csv",
        "data/processed/protein_smiles_all.csv",
    ),
)

In [21]:
# Derive train/valid/test indices from the MACCS table with a fixed seed.
# NOTE(review): these indices are reused for the pubchem/klek/smiles tables below,
# which presumes all four files share row order and length - confirm.
(protein_train_idx, protein_val_idx, protein_test_idx) = train_validate_test_split(
    df_protein_maccs,
    train_percent=.7,
    validate_percent=.1,
    seed=1234,
)

In [22]:
# Show the training indices returned by train_validate_test_split as a quick sanity check.
protein_train_idx

array([2079,  567, 1946, ..., 2027, 1271,  764])

In [23]:
# Apply the shared index arrays to each protein-binding table (maccs, pubchem, klek, smiles);
# each get_dfs result is unpacked as (train, valid, test).
(
    (protein_maccs_train, protein_maccs_valid, protein_maccs_test),
    (protein_pubchem_train, protein_pubchem_valid, protein_pubchem_test),
    (protein_klek_train, protein_klek_valid, protein_klek_test),
    (protein_smiles_train, protein_smiles_valid, protein_smiles_test),
) = (
    get_dfs(table, protein_train_idx, protein_val_idx, protein_test_idx)
    for table in (df_protein_maccs, df_protein_pubchem, df_protein_klek, df_protein_smiles)
)

In [24]:
# Preview the MACCS training split returned by get_dfs.
protein_maccs_train

Unnamed: 0,protein_plasma_binding,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
2079,90.0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
567,98.7,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1946,99.5,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
507,98.0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,0
462,97.0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,94.0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
2510,89.0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2027,82.0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,1,1,1,0
1271,83.0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [25]:
# Preview the MACCS test split returned by get_dfs.
protein_maccs_test

Unnamed: 0,protein_plasma_binding,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
1513,93.0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1498,96.0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
945,91.4,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1883,99.9,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1240,86.0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,99.9,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2041,99.0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
664,99.2,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1318,94.0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [26]:
# Preview the MACCS validation split returned by get_dfs.
protein_maccs_valid

Unnamed: 0,protein_plasma_binding,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
1350,23.00,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1247,96.00,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
1334,98.39,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2583,85.20,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2082,64.00,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2067,97.80,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
278,99.00,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1262,100.00,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
886,63.90,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0


In [28]:
# Write every protein-binding split (maccs, pubchem, klek, smiles x train/valid/test) to CSV.
# index=False keeps the DataFrame index out of the saved files.
for _split_df, _csv_path in (
    (protein_maccs_train, 'data/train/protein/protein_maccs_train.csv'),
    (protein_maccs_valid, 'data/train/protein/protein_maccs_valid.csv'),
    (protein_maccs_test, 'data/train/protein/protein_maccs_test.csv'),
    (protein_pubchem_train, 'data/train/protein/protein_pubchem_train.csv'),
    (protein_pubchem_valid, 'data/train/protein/protein_pubchem_valid.csv'),
    (protein_pubchem_test, 'data/train/protein/protein_pubchem_test.csv'),
    (protein_klek_train, 'data/train/protein/protein_klek_train.csv'),
    (protein_klek_valid, 'data/train/protein/protein_klek_valid.csv'),
    (protein_klek_test, 'data/train/protein/protein_klek_test.csv'),
    (protein_smiles_train, 'data/train/protein/protein_smiles_train.csv'),
    (protein_smiles_valid, 'data/train/protein/protein_smiles_valid.csv'),
    (protein_smiles_test, 'data/train/protein/protein_smiles_test.csv'),
):
    _split_df.to_csv(_csv_path, index=False)

### 3.6. Solubility

In [4]:
# Load the four processed solubility tables (MACCS, PubChem, Klekota-Roth
# fingerprints and SMILES) in one pass; the generator yields the frames in the
# same order as the names on the left-hand side.
df_solub_maccs, df_solub_pubchem, df_solub_klek, df_solub_smiles = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/processed/solubility_maccs_all.csv",
        "data/processed/solubility_pubchem_all.csv",
        "data/processed/solubility_klek_all.csv",
        "data/processed/solubility_smiles_all.csv",
    )
)

In [5]:
# Compute the train / validation / test indices once, from the MACCS frame,
# with a fixed seed; the same three index arrays are reused for every
# solubility representation further down.
solub_train_idx, solub_val_idx, solub_test_idx = train_validate_test_split(
    df_solub_maccs,
    train_percent=0.7,
    validate_percent=0.1,
    seed=1234,
)

In [6]:
# Inspect the training indices returned by the split (shown as a NumPy array).
solub_train_idx

array([  77, 1212,  156, ..., 1334,  502, 1390])

In [7]:
# Slice each solubility representation with the one shared
# (train, valid, test) index triple.
solub_split_idx = (solub_train_idx, solub_val_idx, solub_test_idx)

solub_maccs_train, solub_maccs_valid, solub_maccs_test = get_dfs(
    df_solub_maccs, *solub_split_idx
)
solub_pubchem_train, solub_pubchem_valid, solub_pubchem_test = get_dfs(
    df_solub_pubchem, *solub_split_idx
)
solub_klek_train, solub_klek_valid, solub_klek_test = get_dfs(
    df_solub_klek, *solub_split_idx
)
solub_smiles_train, solub_smiles_valid, solub_smiles_test = get_dfs(
    df_solub_smiles, *solub_split_idx
)

In [8]:
# Display the training split of the solubility MACCS frame (target column:
# solubility). pandas renders a truncated view of a long/wide frame.
solub_maccs_train

Unnamed: 0,solubility,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
77,4.910095,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
1212,5.408935,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
156,3.770062,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
1249,4.349692,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
990,5.602060,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1359,6.096910,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
532,4.459671,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1334,5.283997,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
502,2.959991,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [9]:
# Display the test split of the solubility MACCS frame (target column:
# solubility). pandas renders a truncated view of a long/wide frame.
solub_maccs_test

Unnamed: 0,solubility,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
617,3.560036,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
101,3.260032,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,1,0
1109,5.721246,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
1706,4.000000,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1164,3.929962,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,5.267606,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
1077,5.552842,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1318,4.179799,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
723,4.070070,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0


In [10]:
# Display the validation split of the solubility MACCS frame (target column:
# solubility). pandas renders a truncated view of a long/wide frame.
solub_maccs_valid

Unnamed: 0,solubility,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
1238,5.142668,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
720,4.140261,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
223,2.869988,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
835,4.149967,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,0
144,5.161151,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,5.050610,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1422,3.730020,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
611,4.600326,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1126,5.823909,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [11]:
# Write every solubility split to disk as CSV. Each entry pairs a split frame
# with its destination file; entries are listed (and written) in the order
# maccs -> pubchem -> klek -> smiles, each as train / valid / test.
solub_exports = [
    (solub_maccs_train, 'data/train/solubility/solub_maccs_train.csv'),
    (solub_maccs_valid, 'data/train/solubility/solub_maccs_valid.csv'),
    (solub_maccs_test, 'data/train/solubility/solub_maccs_test.csv'),
    (solub_pubchem_train, 'data/train/solubility/solub_pubchem_train.csv'),
    (solub_pubchem_valid, 'data/train/solubility/solub_pubchem_valid.csv'),
    (solub_pubchem_test, 'data/train/solubility/solub_pubchem_test.csv'),
    (solub_klek_train, 'data/train/solubility/solub_klek_train.csv'),
    (solub_klek_valid, 'data/train/solubility/solub_klek_valid.csv'),
    (solub_klek_test, 'data/train/solubility/solub_klek_test.csv'),
    (solub_smiles_train, 'data/train/solubility/solub_smiles_train.csv'),
    (solub_smiles_valid, 'data/train/solubility/solub_smiles_valid.csv'),
    (solub_smiles_test, 'data/train/solubility/solub_smiles_test.csv'),
]

for split_df, out_path in solub_exports:
    # index=False: the frame's row labels are not written to the file.
    split_df.to_csv(out_path, index=False)