## Libraries 📖

In [1]:
from collections import defaultdict
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import Chem
import numpy as np
import pandas as pd

## Scaffold split

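Adapted from the Chainer Chemistry scaffold splitter: https://chainer-chemistry.readthedocs.io/en/v0.4.0/_modules/chainer_chemistry/dataset/splitters/scaffold_splitter.html

Molecules are grouped by their Murcko scaffold, and each group is assigned as a whole to the validation, test, or training split.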

In [2]:
def generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold SMILES for a molecule given as SMILES.

    Args:
        smiles: SMILES string of the molecule.
        include_chirality: Forwarded to ``MurckoScaffoldSmiles`` as
            ``includeChirality``.

    Returns:
        The scaffold SMILES string, or ``''`` when ``Chem.MolFromSmiles``
        yields a falsy result (presumably an unparsable SMILES -- confirm
        against the RDKit docs).
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # Nothing to derive a scaffold from; all such inputs share the '' key.
        return ''
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)

In [3]:
# Inputs and split configuration.
df = pd.read_csv('data/smiles_processed.csv')
rng = np.random.RandomState(123)
smiles_list = list(df.smiles)
include_chirality = True
frac_valid = 0.1
frac_test = 0.2

# Group row positions by scaffold: scaffold SMILES -> list of positions in `df`.
scaffolds = defaultdict(list)
for ind, smiles in enumerate(smiles_list):
    scaffold = generate_scaffold(smiles, include_chirality)
    scaffolds[scaffold].append(ind)

# Shuffle the order of the scaffold groups by permuting integer positions.
# Passing the ragged list of groups straight to `rng.permutation` makes NumPy
# try to build an array out of lists of different lengths, which raises
# ValueError on recent NumPy versions.
scaffold_groups = list(scaffolds.values())
scaffold_sets = [scaffold_groups[i] for i in rng.permutation(len(scaffold_groups))]

# Maximum number of rows allowed in the validation and test splits.
n_total_valid = int(np.floor(frac_valid * len(df)))
n_total_test = int(np.floor(frac_test * len(df)))

train_index = []
valid_index = []
test_index = []

# Greedy assignment: each whole group goes to validation if it still fits,
# otherwise to test if it fits there, otherwise to train.
for scaffold_set in scaffold_sets:
    if len(valid_index) + len(scaffold_set) <= n_total_valid:
        valid_index.extend(scaffold_set)
    elif len(test_index) + len(scaffold_set) <= n_total_test:
        test_index.extend(scaffold_set)
    else:
        train_index.extend(scaffold_set)

In [4]:
# Turn the three position lists into NumPy arrays for positional indexing.
train_idx, valid_idx, test_idx = (
    np.array(positions) for positions in (train_index, valid_index, test_index)
)

In [5]:
# Location of the fingerprint table (pIC50 plus MACCSFP* columns, per the preview below).
data_path = "data/cardio_processed.csv"

# Show every column when a frame is rendered.
pd.set_option('display.max_columns', None)

df_fp = pd.read_csv(data_path)
df_fp.head()

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
0,6.19044,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,0
1,5.180456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2,5.79588,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
3,5.173925,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
4,5.180456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [6]:
# The split positions were computed on the SMILES table (`df`) and are applied
# positionally to the fingerprint table, so both tables must describe the same
# rows in the same order. Only the row count can be checked here; the ordering
# itself is assumed -- TODO confirm how the two CSVs were produced.
assert len(df_fp) == len(df), "fingerprint and SMILES tables must have the same number of rows"

train_df = df_fp.iloc[train_idx, :]
valid_df = df_fp.iloc[valid_idx, :]
test_df = df_fp.iloc[test_idx, :]

In [7]:
# Display the scaffold-split training fingerprints (rows keep their original positions as index).
train_df

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,MACCSFP10,MACCSFP11,MACCSFP12,MACCSFP13,MACCSFP14,MACCSFP15,MACCSFP16,MACCSFP17,MACCSFP18,MACCSFP19,MACCSFP20,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,MACCSFP25,MACCSFP26,MACCSFP27,MACCSFP28,MACCSFP29,MACCSFP30,MACCSFP31,MACCSFP32,MACCSFP33,MACCSFP34,MACCSFP35,MACCSFP36,MACCSFP37,MACCSFP38,MACCSFP39,MACCSFP40,MACCSFP41,MACCSFP42,MACCSFP43,MACCSFP44,MACCSFP45,MACCSFP46,MACCSFP47,MACCSFP48,MACCSFP49,MACCSFP50,MACCSFP51,MACCSFP52,MACCSFP53,MACCSFP54,MACCSFP55,MACCSFP56,MACCSFP57,MACCSFP58,MACCSFP59,MACCSFP60,MACCSFP61,MACCSFP62,MACCSFP63,MACCSFP64,MACCSFP65,MACCSFP66,MACCSFP67,MACCSFP68,MACCSFP69,MACCSFP70,MACCSFP71,MACCSFP72,MACCSFP73,MACCSFP74,MACCSFP75,MACCSFP76,MACCSFP77,MACCSFP78,MACCSFP79,MACCSFP80,MACCSFP81,MACCSFP82,MACCSFP83,MACCSFP84,MACCSFP85,MACCSFP86,MACCSFP87,MACCSFP88,MACCSFP89,MACCSFP90,MACCSFP91,MACCSFP92,MACCSFP93,MACCSFP94,MACCSFP95,MACCSFP96,MACCSFP97,MACCSFP98,MACCSFP99,MACCSFP100,MACCSFP101,MACCSFP102,MACCSFP103,MACCSFP104,MACCSFP105,MACCSFP106,MACCSFP107,MACCSFP108,MACCSFP109,MACCSFP110,MACCSFP111,MACCSFP112,MACCSFP113,MACCSFP114,MACCSFP115,MACCSFP116,MACCSFP117,MACCSFP118,MACCSFP119,MACCSFP120,MACCSFP121,MACCSFP122,MACCSFP123,MACCSFP124,MACCSFP125,MACCSFP126,MACCSFP127,MACCSFP128,MACCSFP129,MACCSFP130,MACCSFP131,MACCSFP132,MACCSFP133,MACCSFP134,MACCSFP135,MACCSFP136,MACCSFP137,MACCSFP138,MACCSFP139,MACCSFP140,MACCSFP141,MACCSFP142,MACCSFP143,MACCSFP144,MACCSFP145,MACCSFP146,MACCSFP147,MACCSFP148,MACCSFP149,MACCSFP150,MACCSFP151,MACCSFP152,MACCSFP153,MACCSFP154,MACCSFP155,MACCSFP156,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
8590,5.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
10156,4.522879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,0,0,1,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2499,4.886057,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,1,1,1,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0
767,5.251812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
1799,5.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8924,6.699992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,0,0,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,1,1,0,1,1,1,1,0,1,1,0,1,1,0,1,0,1,1,1,1,1,1,0
2062,3.782516,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,1,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0
7209,4.522879,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,1,0,1,1,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,1,1,0
7531,5.619789,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0


In [8]:
# Summary statistics of the pIC50 target in the training split.
train_df['pIC50'].describe()

count    7443.000000
mean        5.236329
std         0.970969
min         0.800000
25%         4.522879
50%         5.000000
75%         5.677781
max         9.585027
Name: pIC50, dtype: float64

In [9]:
# Summary statistics of the pIC50 target in the test split.
test_df['pIC50'].describe()

count    2126.000000
mean        5.218856
std         0.947098
min         1.000000
25%         4.522879
50%         5.000000
75%         5.691440
max         9.853872
Name: pIC50, dtype: float64

In [10]:
# Summary statistics of the pIC50 target in the validation split.
valid_df['pIC50'].describe()

count    1063.000000
mean        5.254179
std         0.814707
min         3.477556
25%         4.556743
50%         5.086186
75%         5.708872
max         8.769551
Name: pIC50, dtype: float64

In [11]:
# Persist the scaffold-split fingerprint tables; the index is not written.
for split_frame, out_path in (
    (train_df, 'data/train.csv'),
    (test_df, 'data/test.csv'),
    (valid_df, 'data/valid.csv'),
):
    split_frame.to_csv(out_path, index=False)

In [12]:
# Attach the pIC50 target to the SMILES table (joined on the row index).
# Any pIC50 column already present is dropped first, so re-running this cell
# does not fail on an overlapping column name.
df = df.drop(columns="pIC50", errors="ignore").join(df_fp["pIC50"])

# Apply the same scaffold-split positions to the SMILES table.
train_df_s = df.iloc[train_idx, :]
valid_df_s = df.iloc[valid_idx, :]
test_df_s = df.iloc[test_idx, :]

In [14]:
# Persist the full SMILES+target table and its scaffold splits; the index is not written.
for smiles_frame, out_path in (
    (df, 'data/smiles_processed_target.csv'),
    (train_df_s, 'data/train_smiles.csv'),
    (test_df_s, 'data/test_smiles.csv'),
    (valid_df_s, 'data/valid_smiles.csv'),
):
    smiles_frame.to_csv(out_path, index=False)

In [17]:
# Display the SMILES table with the joined pIC50 target.
df

Unnamed: 0,smiles,pIC50
0,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,6.190440
1,Cc1nc2ccccc2c(=O)n1-c1ccc(OCCCN2CCCCCC2)cc1,5.180456
2,Cc1cccc2c(=O)n(-c3ccc(OCCCN4CCCC4)cc3)c(C)nc12,5.795880
3,Cc1ccc2c(=O)n(-c3ccc(OCCCN4CCCC4)cc3)c(C)nc2c1,5.173925
4,Cc1cccc2nc(C)n(-c3ccc(OCCCN4CCCC4)cc3)c(=O)c12,5.180456
...,...,...
10627,CC1OC2(CCN(CCc3ccccc3F)CC2)CN(c2ncccc2C(F)(F)F...,5.180456
10628,Cn1nccc1C1CCCC1Oc1cc(F)c(S(=O)(=O)Nc2nccs2)cc1F,4.000000
10629,OC(c1cccnc1-c1cccc(Cl)c1)C(c1cccnc1)c1cccnc1,4.522879
10630,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,4.585027


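## Random split

Note: this section reads `data/smiles_processed_target.csv`, which is written by the scaffold-split section above. It writes its results to the same `data/train.csv`, `data/valid.csv`, `data/test.csv` and `data/*_smiles.csv` files, so running it replaces the scaffold-split outputs.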

In [8]:
def train_validate_test_split(df, train_percent=.7, validate_percent=.1, seed=None):
    """Randomly partition the rows of ``df`` into train, validation and test frames.

    Args:
        df: DataFrame to split. Any index is supported; rows are selected by
            position.
        train_percent: Fraction of rows placed in the training frame.
        validate_percent: Fraction of rows placed in the validation frame.
            The remaining rows form the test frame.
        seed: If given, passed to ``np.random.seed`` before shuffling. If
            ``None``, the current global NumPy random state is used as is, so
            a seed set by the caller beforehand is respected.

    Returns:
        A ``(train, validate, test)`` tuple of DataFrames.
    """
    # Reseed only on request: np.random.seed(None) would re-initialise the
    # global generator from fresh entropy and discard the caller's seed.
    if seed is not None:
        np.random.seed(seed)

    m = len(df.index)
    # Shuffle row positions rather than index labels, because the slices
    # below are consumed by the positional indexer .iloc.
    perm = np.random.permutation(m)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [9]:
# Rebuild one table holding the fingerprints, the target and the SMILES string.
# The join is on the row index, so both files are assumed to list the same
# molecules in the same order -- TODO confirm.
df1 = pd.read_csv("data/smiles_processed_target.csv")
df2 = pd.read_csv("data/cardio_processed.csv")
df = df2.join(df1["smiles"])

# Pass the seed to the splitter itself. Seeding the global generator here and
# then calling the splitter with its default seed=None does not give a
# reproducible split, because the splitter reseeds the generator.
train, validate, test = train_validate_test_split(df, seed=[3, 1415])

In [10]:
# Display the random-split training frame (fingerprints, target and SMILES).
train

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166,smiles
1730,4.500000,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,N#Cc1ccc(OCCN2CC3CN(CCNS(=O)(=O)Cc4ccccc4)CC(C...
4982,4.119758,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,O=C1OCCc2cc(C3CN4CCN(C(=O)Cc5ccc(-n6cnnn6)nc5)...
5818,5.339135,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CCOC(=O)N1CCC(CN2CCC(NC(=O)c3cc(Cl)c(N)cc3OC)C...
9016,4.853872,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CCN(C(=O)Cc1ccc(S(C)(=O)=O)cc1)C1CCN(CCC(c2cc(...
2690,4.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CN(C)C(=O)C(c1ccc(-c2ccc3ncnn3c2)cc1)C(N)C(=O)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9436,5.853872,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,1,0,CNc1nc(NCCN2CCNCC2)c2sc(-c3ccc(C(F)(F)F)cc3)cc2n1
2915,4.100000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,Cc1nc2ccccc2n1C1CC2CCC(C1)N2CCC1(c2ccccc2)CCN(...
10475,5.031517,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CC1CC2CSC(N)=NC2(c2nc(NC(=O)c3ccn(C(F)F)n3)cs2...
7720,5.124939,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CCOc1cc(C(c2cc3cc(Br)ccc3cc2OC)C(O)(CCN(C)C)c2...


In [11]:
# Display the random-split test frame.
test

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166,smiles
9403,5.387216,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,0,COc1ccc(C(=O)Oc2cc(O)cc3oc(-c4ccc(OC)cc4)cc(=O...
1398,4.560667,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,1,1,1,0,CCCCCCC(N1CCOCC1)C(O)(c1cccnc1)c1cccnc1
6033,4.522879,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CC(C)N1CCN(C(=O)c2ccc(NC(=O)Nc3ccc(-c4nc(C5CCO...
1290,5.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,COCCOCC#Cc1cc(-c2n[nH]c3c2Cc2ccc(Cn4cncn4)cc2-...
1826,5.275724,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,Cc1nc2ccccc2c(=O)n1-c1ccc(OCCCN2CCCC(C)C2)cc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,4.225483,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,1,0,CN(C)c1ccc(C=Cc2c(F)cccc2Cl)cn1
7414,5.090920,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CC1CCCN1CCc1ccc(-c2ccc(S(=O)(=O)NCc3ccncc3)cc2...
6644,4.730487,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,COc1cc(F)ccc1-c1cncc(CNC(=O)C2CCC2)c1
7508,5.920819,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CC(C)C1OC2(CCN(CCc3ccccc3)CC2)CN(c2ccccc2)C1=O


In [12]:
# Display the random-split validation frame.
validate

Unnamed: 0,pIC50,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166,smiles
8118,5.119186,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,0,CC1(N)CCC(Nc2c(C(N)=O)cnn3cc(-c4cccc(F)c4)cc23...
6290,5.330683,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,CC(C)Cc1cc(CNS(=O)(=O)c2cccc3cccnc23)nn1-c1ccccc1
9402,7.000000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,Cc1cc(C(N)=O)c(-c2cccc(OC(=O)NCCN3CCN(c4ccccc4...
3181,4.958607,0,0,0,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,0,Cc1nsc(NC(=O)c2cc(Oc3ccc(C(=O)N4CCC4)cc3)cc(OC...
2254,7.928118,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,COc1ccc(CC(C)(C)NCC(O)COc2cccc3[nH]c(=O)[nH]c2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6623,5.000000,0,0,0,0,0,0,0,1,0,...,1,1,0,1,1,1,1,1,0,NC1=NC2(CO1)c1cc(Br)ccc1OC1(CCCC1)C21COC1
5974,4.100000,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,COCCC(Oc1ncnc2c1cnn2-c1ncccc1Cl)C(=O)Nc1ccc(C)cn1
864,5.180456,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,0,COc1cccc(CN2CCc3cc4nc(N)sc4cc3CC2)c1
7844,5.000000,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,O=C(C1CCN(c2cccc(C(F)(F)F)n2)CC1)N1CCC(NC2CCC(...


In [13]:
# SMILES view of each split: independent copies holding only target and SMILES.
smiles_columns = ["pIC50", "smiles"]
df_smiles_tr = train.loc[:, smiles_columns].copy()
df_smiles_ts = test.loc[:, smiles_columns].copy()
df_smiles_vl = validate.loc[:, smiles_columns].copy()

# Fingerprint view of each split: every column except the SMILES string.
df_finger_tr = train.drop(columns="smiles")
df_finger_ts = test.drop(columns="smiles")
df_finger_vl = validate.drop(columns="smiles")

In [14]:
# Display the SMILES view of the random-split training set.
df_smiles_tr

Unnamed: 0,pIC50,smiles
1730,4.500000,N#Cc1ccc(OCCN2CC3CN(CCNS(=O)(=O)Cc4ccccc4)CC(C...
4982,4.119758,O=C1OCCc2cc(C3CN4CCN(C(=O)Cc5ccc(-n6cnnn6)nc5)...
5818,5.339135,CCOC(=O)N1CCC(CN2CCC(NC(=O)c3cc(Cl)c(N)cc3OC)C...
9016,4.853872,CCN(C(=O)Cc1ccc(S(C)(=O)=O)cc1)C1CCN(CCC(c2cc(...
2690,4.000000,CN(C)C(=O)C(c1ccc(-c2ccc3ncnn3c2)cc1)C(N)C(=O)...
...,...,...
9436,5.853872,CNc1nc(NCCN2CCNCC2)c2sc(-c3ccc(C(F)(F)F)cc3)cc2n1
2915,4.100000,Cc1nc2ccccc2n1C1CC2CCC(C1)N2CCC1(c2ccccc2)CCN(...
10475,5.031517,CC1CC2CSC(N)=NC2(c2nc(NC(=O)c3ccn(C(F)F)n3)cs2...
7720,5.124939,CCOc1cc(C(c2cc3cc(Br)ccc3cc2OC)C(O)(CCN(C)C)c2...


In [15]:
# Persist the random-split tables; the index is not written.
# NOTE(review): these are the same paths the scaffold-split cells write to.
for split_frame, out_path in (
    (df_smiles_tr, 'data/train_smiles.csv'),
    (df_smiles_ts, 'data/test_smiles.csv'),
    (df_smiles_vl, 'data/valid_smiles.csv'),
    (df_finger_tr, 'data/train.csv'),
    (df_finger_ts, 'data/test.csv'),
    (df_finger_vl, 'data/valid.csv'),
):
    split_frame.to_csv(out_path, index=False)