# 1. Load dataset

In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# Read the train and test splits. header=None: the first line of each CSV is
# treated as data, so columns get the integer labels 0, 1, ...
data_train, data_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('CHEMBL1862_Ki/train.csv', 'CHEMBL1862_Ki/test.csv')
)

In [3]:
# Column 0 is parsed as SMILES further down; column 1 is the regression target
# (presumably the Ki-derived activity, going by the dataset folder name -- confirm).
smi_train = data_train[0].tolist()
prop_train = data_train[1].tolist()

smi_test = data_test[0].tolist()
prop_test = data_test[1].tolist()

In [4]:
# Parse every training SMILES. A molecule and its target are appended together,
# so mols_train[i] and y_train[i] stay aligned when a row is dropped.
mols_train, y_train = [], []
for smi, prop in zip(smi_train, prop_train):
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles signals a parse failure by returning None; test for exactly
    # that instead of relying on the truthiness of the Mol object.
    if mol is not None:
        mols_train.append(mol)
        y_train.append(prop)

In [5]:
# Parse every test SMILES. A molecule and its target are appended together,
# so mols_test[i] and y_test[i] stay aligned when a row is dropped.
mols_test, y_test = [], []
for smi, prop in zip(smi_test, prop_test):
    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles signals a parse failure by returning None; test for exactly
    # that instead of relying on the truthiness of the Mol object.
    if mol is not None:
        mols_test.append(mol)
        y_test.append(prop)

## 1.5 Reduce the dataset size for faster pipeline reproduction (for playing around)

In [6]:
# Number of leading rows kept from each split for a quick end-to-end run.
# Set either value to None to keep the whole split (a [:None] slice is a full copy).
N_TRAIN_SUBSET = 30
N_TEST_SUBSET = 10

# Plain head-of-list truncation (not a random sample); molecules and targets are
# cut at the same index so they remain paired.
mols_train, y_train = mols_train[:N_TRAIN_SUBSET], y_train[:N_TRAIN_SUBSET]
mols_test, y_test = mols_test[:N_TEST_SUBSET], y_test[:N_TEST_SUBSET]

## 2. Descriptor calculation

In [7]:
from molfeat.calc import (FPCalculator,
                          RDKitDescriptors2D, 
                          Pharmacophore2D, 
                          MordredDescriptors, 
                          CATS, 
                          ScaffoldKeyCalculator)

from molfeat.trans import MoleculeTransformer

In [8]:
# Catalogue of 2D featurizers, stored as (calculator instance, label) pairs.
# NOTE(review): the CATS calculator is imported above but has no entry here --
# confirm whether that omission is intentional.

# (FPCalculator method name, label) for each fingerprint, in catalogue order
fp_methods = [
    ("atompair", "AtomPairBinary"),
    ("atompair-count", "AtomPairCount"),
    ("avalon", "AvalonBinary"),
    ("ecfp", "ECFPBinary"),
    ("ecfp-count", "ECFPCount"),
    ("erg", "ERG"),
    ("estate", "Estate"),
    ("fcfp", "FCFPBinary"),
    ("fcfp-count", "FCFPCount"),
    ("layered", "Layered"),
    ("maccs", "MACCS"),
    ("pattern", "Pattern"),
    ("rdkit", "RDKitBinary"),
    ("rdkit-count", "RDKitCount"),
    ("secfp", "SECFP"),
    ("topological", "TopologicalBinary"),
    ("topological-count", "TopologicalCount"),
]

# fingerprints
descr_2d = [(FPCalculator(method), label) for method, label in fp_methods]

# descriptor sets (tagged "long" by the original author -- presumably slow to compute)
descr_2d += [
    (RDKitDescriptors2D(replace_nan=True), "RDKitDescriptors2D"),
    (Pharmacophore2D(replace_nan=True), "Pharmacophore2D"),
    (MordredDescriptors(replace_nan=True), "MordredDescriptors"),
    (ScaffoldKeyCalculator(), "ScaffoldKey"),
]

In [9]:
# Featurizer used for the rest of the notebook: the "atompair" fingerprint,
# wrapped in a MoleculeTransformer so it can be applied to a list of molecules.
fp_method = "atompair"
descr_func = FPCalculator(fp_method)
descr_calc = MoleculeTransformer(featurizer=descr_func)

In [10]:
# Featurize both splits with the same transformer (train first, then test).
x_train, x_test = map(descr_calc.transform, (mols_train, mols_test))

# 3. Model training

In [11]:
from sklearn.preprocessing import MinMaxScaler


from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import r2_score, mean_absolute_error

In [12]:
# The scaler's min/max are learned from the training features only and then
# reused for the test features, so no test-set statistics enter the fit.
scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
# Ordinary least-squares baseline: fit on the scaled training features and
# report R^2 on the held-out test molecules.
model = LinearRegression().fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)

print(r2_score(y_true=y_test, y_pred=y_pred))

0.6988896040353259


In [14]:
# Random forest with default hyper-parameters. The forest is stochastic, so the
# seed is pinned to make the R^2 repeatable across Restart & Run All.
# NOTE: a score recorded from an earlier unseeded run will not match; re-run to refresh it.
model = RandomForestRegressor(random_state=42)
model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)

print(r2_score(y_test, y_pred))

0.44418313958614286


In [15]:
# Support-vector regressor with scikit-learn's default settings, scored by R^2
# on the held-out test molecules.
model = SVR().fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)

test_r2 = r2_score(y_test, y_pred)
print(test_r2)

0.5076568951287848
