# **ООП**

Курс по ООП на YouTube с написанием класса и его использованием.

In [31]:
class Employee:
    """A minimal employee record with an e-mail address derived from the name."""

    def __init__(self, first, last, pay):
        """Store the name parts and pay; build the e-mail as first.last@email.com."""
        self.first, self.last = first, last
        # join() keeps the original requirement that both name parts are strings.
        self.email = ''.join((first, '.', last, '@email.com'))
        self.pay = pay

    def fullname(self):
        """Return the first and last name separated by one space."""
        return f'{self.first} {self.last}'

# Two sample employees used to exercise the class defined above.
emp_1 = Employee('Corey', 'Schafer', 50000)
emp_2 = Employee('Test', 'Employee', 60000)

In [32]:
class Employee:
    """Employee record demonstrating class variables, class methods and static methods.

    Class attributes:
        num_of_emps: running count of instances created through __init__.
        raise_amt: multiplier applied to ``pay`` by ``apply_raise()``.
    """

    num_of_emps = 0
    raise_amt = 1.04

    def __init__(self, first, last, pay):
        """Store the name and pay, derive the e-mail, and bump the instance count."""
        self.first = first
        self.last = last
        self.email = first + '.' + last + '@email.com'
        self.pay = pay

        # Counted on Employee itself so the total is shared by every instance.
        Employee.num_of_emps += 1

    def fullname(self):
        """Return the first and last name separated by one space."""
        return '{} {}'.format(self.first, self.last)

    def apply_raise(self):
        """Multiply ``pay`` by ``raise_amt`` and truncate the result to an int."""
        self.pay = int(self.pay * self.raise_amt)

    @classmethod
    def set_raise_amt(cls, amount):
        """Set the raise multiplier on the class this method is called on."""
        cls.raise_amt = amount

    @classmethod
    def from_string(cls, emp_str):
        """Alternative constructor for a ``'First-Last-Pay'`` string.

        The pay field is converted to a number (int when possible, otherwise
        float) so that ``apply_raise()`` works on the resulting instance;
        passing the raw string through made ``pay * raise_amt`` fail.

        Raises:
            ValueError: if ``emp_str`` does not have exactly three
                '-'-separated fields or the pay field is not numeric.
        """
        first, last, pay = emp_str.split('-')
        try:
            pay_value = int(pay)
        except ValueError:
            # Allow fractional pay such as '70000.50'; non-numeric text
            # still raises ValueError from float().
            pay_value = float(pay)
        return cls(first, last, pay_value)

    @staticmethod
    def is_workday(day):
        """Return True when ``day`` is Monday-Friday.

        ``day.weekday()`` is 0 for Monday through 6 for Sunday, so 5 and 6
        are the weekend.
        """
        return day.weekday() < 5


# Demo of the class-level features of Employee defined in this cell.
emp_1 = Employee('Corey', 'Schafer', 50000)
emp_2 = Employee('Test', 'Employee', 60000)

# The classmethod assigns raise_amt on the class, so the class and both
# instances (which have no raise_amt of their own) report the same value.
Employee.set_raise_amt(1.05)

print(Employee.raise_amt)
print(emp_1.raise_amt)
print(emp_2.raise_amt)

# Employees encoded as 'First-Last-Pay' strings.
emp_str_1 = 'John-Doe-70000'
emp_str_2 = 'Steve-Smith-30000'
emp_str_3 = 'Jane-Doe-90000'

# Manual parsing, shown for comparison; these three names are not used below.
first, last, pay = emp_str_1.split('-')

# Manual construction would be: new_emp_1 = Employee(first, last, pay)
# The alternative constructor does the split and the construction in one call.
new_emp_1 = Employee.from_string(emp_str_1)

print(new_emp_1.email)
print(new_emp_1.pay)

import datetime
my_date = datetime.date(2016, 7, 11)

# The static method needs neither an instance nor the class state.
print(Employee.is_workday(my_date))

1.05
1.05
1.05
John.Doe@email.com
70000
True


In [36]:
class Employee:
    """Base employee record used by the inheritance examples in this cell."""

    # Pay multiplier used by apply_raise(); a subclass may override it.
    raise_amt = 1.04

    def __init__(self, first, last, pay):
        """Store the name parts and pay; build the e-mail as first.last@email.com."""
        self.first, self.last = first, last
        # join() keeps the original requirement that both name parts are strings.
        self.email = ''.join((first, '.', last, '@email.com'))
        self.pay = pay

    def fullname(self):
        """Return the first and last name separated by one space."""
        return f'{self.first} {self.last}'

    def apply_raise(self):
        """Scale pay by raise_amt (looked up through the instance) and truncate to int."""
        raised = self.pay * self.raise_amt
        self.pay = int(raised)


class Developer(Employee):
    """Employee with a larger raise multiplier and a main programming language."""

    # Overrides the raise multiplier inherited from Employee.
    raise_amt = 1.10

    def __init__(self, first, last, pay, prog_lang):
        """Initialise the Employee part, then record the programming language."""
        super().__init__(first=first, last=last, pay=pay)
        self.prog_lang = prog_lang


class Manager(Employee):
    """Employee who keeps a list of the employees they supervise."""

    def __init__(self, first, last, pay, employees=None):
        """Initialise the Employee part and the list of supervised employees.

        ``employees`` defaults to None rather than a list literal so that
        managers created without the argument each get their own new list.
        A list that is passed in is stored as-is, not copied.
        """
        super().__init__(first, last, pay)
        self.employees = [] if employees is None else employees

    def add_emp(self, emp):
        """Append ``emp`` unless it is already in the list."""
        if emp in self.employees:
            return
        self.employees.append(emp)

    def remove_emp(self, emp):
        """Remove ``emp`` from the list; do nothing when it is not there."""
        if emp not in self.employees:
            return
        self.employees.remove(emp)

    def print_emps(self):
        """Print one '--> full name' line per supervised employee."""
        for member in self.employees:
            print('-->', member.fullname())


# Demo of the Developer and Manager subclasses defined in this cell.
dev_1 = Developer('Corey', 'Schafer', 50000, 'Python')
dev_2 = Developer('Test', 'Employee', 60000, 'Java')

# The manager starts out supervising dev_1 only.
mgr_1 = Manager('Sue', 'Smith', 90000, [dev_1])

# email is set by Employee.__init__, which Manager calls through super().
print(mgr_1.email)

# Add and immediately remove dev_2, so only dev_1 is left to print.
mgr_1.add_emp(dev_2)
mgr_1.remove_emp(dev_2)

mgr_1.print_emps()


Sue.Smith@email.com
--> Corey Schafer


In [37]:
class Employee:
    """Employee record demonstrating special ("dunder") methods."""

    # Pay multiplier used by apply_raise().
    raise_amt = 1.04

    def __init__(self, first, last, pay):
        """Store the name parts and pay; build the e-mail as first.last@email.com."""
        self.first, self.last = first, last
        # join() keeps the original requirement that both name parts are strings.
        self.email = ''.join((first, '.', last, '@email.com'))
        self.pay = pay

    def fullname(self):
        """Return the first and last name separated by one space."""
        return f'{self.first} {self.last}'

    def apply_raise(self):
        """Scale pay by raise_amt and truncate the result to an int."""
        raised = self.pay * self.raise_amt
        self.pay = int(raised)

    def __repr__(self):
        """Return a constructor-style representation of the employee."""
        return f"Employee('{self.first}', '{self.last}', {self.pay})"

    def __str__(self):
        """Return 'full name - email' for display."""
        return f'{self.fullname()} - {self.email}'

    def __add__(self, other):
        """Return the combined pay of this employee and ``other``."""
        combined_pay = self.pay + other.pay
        return combined_pay

    def __len__(self):
        """Return the number of characters in the full name."""
        name = self.fullname()
        return len(name)


# Demo of the special methods defined on Employee in this cell.
emp_1 = Employee('Corey', 'Schafer', 50000)
emp_2 = Employee('Test', 'Employee', 60000)

# Disabled example: emp_1 + emp_2 goes through __add__ and yields the combined pay.
# print(emp_1 + emp_2)

# len() goes through __len__, i.e. the length of the full name.
print(len(emp_1))

13


In [38]:
class Employee:
    """Employee whose e-mail and full name are properties computed from the name parts."""

    def __init__(self, first, last):
        """Store the first and last name."""
        self.first, self.last = first, last

    @property
    def email(self):
        """E-mail address built from the current first and last name."""
        return f'{self.first}.{self.last}@email.com'

    @property
    def fullname(self):
        """First and last name separated by one space."""
        return f'{self.first} {self.last}'

    @fullname.setter
    def fullname(self, name):
        # The name must split into exactly two parts on a single space;
        # otherwise the unpacking raises ValueError and nothing is changed.
        self.first, self.last = name.split(' ')

    @fullname.deleter
    def fullname(self):
        # Deleting the full name announces it and clears both name parts.
        print('Delete Name!')
        self.first = self.last = None


# Demo of the fullname property, its setter and its deleter.
emp_1 = Employee('John', 'Smith')
# Assigning to fullname runs the setter, which replaces first and last.
emp_1.fullname = "Corey Schafer"

# email and fullname are recomputed from first/last on every access.
print(emp_1.first)
print(emp_1.email)
print(emp_1.fullname)

# Runs the deleter: prints a message and sets first and last to None.
del emp_1.fullname

Corey
Corey.Schafer@email.com
Corey Schafer
Delete Name!


#  Working With Datasets

Data is central to machine learning.  This tutorial introduces the `Dataset` class that DeepChem uses to store and manage data.  It provides simple but powerful tools for efficiently working with large amounts of data.  It is also designed to interact easily with other popular Python frameworks such as NumPy, Pandas, TensorFlow, and PyTorch.

In [1]:
!pip install --pre deepchem

Collecting deepchem
  Downloading deepchem-2.6.0.dev20211026183818-py3-none-any.whl (610 kB)
[K     |████████████████████████████████| 610 kB 5.0 MB/s 
Installing collected packages: deepchem
Successfully installed deepchem-2.6.0.dev20211026183818


In [2]:
pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.4 MB)
[K     |████████████████████████████████| 20.4 MB 36.8 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.9.2


We can now import the `deepchem` package to play with.

In [3]:
import deepchem as dc
dc.__version__

'2.6.0.dev'

# Anatomy of a Dataset

The Delaney dataset of molecular solubilities: https://github.com/deepchem/deepchem/blob/master/datasets/delaney-processed.csv

In [4]:
# Load the Delaney solubility data featurized for graph convolutions.
# The call returns the task names, the dataset splits and the transformers.
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
# `datasets` unpacks into the training, validation and test splits.
train_dataset, valid_dataset, test_dataset = datasets

We now have three Dataset objects: the training, validation, and test sets.  What information does each of them contain?  We can start to get an idea by printing out the string representation of one of them.

In [5]:
train_dataset

<DiskDataset X.shape: (902,), y.shape: (902, 1), w.shape: (902, 1), ids: ['CC(C)=CCCC(C)=CC(=O)' 'CCCC=C' 'CCCCCCCCCCCCCC' ...
 'Nc2cccc3nc1ccccc1cc23 ' 'C1CCCCCC1' 'OC1CCCCCC1'], task_names: ['measured log solubility in mols per litre']>

In [6]:
print(test_dataset)

<DiskDataset X.shape: (113,), y.shape: (113, 1), w.shape: (113, 1), ids: ['c1cc2ccc3cccc4ccc(c1)c2c34' 'Cc1cc(=O)[nH]c(=S)[nH]1'
 'Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4 ' ...
 'c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43' 'Cc1occc1C(=O)Nc2ccccc2'
 'OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O '], task_names: ['measured log solubility in mols per litre']>


There's a lot of information there, so let's start at the beginning.  It begins with the label "DiskDataset".  Dataset is an abstract class.  It has a few subclasses that correspond to different ways of storing data.

- `DiskDataset` is a dataset that has been saved to disk.  The data is stored in a way that can be efficiently accessed, even if the total amount of data is far larger than your computer's memory.
- `NumpyDataset` is an in-memory dataset that holds all the data in NumPy arrays.  It is a useful tool when manipulating small to medium sized datasets that can fit entirely in memory.
- `ImageDataset` is a more specialized class that stores some or all of the data in image files on disk.  It is useful when working with models that have images as their inputs or outputs.

Now let's consider the contents of the Dataset.  Every Dataset stores a list of *samples*.  Very roughly speaking, a sample is a single data point.  In this case, each sample is a molecule.  In other datasets a sample might correspond to an experimental assay, a cell line, an image, or many other things.  For every sample the dataset stores the following information.

- The *features*, referred to as `X`.  This is the input that should be fed into a model to represent the sample.
- The *labels*, referred to as `y`.  This is the desired output from the model.  During training, it tries to make the model's output for each sample as close as possible to `y`.
- The *weights*, referred to as `w`.  This can be used to indicate that some data values are more important than others.  In later tutorials we will see examples of how this is useful.
- An *ID*, which is a unique identifier for the sample.  This can be anything as long as it is unique.  Sometimes it is just an integer index, but in this dataset the ID is a SMILES string describing the molecule.

Notice that `X`, `y`, and `w` all have 113 as the size of their first dimension.  That means this dataset contains 113 samples.

The final piece of information listed in the output is `task_names`.  Some datasets contain multiple pieces of information for each sample.  For example, if a sample represents a molecule, the dataset might record the results of several different experiments on that molecule.  This dataset has only a single task: "measured log solubility in mols per litre".  Also notice that `y` and `w` each have shape (113, 1).  The second dimension of these arrays usually matches the number of tasks.

# Accessing Data from a Dataset

There are many ways to access the data contained in a dataset.  The simplest is just to directly access the `X`, `y`, `w`, and `ids` properties.  Each of these returns the corresponding information as a NumPy array.

In [7]:
test_dataset.ids

array(['c1cc2ccc3cccc4ccc(c1)c2c34', 'Cc1cc(=O)[nH]c(=S)[nH]1',
       'Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4 ',
       'c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45', 'C1=Cc2cccc3cccc1c23',
       'CC1CO1', 'CCN2c1ccccc1N(C)C(=S)c3cccnc23 ',
       'CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O',
       'Cn2cc(c1ccccc1)c(=O)c(c2)c3cccc(c3)C(F)(F)F',
       'ClC(Cl)(Cl)C(NC=O)N1C=CN(C=C1)C(NC=O)C(Cl)(Cl)Cl ',
       'COc2c1occc1cc3ccc(=O)oc23 ',
       'CN2C(=C(O)c1ccccc1S2(=O)=O)C(=O)Nc3ccccn3 ',
       'Cc3cc2nc1c(=O)[nH]c(=O)nc1n(CC(O)C(O)C(O)CO)c2cc3C',
       'c1ccc(cc1)c2ccc(cc2)c3ccccc3',
       'CC34CC(=O)C1C(CCC2=CC(=O)CCC12C)C3CCC4(=O) ',
       'c1ccc2c(c1)sc3ccccc23',
       'CC23Cc1cnoc1C=C2CCC4C3CCC5(C)C4CCC5(O)C#C',
       'OC(C(=O)c1ccccc1)c2ccccc2', 'OCC2OC(Oc1ccccc1CO)C(O)C(O)C2O',
       'CC3C2CCC1(C)C=CC(=O)C(=C1C2OC3=O)C', 'O=Cc2ccc1OCOc1c2 ',
       'CC1CCCCC1NC(=O)Nc2ccccc2',
       'CC(=O)N(S(=O)c1ccc(N)cc1)c2onc(C)c2C ',
       'C1N(C(=O)NCC(C)C)C(=O)NC1', 'CNC(=O)Oc1ccccc1C2OCCO2

This is a very easy way to access data, but you should be very careful about using it.  This requires the data for all samples to be loaded into memory at once.  That's fine for small datasets like this one, but for large datasets it could easily take more memory than you have.

A better approach is to iterate over the dataset.  That lets it load just a little data at a time, process it, then free the memory before loading the next bit.  You can use the `itersamples()` method to iterate over samples one at a time.

In [8]:
# Walk the test set one sample at a time and print each label with its ID.
# The ID variable is called `sample_id` so the loop does not rebind (and
# thereby shadow) the builtin `id()` for the rest of the session.
for X, y, w, sample_id in test_dataset.itersamples():
    print(y, sample_id)

[-1.60114461] c1cc2ccc3cccc4ccc(c1)c2c34
[0.20848251] Cc1cc(=O)[nH]c(=S)[nH]1
[-0.01602738] Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4 
[-2.82191713] c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45
[-0.52891635] C1=Cc2cccc3cccc1c23
[1.10168349] CC1CO1
[-0.88987406] CCN2c1ccccc1N(C)C(=S)c3cccnc23 
[-0.52649706] CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O
[-0.76358725] Cn2cc(c1ccccc1)c(=O)c(c2)c3cccc(c3)C(F)(F)F
[-0.64020358] ClC(Cl)(Cl)C(NC=O)N1C=CN(C=C1)C(NC=O)C(Cl)(Cl)Cl 
[-0.38569452] COc2c1occc1cc3ccc(=O)oc23 
[-0.62568785] CN2C(=C(O)c1ccccc1S2(=O)=O)C(=O)Nc3ccccn3 
[-0.39585553] Cc3cc2nc1c(=O)[nH]c(=O)nc1n(CC(O)C(O)C(O)CO)c2cc3C
[-2.05306753] c1ccc(cc1)c2ccc(cc2)c3ccccc3
[-0.29666474] CC34CC(=O)C1C(CCC2=CC(=O)CCC12C)C3CCC4(=O) 
[-0.73213651] c1ccc2c(c1)sc3ccccc23
[-1.27744393] CC23Cc1cnoc1C=C2CCC4C3CCC5(C)C4CCC5(O)C#C
[0.0081655] OC(C(=O)c1ccccc1)c2ccccc2
[0.97588054] OCC2OC(Oc1ccccc1CO)C(O)C(O)C2O
[-0.10796031] CC3C2CCC1(C)C=CC(=O)C(=C1C2OC3=O)C
[0.59847167] O=Cc2ccc1OCOc1c2 
[-0.60149498] CC1CCCCC1NC(=O)N

Most deep learning models can process a batch of multiple samples all at once.  You can use `iterbatches()` to iterate over batches of samples.

In [9]:
# Iterate in batches of up to 50 samples and show the label array shape of
# each batch; the final batch holds whatever samples remain.
for X, y, w, ids in test_dataset.iterbatches(batch_size=50):
    print(y.shape)

(50, 1)
(50, 1)
(13, 1)


`iterbatches()` has other features that are useful when training models.  For example, `iterbatches(batch_size=100, epochs=10, deterministic=False)` will iterate over the complete dataset ten times, each time with the samples in a different random order.

Datasets can also expose data using the standard interfaces for TensorFlow and PyTorch.  To get a `tensorflow.data.Dataset`, call `make_tf_dataset()`.  To get a `torch.utils.data.IterableDataset`, call `make_pytorch_dataset()`.  See the API documentation for more details.

The final way of accessing data is `to_dataframe()`.  This copies the data into a Pandas `DataFrame`.  This requires storing all the data in memory at once, so you should only use it with small datasets.

In [10]:
test_dataset.to_dataframe()

Unnamed: 0,X,y,w,ids
0,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-1.601145,1.0,c1cc2ccc3cccc4ccc(c1)c2c34
1,<deepchem.feat.mol_graphs.ConvMol object at 0x...,0.208483,1.0,Cc1cc(=O)[nH]c(=S)[nH]1
2,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-0.016027,1.0,Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4
3,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-2.821917,1.0,c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45
4,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-0.528916,1.0,C1=Cc2cccc3cccc1c23
...,...,...,...,...
108,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-1.656304,1.0,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl
109,<deepchem.feat.mol_graphs.ConvMol object at 0x...,0.743629,1.0,c1ccsc1
110,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-2.420799,1.0,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
111,<deepchem.feat.mol_graphs.ConvMol object at 0x...,-0.209570,1.0,Cc1occc1C(=O)Nc2ccccc2


# Creating Datasets

Now let's talk about how you can create your own datasets.  Creating a `NumpyDataset` is very simple: just pass the arrays containing the data to the constructor.  Let's create some random arrays, then wrap them in a NumpyDataset.

In [11]:
import numpy as np

# Random data: 10 samples with 5 features each and 2 label columns.
X = np.random.random((10, 5))
y = np.random.random((10, 2))
# Only X and y are supplied; no weights or IDs are passed.
dataset = dc.data.NumpyDataset(X=X, y=y)
print(dataset)

<NumpyDataset X.shape: (10, 5), y.shape: (10, 2), w.shape: (10, 1), ids: [0 1 2 3 4 5 6 7 8 9], task_names: [0 1]>


In [12]:
dataset

<NumpyDataset X.shape: (10, 5), y.shape: (10, 2), w.shape: (10, 1), ids: [0 1 2 3 4 5 6 7 8 9], task_names: [0 1]>

Notice that we did not specify weights or IDs.  These are optional, as is `y` for that matter.  Only `X` is required.  Since we left them out, it automatically built `w` and `ids` arrays for us, setting all weights to 1 and setting the IDs to integer indices.

In [13]:
dataset.to_dataframe()

Unnamed: 0,X1,X2,X3,X4,X5,y1,y2,w,ids
0,0.807527,0.016863,0.541549,0.675612,0.246332,0.593415,0.105518,1.0,0
1,0.420006,0.452954,0.86575,0.24101,0.515808,0.263764,0.436931,1.0,1
2,0.41541,0.88573,0.38779,0.95561,0.055231,0.8124,0.288368,1.0,2
3,0.167741,0.12021,0.041519,0.284706,0.962192,0.08148,0.295286,1.0,3
4,0.602042,0.385652,0.503271,0.013634,0.47031,0.324513,0.502949,1.0,4
5,0.902019,0.342711,0.461573,0.955236,0.458136,0.768045,0.919536,1.0,5
6,0.049351,0.073925,0.92466,0.757646,0.421659,0.514853,0.25207,1.0,6
7,0.417806,0.916411,0.875386,0.190135,0.170035,0.543366,0.545429,1.0,7
8,0.1282,0.068048,0.464277,0.721848,0.861078,0.294488,0.774635,1.0,8
9,0.602816,0.750494,0.996557,0.524671,0.474758,0.210928,0.193611,1.0,9


What about creating a DiskDataset?  If you have the data in NumPy arrays, you can call `DiskDataset.from_numpy()` to save it to disk.  Since this is just a tutorial, we will save it to a temporary directory.

In [14]:
import tempfile

# Create a scratch directory that outlives this cell. A
# `with tempfile.TemporaryDirectory()` block deletes the directory as soon as
# the block exits, which would leave `disk_dataset` pointing at files that no
# longer exist when later cells use it.
# mkdtemp() never cleans up after itself: remove the directory yourself
# (e.g. shutil.rmtree(data_dir)) once the dataset is no longer needed.
data_dir = tempfile.mkdtemp()
disk_dataset = dc.data.DiskDataset.from_numpy(X=X, y=y, data_dir=data_dir)

In [15]:
 # Build the DiskDataset again at the data_dir path bound in the previous cell.
 disk_dataset = dc.data.DiskDataset.from_numpy(X=X, y=y, data_dir=data_dir)

In [16]:
disk_dataset

<DiskDataset X.shape: (10, 5), y.shape: (10, 2), w.shape: (10, 1), ids: [0 1 2 3 4 5 6 7 8 9], task_names: [0 1]>

What about larger datasets that can't fit in memory?  What if you have some huge files on disk containing data on hundreds of millions of molecules?  The process for creating a DiskDataset from them is slightly more involved.  Fortunately, DeepChem's `DataLoader` framework can automate most of the work for you.  That is a larger subject, so we will return to it in a later tutorial.

# Model Training

In [17]:
# Reload the Delaney data; this rebinds the dataset names that the
# HIV-free part of the tutorial below trains and evaluates on.
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
# `datasets` unpacks into the training, validation and test splits.
train_dataset, valid_dataset, test_dataset = datasets

In [18]:
valid_dataset

<DiskDataset X.shape: (113,), y.shape: (113, 1), w.shape: (113, 1), ids: ['Nc1ncnc2nc[nH]c12 ' 'Nc1nc(O)nc2nc[nH]c12 '
 'Fc1cccc(F)c1C(=O)NC(=O)Nc2cc(Cl)c(F)c(Cl)c2F ' ...
 'OC(Cn1cncn1)(Cn2cncn2)c3ccc(F)cc3F '
 'FC(F)(F)c1cccc(c1)N2CC(CCl)C(Cl)C2=O' 'CC1(C)CON(Cc2ccccc2Cl)C1=O'], task_names: ['measured log solubility in mols per litre']>

In [19]:
# Graph-convolution regression model for a single task, with dropout of 0.2.
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)

In [20]:
# Train for 50 epochs on the training split. The number shown under the cell
# is fit()'s return value -- presumably a training loss; confirm in the API docs.
model.fit(train_dataset, nb_epoch=50)

  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." %

0.16760950088500975

In [21]:
# Score the trained model with the Pearson R^2 metric on the training and
# test splits, passing the transformers returned by the loader.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric], transformers))
print("Test set score:", model.evaluate(test_dataset, [metric], transformers))

Training set score: {'pearson_r2_score': 0.8654287834745752}
Test set score: {'pearson_r2_score': 0.6136101898455377}


In [22]:
# Predict for the first 10 test molecules only.
solubilities = model.predict_on_batch(test_dataset.X[:10])
# ids and y are not sliced here; zip() stops at the shortest input, so only
# the rows that have a prediction are printed (prediction, label, SMILES ID).
for molecule, solubility, test_solubility in zip(test_dataset.ids, solubilities, test_dataset.y):
    print(solubility, test_solubility, molecule)

[-1.0156411] [-1.60114461] c1cc2ccc3cccc4ccc(c1)c2c34
[1.352353] [0.20848251] Cc1cc(=O)[nH]c(=S)[nH]1
[-0.3577692] [-0.01602738] Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4 
[-1.3251768] [-2.82191713] c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45
[-0.6120652] [-0.52891635] C1=Cc2cccc3cccc1c23
[1.3723112] [1.10168349] CC1CO1
[-0.12451547] [-0.88987406] CCN2c1ccccc1N(C)C(=S)c3cccnc23 
[-0.9751379] [-0.52649706] CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O
[-1.271826] [-0.76358725] Cn2cc(c1ccccc1)c(=O)c(c2)c3cccc(c3)C(F)(F)F
[-0.33278984] [-0.64020358] ClC(Cl)(Cl)C(NC=O)N1C=CN(C=C1)C(NC=O)C(Cl)(Cl)Cl 


In [23]:
solubilities

array([[-1.0156411 ],
       [ 1.352353  ],
       [-0.3577692 ],
       [-1.3251768 ],
       [-0.6120652 ],
       [ 1.3723112 ],
       [-0.12451547],
       [-0.9751379 ],
       [-1.271826  ],
       [-0.33278984]], dtype=float32)

# Advanced model training

### Hyperparameter Optimization


Let's start by loading the HIV dataset. It classifies over 40,000 molecules based on whether they inhibit HIV replication.



In [24]:

# Load the HIV dataset with ECFP features and a scaffold split. This rebinds
# tasks, datasets, transformers and the three split names used by later cells.
tasks, datasets, transformers = dc.molnet.load_hiv(featurizer='ECFP', splitter='scaffold')
train_dataset, valid_dataset, test_dataset = datasets

In [25]:
train_dataset.y[:10]

array([[0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

Now let's train a model on it. We will use a MultitaskClassifier, which is just a stack of dense layers. But that still leaves a lot of options. How many layers should there be, and how wide should each one be? What dropout rate should we use? What learning rate?

These are called hyperparameters. The standard way to select them is to try lots of values, train each model on the training set, and evaluate it on the validation set. This lets us see which ones work best.

You could do that by hand, but usually it's easier to let the computer do it for you. DeepChem provides a selection of hyperparameter optimization algorithms, which are found in the dc.hyper package. For this example we'll use GridHyperparamOpt, which is the most basic method. We just give it a list of options for each hyperparameter and it exhaustively tries all combinations of them.

The lists of options are defined by a dict that we provide. For each of the model's arguments, we provide a list of values to try. In this example we consider three possible sets of hidden layers: a single layer of width 500, a single layer of width 1000, or two layers each of width 1000. We also consider two dropout rates (20% and 50%) and two learning rates (0.001 and 0.0001).

In [26]:
# Search grid: every key is a model constructor argument and every value is
# the list of candidates to try. Single-element lists hold an argument fixed.
# 3 layer layouts x 2 dropout rates x 2 learning rates = 12 combinations.
params_dict = {
    'n_tasks': [len(tasks)],
    # presumably the length of the ECFP feature vector -- confirm
    'n_features': [1024],
    'layer_sizes': [[500], [1000], [1000, 1000]],
    'dropouts': [0.2, 0.5],
    'learning_rate': [0.001, 0.0001]
}
optimizer = dc.hyper.GridHyperparamOpt(dc.models.MultitaskClassifier)
# Candidates are compared by ROC AUC on the validation split.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, metric, transformers)

`hyperparam_search()` returns three values: the best model it found, the hyperparameters for that model, and a full listing of the validation score for every model. Let's take a look at the last two.



In [27]:
best_hyperparams

{'dropouts': 0.5,
 'layer_sizes': [1000],
 'learning_rate': 0.001,
 'n_features': 1024,
 'n_tasks': 1}

In [28]:
all_results


{'_dropouts_0.200000_layer_sizes[1000, 1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.772286216441309,
 '_dropouts_0.200000_layer_sizes[1000, 1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7313130756417794,
 '_dropouts_0.200000_layer_sizes[1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7672707843425435,
 '_dropouts_0.200000_layer_sizes[1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7668819199490495,
 '_dropouts_0.200000_layer_sizes[500]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7746040931804821,
 '_dropouts_0.200000_layer_sizes[500]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7725372942386832,
 '_dropouts_0.500000_layer_sizes[1000, 1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7609050436018028,
 '_dropouts_0.500000_layer_sizes[1000, 1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7583705357142858,
 '_dropouts_0.500000_layer_sizes[1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7517407

We can see a few general patterns. Using two layers with the larger learning rate doesn't work very well. It seems the deeper model requires a smaller learning rate. We also see that 20% dropout usually works better than 50%. Once we narrow down the list of models based on these observations, all the validation scores are very close to each other, probably close enough that the remaining variation is mainly noise. It doesn't seem to make much difference which of the remaining hyperparameter sets we use, so let's arbitrarily pick a single layer of width 1000 and learning rate of 0.0001.


In [29]:
best_model



MultitaskClassifier(activation_fns=None, bias_init_consts=None, dropouts=None,
                    layer_sizes=None, n_classes=2, n_features=1024, n_tasks=1,
                    residual=None, weight_decay_penalty=None,
                    weight_decay_penalty_type=None, weight_init_stddevs=None)

## Early Stopping


There is one other important hyperparameter we haven't considered yet: how long we train the model for. GridHyperparamOpt trains each model for a fixed, fairly small number of epochs. That isn't necessarily the best number.

You might expect that the longer you train, the better your model will get, but that isn't usually true. If you train too long, the model will usually start overfitting to irrelevant details of the training set. You can tell when this happens because the validation set score stops increasing and may even decrease, while the score on the training set continues to improve.

Fortunately, we don't need to train lots of different models for different numbers of steps to identify the optimal number. We just train it once, monitor the validation score, and keep whichever parameters maximize it. This is called "early stopping". DeepChem's ValidationCallback class can do this for us automatically. In the example below, we have it compute the validation set's ROC AUC every 1000 training steps. If you add the save_dir argument, it will also save a copy of the best model parameters to disk.

In [30]:
# Single hidden layer of width 1000 with 20% dropout and a 1e-4 learning rate.
model = dc.models.MultitaskClassifier(n_tasks=len(tasks),
                                      n_features=1024,
                                      layer_sizes=[1000],
                                      dropouts=0.2,
                                      learning_rate=0.0001)
# Evaluate `metric` on the validation split every 1000 training steps.
# No save_dir is passed, so nothing is written to disk by the callback.
callback = dc.models.ValidationCallback(valid_dataset, 1000, metric)
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

Step 1000 validation: roc_auc_score=0.741894
Step 2000 validation: roc_auc_score=0.773571
Step 3000 validation: roc_auc_score=0.77484
Step 4000 validation: roc_auc_score=0.777733
Step 5000 validation: roc_auc_score=0.761533
Step 6000 validation: roc_auc_score=0.769077
Step 7000 validation: roc_auc_score=0.776145
Step 8000 validation: roc_auc_score=0.758731
Step 9000 validation: roc_auc_score=0.760951
Step 10000 validation: roc_auc_score=0.762702
Step 11000 validation: roc_auc_score=0.767951
Step 12000 validation: roc_auc_score=0.760645
Step 13000 validation: roc_auc_score=0.773707
Step 14000 validation: roc_auc_score=0.760608
Step 15000 validation: roc_auc_score=0.761227
Step 16000 validation: roc_auc_score=0.765002


0.6852291107177735