In [1]:
# Google Cloud project, compute zone and GKE cluster used throughout the notebook.
project_id = "rising-sea-112358"
zone = "us-central1-b"
cluster_id = "my-cluster"

# Storage bucket, source directory and container image name handed to the
# build helper and to the GKE search further down.
bucket_name = "hpsearch"
source_dir = "source"
image_name = "the-image"

In [2]:
from cloudbuild_helper import build

# Submit a container build for `source_dir`. The response recorded below shows
# the build QUEUED, with source.zip in the `bucket_name` bucket as its source
# and gcr.io/<project_id>/<image_name> as the image it produces.
build(project_id, source_dir, bucket_name, image_name)

{u'metadata': {u'@type': u'type.googleapis.com/google.devtools.cloudbuild.v1.BuildOperationMetadata',
  u'build': {u'createTime': u'2017-09-28T21:24:12.171411710Z',
   u'id': u'989405d5-589b-40ba-ac6d-a1ca61b96dbb',
   u'images': [u'gcr.io/rising-sea-112358/the-image'],
   u'logUrl': u'https://console.cloud.google.com/gcr/builds/989405d5-589b-40ba-ac6d-a1ca61b96dbb?project=rising-sea-112358',
   u'logsBucket': u'gs://817831893817.cloudbuild-logs.googleusercontent.com',
   u'projectId': u'rising-sea-112358',
   u'source': {u'storageSource': {u'bucket': u'hpsearch',
     u'object': u'source.zip'}},
   u'sourceProvenance': {u'resolvedStorageSource': {u'bucket': u'hpsearch',
     u'generation': u'1506633851354808',
     u'object': u'source.zip'}},
   u'status': u'QUEUED',
   u'steps': [{u'args': [u'build',
      u'-t',
      u'gcr.io/rising-sea-112358/the-image',
      u'.'],
     u'name': u'gcr.io/cloud-builders/docker'}],
   u'timeout': u'600s'}},
 u'name': u'operations/build/rising-sea-

In [3]:
from gke_helper import create_cluster

# Request creation of the GKE cluster that will run the search. The call
# returns the operation resource; in the recorded run its status was still
# RUNNING when the call returned.
create_cluster(
    project_id,
    zone,
    cluster_id,
    n_nodes=4,
    machine_type='n1-standard-4'
)

{u'name': u'operation-1506634097640-21d2910e',
 u'operationType': u'CREATE_CLUSTER',
 u'selfLink': u'https://container.googleapis.com/v1/projects/817831893817/zones/us-central1-b/operations/operation-1506634097640-21d2910e',
 u'startTime': u'2017-09-28T21:28:17.640530867Z',
 u'status': u'RUNNING',
 u'targetLink': u'https://container.googleapis.com/v1/projects/817831893817/zones/us-central1-b/clusters/my-cluster',
 u'zone': u'us-central1-b'}

In [4]:
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle

# Load MNIST, using ./mnist_data as the local data directory.
mnist = fetch_mldata('MNIST original', data_home='./mnist_data')

# Only the first 60000 samples are shuffled into X / y; the samples from index
# 60000 onward are used as a held-out test set further down.
# A fixed random_state makes the shuffle -- and therefore the X_small / X_large
# subsets sliced from it -- identical on every Restart & Run All.
X, y = shuffle(mnist.data[:60000], mnist.target[:60000], random_state=0)

In [5]:
# First 100 shuffled samples: the subset used for the quick local grid search.
X_small, y_small = X[:100], y[:100]

In [6]:
# First 6000 shuffled samples: the subset used for the GKE search and for the
# default-parameter baseline fit at the end of the notebook.
X_large, y_large = X[:6000], y[:6000]

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV



In [8]:
# Base estimator for the search. The grid below includes subsample values
# under 1.0, which make each boosting stage fit on a random fraction of the
# rows, so random_state is pinned to keep the search results repeatable.
gbc = GradientBoostingClassifier(random_state=0)

# 3 * 5 * 4 * 3 = 180 hyperparameter combinations in total.
param_grid = {
    'learning_rate': [0.1, 0.5, 1.0],
    'n_estimators': [100, 50, 200, 300, 400],
    'max_depth': [2, 3, 4, 5],
    'subsample': [1.0, 0.9, 0.8]
}

# n_jobs=-1 uses every available local core; verbose=3 turns on detailed
# progress messages while fitting.
search = GridSearchCV(estimator=gbc, param_grid=param_grid, n_jobs=-1, verbose=3)

In [None]:
# Time a local run of the full grid on the 100-sample subset.
%time search.fit(X_small, y_small)

In [None]:
# TODO: experiment to find out how the running time scales with sample size

In [None]:
# Best mean cross-validated score found by the local search on X_small.
search.best_score_

In [None]:
# Hyperparameter combination that achieved the best local score.
search.best_params_

In [9]:
from gke_parallel import GKEParallel

In [10]:
# Wrap the local GridSearchCV object so that it can be fitted on the GKE
# cluster configured at the top of the notebook.
# NOTE(review): the outputs recorded further down report a task_name of
# 'my-cluster.the-image.1506634436', not the value hard-coded here -- confirm
# whether this argument should be kept.
gke_search = GKEParallel(
    search,
    project_id,
    zone,
    cluster_id,
    bucket_name,
    image_name,
    task_name='test-cluster.hpsearch.1506628901')

In [11]:
# Fetch credentials for the cluster via the helper script. The recorded output
# shows a kubeconfig entry being generated and the cluster's four nodes Ready.
! bash get_cluster_credentials.sh $cluster_id

Fetching cluster endpoint and auth data.
kubeconfig entry generated for my-cluster.
Context "gke_rising-sea-112358_us-central1-b_my-cluster" modified.
NAME                                        STATUS    AGE       VERSION
gke-my-cluster-default-pool-4b61562f-bp81   Ready     1m        v1.7.5
gke-my-cluster-default-pool-4b61562f-czqz   Ready     1m        v1.7.5
gke-my-cluster-default-pool-4b61562f-d3wm   Ready     1m        v1.7.5
gke-my-cluster-default-pool-4b61562f-khj6   Ready     1m        v1.7.5


In [12]:
# Launch the search on the 6000-sample subset. In the recorded run this
# persisted the GKEParallel instance to the GCS bucket (see the output below).
# NOTE(review): this appears to return before the jobs finish -- completion is
# polled with done() in a later cell; confirm against gke_parallel.
gke_search.fit(X_large, y_large)

Persised the GKEParallel instance: gs://hpsearch/my-cluster.the-image.1506634436/fitted_gke_search.pkl


In [13]:
# Task identifier of this search and the number of cluster nodes it reports.
gke_search.task_name, gke_search.n_nodes

('my-cluster.the-image.1506634436', 4)

In [14]:
# Number of jobs the search was split into.
len(gke_search.job_names)

15

In [None]:
#gke_search.cancel()

In [16]:
# Per-job parameter sub-grids, keyed by the job index as a string. In the
# recorded output each job fixes n_estimators and subsample to a single value
# and keeps every learning_rate / max_depth value.
gke_search.param_grids

{'0': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [100],
  'subsample': [1.0]},
 '1': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [100],
  'subsample': [0.9]},
 '10': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [300],
  'subsample': [0.9]},
 '11': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [300],
  'subsample': [0.8]},
 '12': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [400],
  'subsample': [1.0]},
 '13': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [400],
  'subsample': [0.9]},
 '14': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [400],
  'subsample': [0.8]},
 '2': {'learning_rate': [0.1, 0.5, 1.0],
  'max_depth': [2, 3, 4, 5],
  'n_estimators': [100],
  'subsample': [0.8]},
 '3': {'learning_rate': [0.1, 0.5, 1.0],
  'max_dep

In [28]:
# Overall completion flag, followed by the per-job completion flags.
gke_search.done(), gke_search.dones

(True,
 {'0': True,
  '1': True,
  '10': True,
  '11': True,
  '12': True,
  '13': True,
  '14': True,
  '2': True,
  '3': True,
  '4': True,
  '5': True,
  '6': True,
  '7': True,
  '8': True,
  '9': True})

In [29]:
# Collect the results of the jobs; `result` is indexed by job key further down
# (e.g. result['0']). In the recorded run this call also persisted the
# GKEParallel instance again.
result = gke_search.result()

Persised the GKEParallel instance: gs://hpsearch/my-cluster.the-image.1506634436/fitted_gke_search.pkl


In [30]:
# Best score and the matching hyperparameters reported by the GKE search.
gke_search.best_score_, gke_search.best_params_

(0.93149999999999999,
 {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400, 'subsample': 0.8})

In [34]:
# You can also call predict(), which delegates the call to the best_estimator_.
# The five samples used here come from the part of MNIST that was left out of
# X / y (index 60000 onward).

print(gke_search.best_estimator_.predict(mnist.data[64000:64005]))

print(mnist.target[64000:64005])

[ 3.  3.  3.  3.  3.]
[ 3.  3.  3.  3.  3.]


In [35]:
# Score the best estimator on the held-out MNIST samples (index 60000 onward),
# which were never part of X / y.
test = gke_search.best_estimator_.predict(mnist.data[60000:])
labels = mnist.target[60000:]

# Number of correct predictions. The element-wise array comparison replaces
# the xrange-based index loop, so the cell no longer builds a throwaway list
# and also runs on Python 3.
int((test == labels).sum())

In [36]:
# Number of held-out samples the best estimator classified correctly
# (vectorised comparison instead of the Python 2-only xrange index loop).
int((test == labels).sum())

9403

In [40]:
# Best score found by job '0' alone, for comparison with the overall best.
result['0'].best_score_

0.92300000000000004

In [42]:
# Baseline for comparison: one GradientBoostingClassifier with default
# hyperparameters, fitted (and timed) on the same 6000-sample subset that the
# GKE search used.
gbc_ = GradientBoostingClassifier()

%time gbc_.fit(X_large, y_large)

CPU times: user 4min 18s, sys: 1.07 s, total: 4min 19s
Wall time: 4min 20s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [43]:
# Score the default-parameter baseline on the same held-out samples.
test_ = gbc_.predict(mnist.data[60000:])
labels = mnist.target[60000:]

# Count correct predictions from test_ itself. The previous loop was bounded by
# len(test) -- the other model's predictions from an earlier cell -- so this
# cell only worked when that cell had been run first and both arrays happened
# to have the same length.
int((test_ == labels).sum())

9216