In [1]:
### Set `colab` to True to run in Google Colaboratory.
colab = True

if colab:
  import os
  from google.colab import drive

  ## Mount Google Drive into the Colab VM.
  drive.mount('/content/drive')

  ## Working directory in Google Drive. It is named `drive_dir` rather than
  ## `dir` so the built-in dir() is not shadowed for the rest of the notebook.
  ## After the chdir, relative paths resolve inside this Drive folder.
  drive_dir = '/content/drive/My Drive/Colab Notebooks/Save_Restore_Model'
  os.chdir(drive_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
if colab:
  ## Print the VM uptime in hours and in seconds: awk takes the first field of
  ## /proc/uptime and divides it by 60 twice. (Original note: Google Colab
  ## reboots every 12 hours.)
  !cat /proc/uptime | awk '{print "Uptime is " $1 /60 /60 " hours (" $1 " sec)"}'
  ## Show the GPU model, driver version and memory usage reported by nvidia-smi.
  !nvidia-smi
  ## Show the version of the `python` executable found on the shell PATH.
  !python --version

Uptime is 6.64461 hours (23920.61 sec)
Fri Jan 25 05:25:19 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name           

Reference:

https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/keras/save_and_restore_models.ipynb

In [3]:
from __future__ import absolute_import, division, print_function

import os

import tensorflow as tf
from tensorflow import keras

# Display the installed TensorFlow version as the cell output.
tf.__version__

'1.12.0'

Set up data

In [0]:
# Number of examples kept from each split. A single constant drives all four
# slices below, so images and labels cannot drift out of sync when the subset
# size is changed.
n_samples = 1000

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

train_labels = train_labels[:n_samples]
test_labels = test_labels[:n_samples]

# Flatten every image to a 28 * 28 = 784 element vector and divide by 255.0
# (presumably mapping 8-bit pixel values into [0, 1] -- confirm the dtype).
train_images = train_images[:n_samples].reshape(-1, 28 * 28) / 255.0
test_images = test_images[:n_samples].reshape(-1, 28 * 28) / 255.0

In [5]:
# Output artifacts, written relative to the current working directory.
log_file_path = 'training_log.csv'
model_file_path = 'trained_model.h5'

# Upper bound on the number of training epochs.
epochs = 100



def create_model():
  """Build and compile a short sequential classifier.

  The stack is a 512-unit ReLU Dense layer over 784 input features, a
  Dropout layer with rate 0.2, and a 10-unit softmax Dense layer. The model
  is compiled with the 'adam' optimizer, sparse categorical cross-entropy
  loss and the 'accuracy' metric.

  Returns:
    The compiled tf.keras Sequential model.
  """
  # Layers are constructed in the same order as they are stacked, so their
  # auto-generated names stay dense / dropout / dense_1.
  hidden_layer = tf.keras.layers.Dense(
      512, activation=tf.keras.activations.relu, input_shape=(784,))
  dropout_layer = tf.keras.layers.Dropout(0.2)
  output_layer = tf.keras.layers.Dense(
      10, activation=tf.keras.activations.softmax)

  network = tf.keras.models.Sequential(
      [hidden_layer, dropout_layer, output_layer])

  network.compile(
      optimizer='adam',
      loss=tf.keras.losses.sparse_categorical_crossentropy,
      metrics=['accuracy'])

  return network


# Instantiate the network and print its layer summary.
model = create_model()
model.summary()

# Checkpoint callback: monitors val_acc and, with save_best_only=True, writes
# the full model (save_weights_only=False) to `model_file_path`.
check_point_ = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_file_path,
    monitor='val_acc',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1)

# Early-stopping callback: also monitors val_acc, with a patience of 10 epochs.
early_stopping_ = tf.keras.callbacks.EarlyStopping(
    monitor='val_acc',
    mode='auto',
    min_delta=0,
    patience=10,
    baseline=None,
    verbose=1)

# CSV logger: writes the per-epoch metrics to `log_file_path`, overwriting any
# previous log (append=False).
csv_logger_ = tf.keras.callbacks.CSVLogger(
    filename=log_file_path,
    separator=',',
    append=False)

training_callbacks = [check_point_, early_stopping_, csv_logger_]

# Kept as the last expression so the returned History object is the cell output.
model.fit(
    train_images,
    train_labels,
    epochs=epochs,
    validation_data=(test_images, test_labels),
    callbacks=training_callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               401920    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________
Train on 1000 samples, validate on 1000 samples
Epoch 1/100
Epoch 00001: val_acc improved from -inf to 0.79900, saving model to trained_model.h5
Epoch 2/100
Epoch 00002: val_acc improved from 0.79900 to 0.82200, saving model to trained_model.h5
Epoch 3/100
Epoch 00003: val_acc improved from 0.82200 to 0.85700, saving model to trained_model.h5
Epoch 4/100
Epoch 00004: val_acc did not improve from 0

<tensorflow.python.keras.callbacks.History at 0x7fd754fc5a90>

In [6]:
# List the current working directory, e.g. to check that the model and log
# files were written.
os.listdir()

['keras_fit_with_callbacks_mnist.ipynb',
 'training_log.csv',
 'trained_model.h5']

In [7]:
import pandas as pd

# Cap rendered DataFrames at 8 rows so the log preview stays compact.
pd.set_option('display.max_rows', 8)

# Read the per-epoch metrics from the CSV log and show them as the cell output.
log_df = pd.read_csv(log_file_path)
log_df

Unnamed: 0,epoch,acc,loss,val_acc,val_loss
0,0,0.668,1.147379,0.799,0.683778
1,1,0.871,0.430139,0.822,0.557218
2,2,0.918,0.286902,0.857,0.463132
3,3,0.950,0.206597,0.851,0.452389
...,...,...,...,...,...
19,19,1.000,0.008753,0.870,0.429417
20,20,1.000,0.007619,0.876,0.427311
21,21,1.000,0.007250,0.873,0.437199
22,22,1.000,0.006450,0.876,0.430452


References:


https://keras.io/callbacks/

https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint