## Pretrained VGGNet

In [1]:
# Operations for floydhub
# !git clone https://github.com/machrisaa/tensorflow-vgg tensorflow_vgg
# !ln -s /data .

In [1]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm

vgg_dir = 'tensorflow_vgg/'
vgg_name = 'vgg19'

# Make sure vgg exists
if not isdir(vgg_dir):
    raise Exception("VGG directory doesn't exist!")

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

vgg_param_file = '{}{}.npy'.format(vgg_dir, vgg_name)
if not isfile(vgg_param_file):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc= vgg_name + ' Parameters') as pbar:
        urlretrieve(
            'https://s3.amazonaws.com/content.udacity-data.com/nd101/{}.npy'.format(vgg_name),
            vgg_param_file,
            pbar.hook)
else:
    print("Parameter file already exists!")

Parameter file already exists!


In [2]:
import os

import numpy as np
import tensorflow as tf

from tensorflow_vgg import vgg19
from tensorflow_vgg import utils

In [3]:
data_dir = 'data/'
train_dir = data_dir + 'train/'

classes = [d for d in os.listdir(train_dir) if os.path.isdir(train_dir + d)]

In [16]:
import csv

batch_size = 10
batch = []

with tf.Session() as sess:
    vgg = vgg19.Vgg19()
    input_ = tf.placeholder(tf.float32, [None, 224, 224, 3])
    
    with tf.name_scope("content_vgg"):
        vgg.build(input_)
    
    
    codes = None
    labels = []

    for d_type in ['train', 'valid', 'test']:
        for c in classes:
            image_dir = '{}{}/{}/'.format(data_dir, d_type, c) # e.g. data/train/melanoma/
            files = os.listdir(image_dir)
            for i, file in enumerate(files, 1):
                # load image and resize it to 224x224
                file_path = os.path.join(image_dir, file)
                img = utils.load_image(file_path)
                batch.append(img.reshape((1, 224, 224, 3)))
                labels.append(c)

                if i % batch_size == 0 or i == len(files):
                    images = np.concatenate(batch)

                    feed_dict = {input_: images}
                    codes_batch = sess.run(vgg.relu6, feed_dict=feed_dict)

                    if codes is None:
                        codes = codes_batch
                    else:
                        codes = np.concatenate((codes, codes_batch))

                    batch = []
                    print('data: {}, class: {}, {} / {} images processed'.format(d_type, c, i, len(files)))

        # write codes to file
        with open('{}_codes'.format(d_type), 'w') as f:
            codes.tofile(f)
            codes = None

        # write labels to file
        with open('{}_labels'.format(d_type), 'w') as f:
            writer = csv.writer(f, delimiter='\n')
            writer.writerow(labels)
            labels = []

/Users/junji/Development/udacity-deeplearning/dermatologist-ai/tensorflow_vgg/vgg19.npy
npy file loaded
build model started
build model finished: 1s


  warn("The default mode, 'constant', will be changed to 'reflect' in "


data: train, class: melanoma, 10 / 374 images processed
data: valid, class: melanoma, 10 / 30 images processed
data: test, class: melanoma, 10 / 117 images processed


## Building the Classifier

In [26]:
# read codes and labels from file
import csv

# train data
with open('train_labels') as f:
    reader = csv.reader(f, delimiter='\n')
    train_labels = np.array([each for each in reader if len(each) > 0]).squeeze()
with open('train_codes') as f:
    train_x = np.fromfile(f, dtype=np.float32)
    train_x = train_x.reshape((len(train_labels), -1))
    
# valid data
with open('valid_labels') as f:
    reader = csv.reader(f, delimiter='\n')
    valid_labels = np.array([each for each in reader if len(each) > 0]).squeeze()
with open('valid_codes') as f:
    val_x = np.fromfile(f, dtype=np.float32)
    val_x = val_x.reshape((len(valid_labels), -1))
    
# test data
with open('test_labels') as f:
    reader = csv.reader(f, delimiter='\n')
    test_labels = np.array([each for each in reader if len(each) > 0]).squeeze()
with open('test_codes') as f:
    test_x = np.fromfile(f, dtype=np.float32)
    test_x = test_x.reshape((len(test_labels), -1))

In [27]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit(classes)

train_y = lb.transform(train_labels)
val_y = lb.transform(valid_labels)
test_y = lb.transform(test_labels)

In [28]:
print("Train shapes (x, y):", train_x.shape, train_y.shape)
print("Validation shapes (x, y):", val_x.shape, val_y.shape)
print("Test shapes (x, y):", test_x.shape, test_y.shape)

Train shapes (x, y): (10, 4096) (10, 3)
Validation shapes (x, y): (10, 4096) (10, 3)
Test shapes (x, y): (10, 4096) (10, 3)


In [29]:
inputs_ = tf.placeholder(tf.float32, shape=[None, train_x.shape[1]])
labels_ = tf.placeholder(tf.int64, shape=[None, train_y.shape[1]])

fc = tf.contrib.layers.fully_connected(inputs_, 256)
    
logits = tf.contrib.layers.fully_connected(fc, train_y.shape[1], activation_fn=None)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=labels_, logits=logits)
cost = tf.reduce_mean(cross_entropy)

optimizer = tf.train.AdamOptimizer().minimize(cost)

predicted = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(predicted, 1), tf.argmax(labels_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [30]:
def get_batches(x, y, n_batches=10):
    """ Return a generator that yields batches from arrays x and y. """
    batch_size = len(x)//n_batches
    
    for ii in range(0, n_batches*batch_size, batch_size):
        # If we're not on the last batch, grab data with size batch_size
        if ii != (n_batches-1)*batch_size:
            X, Y = x[ii: ii+batch_size], y[ii: ii+batch_size] 
        # On the last batch, grab the rest of the data
        else:
            X, Y = x[ii:], y[ii:]
        # I love generators
        yield X, Y

## Training

In [31]:
!mkdir checkpoints

epochs = 10
iteration = 0
saver = tf.train.Saver()
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    for e in range(epochs):
        for x, y in get_batches(train_x, train_y):
            feed = {inputs_: x,
                    labels_: y}
            loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            print("Epoch: {}/{}".format(e+1, epochs),
                  "Iteration: {}".format(iteration),
                  "Training loss: {:.5f}".format(loss))
            iteration += 1
            
            if iteration % 5 == 0:
                feed = {inputs_: val_x, labels_: val_y}
                val_acc = sess.run(accuracy, feed_dict=feed)
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Validation Acc: {:.4f}".format(val_acc))
    saver.save(sess, "checkpoints/skin_diseases.ckpt")

mkdir: checkpoints: File exists
Epoch: 1/10 Iteration: 0 Training loss: 3.11444
Epoch: 1/10 Iteration: 1 Training loss: 0.00000
Epoch: 1/10 Iteration: 2 Training loss: 0.00000
Epoch: 1/10 Iteration: 3 Training loss: 0.00000
Epoch: 1/10 Iteration: 4 Training loss: 0.00000
Epoch: 0/10 Iteration: 5 Validation Acc: 1.0000
Epoch: 1/10 Iteration: 5 Training loss: 0.00000
Epoch: 1/10 Iteration: 6 Training loss: 0.00000
Epoch: 1/10 Iteration: 7 Training loss: 0.00000
Epoch: 1/10 Iteration: 8 Training loss: 0.00000
Epoch: 1/10 Iteration: 9 Training loss: 0.00000
Epoch: 0/10 Iteration: 10 Validation Acc: 1.0000
Epoch: 2/10 Iteration: 10 Training loss: 0.00000
Epoch: 2/10 Iteration: 11 Training loss: 0.00000
Epoch: 2/10 Iteration: 12 Training loss: 0.00000
Epoch: 2/10 Iteration: 13 Training loss: 0.00000
Epoch: 2/10 Iteration: 14 Training loss: 0.00000
Epoch: 1/10 Iteration: 15 Validation Acc: 1.0000
Epoch: 2/10 Iteration: 15 Training loss: 0.00000
Epoch: 2/10 Iteration: 16 Training loss: 0.00000

## Testing

In [32]:
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    
    feed = {inputs_: test_x,
            labels_: test_y}
    test_acc = sess.run(accuracy, feed_dict=feed)
    print("Test accuracy: {:.4f}".format(test_acc))

INFO:tensorflow:Restoring parameters from checkpoints/skin_diseases.ckpt
Test accuracy: 1.0000
