In [2]:
# NOTE: this is a custom cell that contains the common imports I personally
# use; some of them may not be necessary for the following examples

# DL framework
import tensorflow as tf

from datetime import datetime

# common packages
import numpy as np
import os # handling file i/o
import sys
import math
import time # timing epochs

# for ordered dict when building layer components
import collections

# plotting pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import pyplot
from matplotlib import colors # making colors consistent
from mpl_toolkits.axes_grid1 import make_axes_locatable # colorbar helper

# read image
### from imageio import imread
# + data augmentation
from scipy import ndimage
from scipy import misc

# used for manually saving best params
import pickle

# for shuffling data batches
from sklearn.utils import shuffle

# const
SEED = 42  # default seed used by reset_graph for both NumPy and TensorFlow

# Helper to make the output consistent
def reset_graph(seed=SEED):
    """Reset the default TensorFlow graph and re-seed the random generators.

    Args:
        seed: value passed to both ``np.random.seed`` and
            ``tf.set_random_seed``; defaults to ``SEED``.
    """
    # NumPy seeding does not depend on the TensorFlow graph, so do it first.
    np.random.seed(seed)
    # The TensorFlow seed is set after the reset so it applies to the new
    # default graph.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# helper to create dirs if they don't already exist
def maybe_create_dir(dir_path):
    """Create ``dir_path`` (including missing parents) and report the outcome.

    The directory is created directly and the "already exists" error is
    caught, instead of checking ``os.path.exists`` first. That removes the
    window between the check and the creation in which something else could
    create the path and make ``os.makedirs`` raise.

    Prints ``"<dir_path> created"`` when the directory was created and
    ``"<dir_path> already exists"`` when the path was already present.

    Args:
        dir_path: path of the directory to create.
    """
    try:
        os.makedirs(dir_path)
    except FileExistsError:
        print("{} already exists".format(dir_path))
    else:
        print("{} created".format(dir_path))
    
# Set the TF C++ log level to suppress messages, unless an error.
# '2' filters INFO and WARNING but still lets ERROR through; the previous
# value '3' filtered errors as well, which contradicted the intent above.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Important Version information
print("Python: {}".format(sys.version_info[:]))
print('TensorFlow: {}'.format(tf.__version__))

# Check if using GPU (query the device name once and reuse the result)
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    print('No GPU')

# Start the notebook from a clean, seeded default graph.
reset_graph()

Python: (3, 5, 4, 'final', 0)
TensorFlow: 1.4.0
Default GPU Device: /device:GPU:0


In [3]:
# `/record_holder` will (hopefully) contain our tf_records file
# by the end of this notebook
# FINAL_DIR is the output directory that `numpy_to_tfrecords` writes its
# `<setType>.tfrecords` files into and that `return_batched_iter` reads from.
FINAL_DIR = "./record_holder/lesion/balanced/"
# Create the directory up front (prints whether it was created or already existed).
maybe_create_dir(FINAL_DIR)

./record_holder/lesion/balanced/ already exists


In [4]:
ROOT_DIR = "./numpy/sigmoid/lesion/224/"

# List the files stored directly in ROOT_DIR as a quick sanity check.
files = sorted(
    name for name in os.listdir(ROOT_DIR)
    if os.path.isfile(os.path.join(ROOT_DIR, name))
)
for filename in files:
    print(filename)


def _load_split_array(prefix, split):
    """Load ``<prefix>_<split>.npy`` (e.g. ``X_train.npy``) from ROOT_DIR."""
    return np.load(os.path.join(ROOT_DIR, "{}_{}.npy".format(prefix, split)))


# Load every array by its explicit file name. Picking entries by position
# from a sorted directory listing silently assigns the wrong array as soon
# as ROOT_DIR contains an extra file or a subdirectory.
X_test = _load_split_array("X", "test")
X_train = _load_split_array("X", "train")
X_val = _load_split_array("X", "val")
y_test = _load_split_array("y", "test")
y_train = _load_split_array("y", "train")
y_val = _load_split_array("y", "val")

# Every split needs exactly one label per example.
assert len(X_test) == len(y_test), "test: features/labels length mismatch"
assert len(X_train) == len(y_train), "train: features/labels length mismatch"
assert len(X_val) == len(y_val), "val: features/labels length mismatch"

# Unused sketch of feeding the arrays through placeholders (kept for
# reference; not executed).
# reset_graph()
# X_test_ph =  tf.placeholder(X_test.dtype, X_test.shape)
# X_train_ph = tf.placeholder(X_train.dtype, X_train.shape)
# X_val_ph = tf.placeholder(X_val.dtype, X_val.shape)
# y_test_ph = tf.placeholder(y_test.dtype, y_test.shape)
# y_train_ph = tf.placeholder(y_train.dtype, y_train.shape)
# y_val_ph = tf.placeholder(y_val.dtype, y_val.shape)

# def create_dataset_obj(X, y):
#     dataset = tf.data.Dataset.from_tensor_slices((X, y))
#     return dataset

# tr_dataset = create_dataset_obj(X_train_ph, 
#                                 y_train_ph)
# val_dataset = create_dataset_obj(X_val_ph, 
#                                  y_val_ph)
# test_dataset = create_dataset_obj(X_test_ph, 
#                                   y_test_ph)
print("done")

X_test.npy
X_train.npy
X_val.npy
y_test.npy
y_train.npy
y_val.npy
done


In [5]:
def _int64_feature(value):
    """Wrap a single integer ``value`` in a ``tf.train.Feature`` (Int64List)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes ``value`` in a ``tf.train.Feature`` (BytesList)."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [6]:
def numpy_to_tfrecords(features, lables, setType):
    """Serialize paired examples and labels into ``FINAL_DIR/<setType>.tfrecords``.

    One ``tf.train.Example`` is written per example, with two features:
    ``"<setType>/label"`` (int64) and ``"<setType>/image"`` (the raw bytes
    of the example array). Progress is printed every 25 examples.

    Args:
        features: indexable sequence of NumPy arrays, one per example.
        lables: sequence of integer labels, same length as ``features``
            (parameter name kept as-is for backward compatibility).
        setType: split name (e.g. ``"train"``); used for the output file
            name and as the prefix of the feature keys.

    Raises:
        ValueError: if ``features`` and ``lables`` differ in length.
    """
    # Fail before touching the output file rather than part-way through it.
    if len(features) != len(lables):
        raise ValueError(
            "features and lables must have the same length, got {} and {}".format(
                len(features), len(lables)))

    tfrecords_file_name = str(setType) + '.tfrecords'
    tfrecords_path = os.path.join(FINAL_DIR, tfrecords_file_name)

    labelName = str(setType) + '/label'
    featureName = str(setType) + '/image'

    # The context manager closes the writer even if serialization fails.
    with tf.python_io.TFRecordWriter(tfrecords_path) as writer:
        for i, (img, label) in enumerate(zip(features, lables)):
            # create features
            # NOTE(review): only the raw bytes are stored, not dtype/shape.
            # `_parse_function` in this notebook decodes them as float32 and
            # reshapes to [224, 224, 3] -- confirm `features` matches that.
            feature = {labelName: _int64_feature(label),
                       featureName: _bytes_feature(tf.compat.as_bytes(img.tobytes()))}

            # create example protocol buffer
            example = tf.train.Example(features=tf.train.Features(feature=feature))

            writer.write(example.SerializeToString())

            if i % 25 == 0:
                print("{} {} written".format(i, setType))

    sys.stdout.flush()
    print("done")

In [7]:
# Serialize the test split into FINAL_DIR/test.tfrecords.
numpy_to_tfrecords(X_test, y_test, "test")

0 test written
25 test written
50 test written
75 test written
100 test written
125 test written
150 test written
175 test written
200 test written
225 test written
250 test written
275 test written
300 test written
325 test written
done


In [8]:
# Serialize the validation split into FINAL_DIR/val.tfrecords.
numpy_to_tfrecords(X_val, y_val, "val")

0 val written
25 val written
50 val written
75 val written
100 val written
125 val written
150 val written
175 val written
200 val written
225 val written
250 val written
done


In [9]:
# Serialize the training split into FINAL_DIR/train.tfrecords.
numpy_to_tfrecords(X_train, y_train, "train")

0 train written
25 train written
50 train written
75 train written
100 train written
125 train written
150 train written
175 train written
200 train written
225 train written
250 train written
275 train written
300 train written
325 train written
350 train written
375 train written
400 train written
425 train written
450 train written
475 train written
500 train written
525 train written
550 train written
575 train written
600 train written
625 train written
650 train written
675 train written
700 train written
725 train written
750 train written
775 train written
800 train written
825 train written
850 train written
875 train written
900 train written
925 train written
950 train written
975 train written
1000 train written
1025 train written
1050 train written
1075 train written
done


## Reading the files

# NOTE!
This is likely the *wrong* way to approach this problem. I'm currently trying to find a better way to read the TFRecords, ideally with a reusable `_parse_function` that does not rely on a GLOBAL var.

In [10]:
# Fallback split name for `_parse_function` when none is passed explicitly.
# `return_batched_iter` assigns it before calling `dataset.map(_parse_function)`.
GLOBAL_SET_TYPE = None

def _parse_function(example_proto, setType=None, image_shape=(224, 224, 3)):
    """Decode one serialized ``tf.train.Example`` into ``(image, label)`` tensors.

    Args:
        example_proto: scalar string tensor holding a serialized Example.
        setType: split name that prefixes the feature keys
            (``"<setType>/image"`` and ``"<setType>/label"``). When ``None``
            (the default) the module-level ``GLOBAL_SET_TYPE`` is used, so
            existing ``dataset.map(_parse_function)`` calls keep working.
            Pass it explicitly to avoid the global, e.g.
            ``dataset.map(lambda p: _parse_function(p, setType='train'))``.
        image_shape: shape the decoded image is reshaped to; defaults to the
            previously hard-coded ``(224, 224, 3)``.

    Returns:
        Tuple ``(image, label)``: a float32 tensor of shape ``image_shape``
        and an int64 scalar tensor.
    """
    if setType is None:
        setType = GLOBAL_SET_TYPE

    labelName = str(setType) + '/label'
    featureName = str(setType) + '/image'
    feature = {featureName: tf.FixedLenFeature([], tf.string),
               labelName: tf.FixedLenFeature([], tf.int64)}

    # decode
    parsed_features = tf.parse_single_example(example_proto, features=feature)

    # convert image data from string to number
    # (the stored bytes are interpreted as float32 values)
    image = tf.decode_raw(parsed_features[featureName], tf.float32)
    image = tf.reshape(image, list(image_shape))
    label = tf.cast(parsed_features[labelName], tf.int64)

    # [do any preprocessing here]

    return image, label

In [11]:
def return_batched_iter(setType, data_params, sess):
    """Build and initialize an iterator over ``FINAL_DIR/<setType>.tfrecords``.

    The pipeline is: parse each record, shuffle, batch, then repeat.

    Args:
        setType: split name; selects the record file and is stored in
            ``GLOBAL_SET_TYPE`` for ``_parse_function``.
        data_params: dict providing ``'buffer_size'`` (shuffle buffer),
            ``'batch_size'`` and ``'n_epochs'`` (repeat count).
        sess: session used to run the iterator's initializer.

    Returns:
        The initialized iterator; call ``get_next()`` on it for batches.
    """
    global GLOBAL_SET_TYPE
    # _parse_function reads the split name from this module-level variable.
    GLOBAL_SET_TYPE = setType

    record_path = os.path.join(FINAL_DIR, str(setType) + '.tfrecords')

    filenames_ph = tf.placeholder(tf.string, shape=[None])

    dataset = (
        tf.data.TFRecordDataset(filenames_ph)
        .map(_parse_function)  # Parse the record into tensors.
        .shuffle(buffer_size=data_params['buffer_size'])
        .batch(data_params['batch_size'])
        .repeat(data_params['n_epochs'])
    )

    iterator = dataset.make_initializable_iterator()

    # initialize: feed the actual file path into the filenames placeholder
    sess.run(iterator.initializer, feed_dict={filenames_ph: [record_path]})

    return iterator

In [12]:
with tf.Session() as sess:
    data_params = {}
    data_params['n_epochs'] = 5
    data_params['batch_size'] = 16
    data_params['buffer_size'] = 128
    
    # training
    tr_iter = return_batched_iter('train', data_params, sess)
    next_tr_element = tr_iter.get_next()
    
    # validation
    val_iter = return_batched_iter('val', data_params, sess)
    next_val_element = val_iter.get_next()
    
    # The datasets are already repeated `n_epochs` times, so looping until
    # OutOfRangeError would consume every repetition during the first epoch
    # and leave nothing for the later ones. Bound each epoch to one pass
    # instead. Batching happens before repeating, so each pass ends with its
    # own (possibly smaller) final batch -> ceil(N / batch_size) batches.
    batch_size = data_params['batch_size']
    n_tr_batches = int(math.ceil(len(X_train) / float(batch_size)))
    n_val_batches = int(math.ceil(len(X_val) / float(batch_size)))
    
    for e in range(data_params['n_epochs']):
        print("e: {}".format(e))
        
        # training (one pass over the training set)
        for _step in range(n_tr_batches):
            try:
                _ = sess.run(next_tr_element)
            except tf.errors.OutOfRangeError:
                # safety net: the iterator ran out earlier than expected
                break

        # validation (after training on entire training set, in this case)
        for _step in range(n_val_batches):
            try:
                _ = sess.run(next_val_element)
            except tf.errors.OutOfRangeError:
                break
    
    print("done with \'training\'")

e: 0
e: 1
e: 2
e: 3
e: 4
done with 'training'


In [13]:
with tf.Session() as sess:
    # Single pass, one example per batch; a shuffle buffer of 1 means
    # no shuffling.
    data_params = {
        'n_epochs': 1,
        'batch_size': 1,
        'buffer_size': 1,
    }

    test_iter = return_batched_iter('test', data_params, sess)
    next_test_element = test_iter.get_next()

    # Print the label batch (second element of each fetched pair) until the
    # iterator is exhausted.
    i = 0
    while True:
        try:
            i += 1
            test_label = sess.run(next_test_element)[1]
            print("i: {} => {}".format(i, test_label))
        except tf.errors.OutOfRangeError:
            break

i: 1 => [1]
i: 2 => [0]
i: 3 => [1]
i: 4 => [1]
i: 5 => [1]
i: 6 => [1]
i: 7 => [1]
i: 8 => [1]
i: 9 => [0]
i: 10 => [0]
i: 11 => [1]
i: 12 => [0]
i: 13 => [0]
i: 14 => [0]
i: 15 => [0]
i: 16 => [0]
i: 17 => [0]
i: 18 => [0]
i: 19 => [1]
i: 20 => [0]
i: 21 => [0]
i: 22 => [1]
i: 23 => [1]
i: 24 => [0]
i: 25 => [0]
i: 26 => [0]
i: 27 => [0]
i: 28 => [0]
i: 29 => [1]
i: 30 => [1]
i: 31 => [0]
i: 32 => [0]
i: 33 => [1]
i: 34 => [0]
i: 35 => [1]
i: 36 => [0]
i: 37 => [1]
i: 38 => [1]
i: 39 => [0]
i: 40 => [1]
i: 41 => [0]
i: 42 => [0]
i: 43 => [0]
i: 44 => [1]
i: 45 => [0]
i: 46 => [1]
i: 47 => [1]
i: 48 => [1]
i: 49 => [0]
i: 50 => [0]
i: 51 => [0]
i: 52 => [1]
i: 53 => [0]
i: 54 => [0]
i: 55 => [0]
i: 56 => [0]
i: 57 => [0]
i: 58 => [0]
i: 59 => [0]
i: 60 => [0]
i: 61 => [1]
i: 62 => [1]
i: 63 => [0]
i: 64 => [1]
i: 65 => [1]
i: 66 => [1]
i: 67 => [1]
i: 68 => [1]
i: 69 => [0]
i: 70 => [1]
i: 71 => [0]
i: 72 => [0]
i: 73 => [1]
i: 74 => [1]
i: 75 => [0]
i: 76 => [0]
i: 77 => [0]
i: 78 =>