In [1]:
import numpy as np
print("Loading training data ...")
data_train = np.genfromtxt("data/LSDA2017GalaxiesTrain.csv",delimiter=",", dtype=float)
Xtrain, ytrain = data_train[:,:-1], data_train[:,-1]
print("Loaded training data: n=%i, d=%i" % (Xtrain.shape[0], Xtrain.shape[1]))

data_validation = np.genfromtxt("data/LSDA2017GalaxiesValidate.csv",delimiter=",", dtype=float)
Xvd, yvd = data_validation[:,:-1], data_validation[:,-1]
print("Loaded validation data: n=%i, d=%i" % (Xvd.shape[0], Xvd.shape[1]))

data_test = np.genfromtxt("data/LSDA2017GalaxiesTest.csv",delimiter=",", dtype=float)
Xtest, ytest = data_test[:,:-1], data_test[:,-1]
print("Loaded test data: n=%i, d=%i" % (Xtest.shape[0], Xtest.shape[1]))

Loading training data ...
Loaded training data: n=4000, d=18
Loaded validation data: n=1000, d=18
Loaded test data: n=5000, d=18


In [2]:
ytrain_var = np.std(ytrain)**2
print("Variance for train label is %f" % ytrain_var)

def tf_input(X,y):
    # Preprocess your data here...

    #X = tf.constant(X)
    #y = tf.constant(y)
    return X, y

Variance for train label is 0.010498


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import shutil
import os

# Splits data into mini-batches
# Batches are not randomized/shuffled, shuffling the data in mini-batch learning typically improves the performance
class Batcher:
    'Splits data into mini-batches'
    def __init__(self, data, batchSize):
        self.data, self.target = data
        self.batchSize = batchSize
        self.batchStartIndex = 0
        self.batchStopIndex = 0
        self.noData = self.data.shape[0]
    def nextBatch(self):
        self.batchStartIndex = self.batchStopIndex % self.noData
        self.batchStopIndex = min(self.batchStartIndex + self.batchSize, self.noData)
        return self.data[self.batchStartIndex:self.batchStopIndex], self.target[self.batchStartIndex:self.batchStopIndex]

# Flags
try:
   FLAGS
except NameError:
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_string('summary_dir', 'tensor_logs', 'directory to put the summary data')
    flags.DEFINE_string('data_dir', 'data', 'directory with data')
    flags.DEFINE_integer('maxIter', 30000, 'number of iterations')
    flags.DEFINE_integer('batchSize', 128, 'batch size')
    flags.DEFINE_integer('noHidden1', 64, 'size of first hidden layer')
    flags.DEFINE_integer('noHidden2', 32, 'size of second hidden layer')
    flags.DEFINE_float('lr', 0.001, 'initial learning rate')
    # when we reload the flags, we clear the output dir
    if not os.path.exists(flags.FLAGS.summary_dir):
        os.makedirs(flags.FLAGS.summary_dir)
    else:
        shutil.rmtree(flags.FLAGS.summary_dir)
        os.makedirs(flags.FLAGS.summary_dir)


    
X_train ,y_train = tf_input(Xtrain,ytrain)
# Number of training data points
noTrain = X_train.shape[0]
print("Numer of training data points:", noTrain)

# Input dimension
inDim = X_train.shape[1]

X_vd,y_vd = tf_input(Xvd,yvd)
X_test ,y_test = tf_input(Xtest,ytest)

Numer of training data points: 4000


In [4]:
shutil.rmtree(flags.FLAGS.summary_dir)

# Create graph
sess = tf.Session()

# Initialize placeholders
x_data = tf.placeholder(shape=[None, inDim], dtype=tf.float32, name='input')
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='target')

# Define variables
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=np.std(ytrain)) # restrict to +/- 2*stddev
    return tf.Variable(initial, name='weights')

def bias_variable(shape):
    initial = tf.truncated_normal(shape, stddev=np.std(ytrain))
    return tf.Variable(initial, name='bias')


# Define model
with tf.name_scope('layer1') as scope:
    W_1 = weight_variable([inDim, FLAGS.noHidden1])
    b_1 = bias_variable([FLAGS.noHidden1])
    y_1 = tf.matmul(x_data , W_1 ) + b_1
    y_1 = tf.nn.relu(y_1)
    

with tf.name_scope('layer2') as scope:
    W_2 = weight_variable([FLAGS.noHidden1, 1])
    b_2 = bias_variable([1])
    model_output = tf.matmul(y_1, W_2) + b_2


# Declare loss function
loss = tf.reduce_mean(tf.square(model_output - y_target), name='mean_squared_error')
tf.summary.scalar('loss', loss)

# Declare optimizer
my_opt =  tf.train.AdamOptimizer(FLAGS.lr)
train_step = my_opt.minimize(loss)


# Logging
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train')
test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test')
validate_writer=  tf.summary.FileWriter(FLAGS.summary_dir + '/validate')
writer = tf.summary.FileWriter(FLAGS.summary_dir, sess.graph)
saver = tf.train.Saver() # for storing the best network



In [5]:
# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

# Best validation accuracy seen so far
bestValidation = np.inf

# Mini-batches for training
batcher = Batcher((X_train,y_train), FLAGS.batchSize)

# Training loop
for i in range(FLAGS.maxIter):
    xTrain, yTrain = batcher.nextBatch()
    sess.run(train_step, feed_dict={x_data: xTrain, y_target: np.transpose([yTrain])})
    summary = sess.run(merged, feed_dict={x_data: xTrain, y_target: np.transpose([yTrain])})
    train_writer.add_summary(summary, i)
    if((i+1)%100==0):
        print("Iteration:",i+1,"/",FLAGS.maxIter)
        #summary = sess.run(merged, feed_dict={x_data: X_test, y_target: np.transpose([y_test])})
        #test_writer.add_summary(summary, i)
        currentValidation, summary = sess.run([loss, merged], feed_dict={x_data: X_vd, y_target: np.transpose([y_vd])})
        validate_writer.add_summary(summary, i)
        if(currentValidation < bestValidation):
            bestValidation = currentValidation
            saver.save(sess=sess, save_path=FLAGS.summary_dir + '/bestNetwork')
            print("\tbetter network stored,",currentValidation,"<",bestValidation)

# Print values after last training step
print("final training accuracy:", sess.run(loss, feed_dict={x_data: X_train, y_target: np.transpose([y_train])}),
      "final test accuracy: ", sess.run(loss, feed_dict={x_data: X_test, y_target: np.transpose([y_test])}),
      "final validation accuracy: ", sess.run(loss, feed_dict={x_data: X_vd, y_target: np.transpose([y_vd])}))

# Load the network with the lowest validation error
saver.restore(sess=sess, save_path=FLAGS.summary_dir + '/bestNetwork')
print("best training accuracy:", sess.run(loss, feed_dict={x_data: X_train, y_target: np.transpose([y_train])}),
      "best test accuracy: ", sess.run(loss, feed_dict={x_data: X_test, y_target: np.transpose([y_test])}),
      "best validation accuracy: ", sess.run(loss, feed_dict={x_data: X_vd, y_target: np.transpose([y_vd])}))


Iteration: 100 / 30000
	better network stored, 0.00489184 < 0.00489184
Iteration: 200 / 30000
	better network stored, 0.00306679 < 0.00306679
Iteration: 300 / 30000
	better network stored, 0.00260233 < 0.00260233
Iteration: 400 / 30000
	better network stored, 0.00242511 < 0.00242511
Iteration: 500 / 30000
	better network stored, 0.00229551 < 0.00229551
Iteration: 600 / 30000
	better network stored, 0.00227807 < 0.00227807
Iteration: 700 / 30000
Iteration: 800 / 30000
	better network stored, 0.00211168 < 0.00211168
Iteration: 900 / 30000
	better network stored, 0.00167308 < 0.00167308
Iteration: 1000 / 30000
Iteration: 1100 / 30000
Iteration: 1200 / 30000
	better network stored, 0.00158813 < 0.00158813
Iteration: 1300 / 30000
	better network stored, 0.00151571 < 0.00151571
Iteration: 1400 / 30000
Iteration: 1500 / 30000
Iteration: 1600 / 30000
Iteration: 1700 / 30000
Iteration: 1800 / 30000
	better network stored, 0.0013398 < 0.0013398
Iteration: 1900 / 30000
Iteration: 2000 / 30000
Ite

Iteration: 28400 / 30000
Iteration: 28500 / 30000
Iteration: 28600 / 30000
Iteration: 28700 / 30000
Iteration: 28800 / 30000
Iteration: 28900 / 30000
Iteration: 29000 / 30000
Iteration: 29100 / 30000
Iteration: 29200 / 30000
Iteration: 29300 / 30000
Iteration: 29400 / 30000
Iteration: 29500 / 30000
Iteration: 29600 / 30000
Iteration: 29700 / 30000
Iteration: 29800 / 30000
Iteration: 29900 / 30000
Iteration: 30000 / 30000
final training accuracy: 0.000940901 final test accuracy:  0.00137959 final validation accuracy:  0.00115164
INFO:tensorflow:Restoring parameters from tensor_logs/bestNetwork
best training accuracy: 0.000932567 best test accuracy:  0.00140713 best validation accuracy:  0.000914765
