# The Official MNIST Example from TensorFlow

In [None]:
import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

FLAGS = None


def main(_):
    """Train a softmax-regression classifier on MNIST and print its test accuracy.

    The dataset is read from the directory given by the module-level
    ``FLAGS.data_dir``. The model is a single linear layer (``x @ W + b``)
    trained with softmax cross-entropy for 1000 mini-batch gradient-descent
    steps (batch size 100, learning rate 0.5). The accuracy on the test split
    is printed to stdout.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.
    """
    # Import data (labels are requested in one-hot form).
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Define the model. It is a single linear layer; combined with the softmax
    # cross-entropy loss below this is softmax (multinomial logistic)
    # regression. You may call this the computational `Graph`.
    # A placeholder is a box we feed data into at run time.
    # A variable is something TensorFlow updates during optimization.
    x = tf.placeholder(tf.float32, [None, 784])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b  # unnormalized scores (logits), one per class

    # Placeholder that receives the one-hot labels.
    y_ = tf.placeholder(tf.float32, [None, 10])

    # Define the loss: softmax cross-entropy between labels and logits,
    # averaged over the batch with reduce_mean.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

    # Gradient-descent training op; nothing is executed until a session runs it.
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    # Evaluation ops: `correct_prediction` compares the predicted class
    # (index of the largest logit) with the true class (index of the 1 in the
    # one-hot label); `accuracy` is the mean of those comparisons cast to
    # float.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # The graph is fully defined; a session is needed to evaluate it.
    # Using the session as a context manager guarantees it is closed (and its
    # resources released) even if training or evaluation raises.
    with tf.Session() as sess:
        # Give every variable its initial value.
        sess.run(tf.global_variables_initializer())

        # Training: each iteration fetches the next batch of 100 examples and
        # runs `train_step` with the images fed into `x` and the labels
        # into `y_`.
        for _ in range(1000):
            batch_xs, batch_ys = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

        # Testing: evaluate `accuracy` on the whole test split.
        print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                            y_: mnist.test.labels}))

if __name__ == '__main__':
    # Command-line handling: only --data_dir is declared here; any other
    # arguments are collected separately in `unparsed` instead of causing
    # an error.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_dir',
        default='/tmp/tensorflow/mnist/input_data',
        type=str,
        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()

    # Hand control to tf.app.run, forwarding the program name followed by
    # the arguments the parser above did not recognise.
    tf.app.run(argv=[sys.argv[0]] + unparsed, main=main)

# Exercise

Let's try something fun.

1. Add code that computes the accuracy of our model on the training data as well, so we can compare it with the test accuracy and see whether the model is overfitting.
2. If we train the network for 5000 or 10000 iterations instead of 1000, what happens? Does the network perform better? Does it overfit more?
3. If we change the optimizer to AdagradOptimizer, what happens?