In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Listing that directory (run the cell with "Run" or Shift+Enter) prints the
# names of the entries it contains.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# [MNIST Dataset Source](http://yann.lecun.com/exdb/mnist/)
# [I am following this YouTube tutorial](https://www.youtube.com/watch?v=2FmcHiLCwTU&vl=en)

### Goal 
* Build a classifier that can look at a 28x28 image of a handwritten digit and classify the digit (0-9).
  * the "Hello World" of Deep Learning
* Personal goals: 
  * understand Tensorflow's python wrapper & Tensorflow a little bit better
  * understand neural networks a little bit better
  * understand some basics of machine learning a little bit better

### Tools
* Tensorflow
* Python

In [None]:
# `input_data` is TensorFlow's tutorial helper module for downloading/reading MNIST.
# Why this import path is used: https://stackoverflow.com/a/37540230/5411712
from tensorflow.examples.tutorials.mnist import input_data

# Read MNIST into the "MNIST_Data" directory; one_hot=True requests one-hot labels
# (matching the [None, 10] label placeholder used later in the notebook).
mnist = input_data.read_data_sets(train_dir="MNIST_Data", one_hot=True)
print(mnist)

# This is what the data looks like:
![mnist_data](https://i.imgur.com/mKstG9R.png)

In [None]:
import tensorflow as tf

# set "hyperparameters" (knobs & dials)

In [None]:
# Step size for each weight update; 0.01 is a common, reasonable default.
# Too big >> overshoots the optimal solution; too small >> takes too long to reach it.
learning_rate = 0.01

# Number of outer training-loop iterations (each one loops over all mini-batches).
training_iteration = 30

# Number of training examples fed to the optimizer per step.
batch_size = 100

# The average cost is printed every `display_step` training iterations.
display_step = 2

## Learning Rate:
![learning_rate](https://i.imgur.com/3L1qbdT.png)

## notes
#### tensorflow
* Tensorflow "model" = "data flow graph"
* Graph has nodes called "operations"
  * basic units of math (e.g: addition, multiplication, fancy-schmancy-multivar-calculus, etc)
  * input: tensor
  * output: tensor
* tensor = a multidimensional array (a generalization of vectors and matrices to any number of dimensions)

#### conventions
* x = feature vector / the thing(s) that help us do the prediction
* y = "output classes" / the thing we want to predict
* "**placeholder**" = a variable that will have data assigned to it later

In [None]:
# TF graph inputs: placeholders get their values later, through `feed_dict`.
# Each 28px x 28px image is "flattened" into a 1D array of 28*28 = 784 pixels;
# `None` leaves the number of images per batch unspecified.
x = tf.placeholder(tf.float32, shape=[None, 784])
# Labels for the 10 classes to be "classified" (digits 0-9).
y = tf.placeholder(tf.float32, shape=[None, 10])

# Model parameters (the values that training adjusts), all starting at zero.
# W: one weight for every (pixel, class) pair.
W = tf.Variable(tf.zeros(shape=[784, 10]))
# b: one bias per class (lets us shift the regression line to fit the data).
b = tf.Variable(tf.zeros(shape=[10]))

# An image is represented as a matrix of pixel values:
![Imgur](https://i.imgur.com/XYyI1ha.png)

# It gets flattened into a 1D array to be used as the feature vector:
![Imgur](https://i.imgur.com/d9ZvYPV.png)

In [None]:
# Name scopes group related nodes so the graph is easier to read in TensorBoard.
with tf.name_scope("Wx_b") as scope:
    # First scope: a linear model (logistic regression).
    # One raw score per digit class for every input image.
    logits = tf.matmul(x, W) + b
    # Softmax normalises the 10 scores into values that sum to 1, so they can be
    # read as class probabilities. ReLU / sigmoid are typically used as
    # hidden-layer activations rather than as a multi-class output layer.
    # `tf.nn` reference: https://www.tensorflow.org/api_docs/python/tf/nn
    model = tf.nn.softmax(logits)

# Logistic Regression:
![Imgur](https://i.imgur.com/rrkOONc.png)

In [None]:
# Histogram summaries collect the values of W and b so their distributions
# can be visualised later in TensorBoard.
# https://github.com/tensorflow/serving/issues/270
w_h = tf.summary.histogram(name="weights", values=W)
b_h = tf.summary.histogram(name="biases", values=b)

In [None]:
# More name scopes will clean up the graph representation
with tf.name_scope("cost_function") as scope:
    # Second scope: the error to minimise, using cross entropy as the cost function.
    # Cross entropy = -sum(y * log(model)), summed over every class and every
    # example in the batch.
    # The probabilities are clipped away from 0 before taking the log: if a
    # softmax output underflows to exactly 0, log(0) = -inf and 0 * -inf = NaN,
    # which would turn the whole cost (and every later gradient) into NaN.
    cost_function = -tf.reduce_sum(y * tf.log(tf.clip_by_value(model, 1e-10, 1.0)))
    # Scalar summary so the cost can be plotted over time in TensorBoard
    tf.summary.scalar("cost_function", cost_function)

In [None]:
with tf.name_scope("train") as scope:
    # Last scope: the training algorithm, plain gradient descent.
    gradient_descent = tf.train.GradientDescentOptimizer(learning_rate)
    # Running `optimizer` performs one update of the variables that lowers `cost_function`.
    optimizer = gradient_descent.minimize(cost_function)

# Gradient Descent:
![Imgur](https://i.imgur.com/i6WW4gH.png)

In [None]:
# Op that initializes all tf.Variable objects; it is executed with `sess.run(init)`.
# `tf.global_variables_initializer()` is the replacement for the deprecated
# `tf.initialize_all_variables()`.
init = tf.global_variables_initializer()

In [None]:
# Merge the summaries registered so far (weights/biases histograms and the cost
# scalar) into 1 operation, so a single `sess.run` call can collect them together.
# https://github.com/tensorflow/tensorflow/issues/7737
merged_summary_op = tf.summary.merge_all()

In [None]:
# Echo the hyperparameters used for this run, aligned with tabs.
# sep="" keeps the text identical to plain string concatenation.
print("learning_rate\t\t=\t", learning_rate, sep="")
print("training_iteration\t=\t", training_iteration, sep="")
print("batch_size\t\t=\t", batch_size, sep="")
print("display_step\t\t=\t", display_step, sep="")

# Training

In [None]:
# Start training by launching a session that executes the data flow graph
with tf.Session() as sess:
    sess.run(init)

    # Write the graph and the summaries to ./logs for the TensorBoard visualizations.
    # `graph=` is used because the `graph_def=` keyword is deprecated.
    # https://stackoverflow.com/a/41483033/5411712
    summary_writer = tf.summary.FileWriter('./logs', graph=sess.graph)

    # Number of mini-batches per training iteration; it does not change between
    # iterations, so it is computed once.
    total_batch = mnist.train.num_examples // batch_size

    try:
        # Training cycle
        for i in range(training_iteration):
            # Printed periodically to check the model is "improving"; the goal is to minimize cost
            avg_cost = 0.0
            # Loop over all batches.
            # The index is named `batch_idx`, not `b`, so it does not overwrite the
            # global bias Variable `b` defined earlier in the notebook.
            for batch_idx in range(total_batch):
                batch_xs, batch_ys = mnist.train.next_batch(batch_size)
                feed = {x: batch_xs, y: batch_ys}
                # Fit the model on this batch.
                # `optimizer` is Gradient Descent; used for 'backpropagation'
                sess.run(optimizer, feed_dict=feed)
                # After the update, fetch the batch cost and the merged summaries
                # together in a single run instead of two separate ones.
                batch_cost, summary_str = sess.run(
                    [cost_function, merged_summary_op], feed_dict=feed)
                # Accumulate the average cost over the batches
                avg_cost += batch_cost / total_batch
                # `i * total_batch + batch_idx` is the number of batches processed so
                # far across all iterations, i.e. a step index that keeps increasing,
                # which gives every summary a distinct position on TensorBoard's x-axis.
                summary_writer.add_summary(summary_str, i * total_batch + batch_idx)
            # Display logs per iteration step
            if (i % display_step == 0):
                print("iteration:", '%04d' % (i+1), "avg_cost=", "{:9f}".format(avg_cost))
    finally:
        # Flush pending events to disk and release the log file, even if training fails
        summary_writer.close()

    print("\nTraining completed!\n")

    # Test the model
    # `model` holds the predicted class probabilities and `y` holds the true labels;
    # a prediction is correct when the most probable class matches the label.
    predictions = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
    # Calculate accuracy: the fraction of correct predictions
    accuracy = tf.reduce_mean(tf.cast(predictions, "float"))
    print("Accuracy:", sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels}))

## Notice that the ```avg_cost``` values decrease with each logged iteration. This means that the gradient descent algorithm is minimizing the cost function. 
### I suppose if we ran the code with ```training_iteration``` set to a larger number then we would expect to see little to no improvement on the accuracy since the ```avg_cost``` seems to level off at around 18.

# Viewing all the summaries in ***Tensorboard***
##### this should be done locally so make sure to **download** the ```kernel.ipynb``` file and run ```tensorboard --logdir=./logs``` in the command line 
* ***Note:*** ```pip install tensorflow``` may be required to import tensorflow

#### main graph
<img src="https://i.imgur.com/f8LgApJ.png" width="400">
#### tensorboard_auxiliary_nodes
![tensorboard_auxiliary_nodes](https://i.imgur.com/ZABjzeR.png)
#### tensorboard_cost_function
![tensorboard_cost_function](https://i.imgur.com/yTCklib.png)
#### tensorboard_biases_distribution
![tensorboard_biases_distribution](https://i.imgur.com/iyZupnI.png)
#### tensorboard_weights_distribution
![tensorboard_weights_distribution](https://i.imgur.com/DxgMGZt.png)
#### tensorboard_biases_histogram
![tensorboard_biases_histogram](https://i.imgur.com/Af06kgc.png)
#### tensorboard_weights_histogram
![tensorboard_weights_histogram](https://i.imgur.com/wcbcIdy.png)

In [None]:
# optionally run the command in the notebook itself by uncommenting the line below
#!tensorboard --logdir=./logs

# Future Learning
* What is PyTorch and how does it compare to Tensorflow?
   * https://www.youtube.com/watch?v=nbJ-2G2GXL0
     * Would PyTorch reduce the need to define "placeholders" because that was, frankly, weird to see in a language like python?
* [But what *is* a Neural Network?](https://youtu.be/aircAruvnKk)
  * [and how do they learn?](https://youtu.be/IHZwWFHWa-w) 
  * [and what is backprop really doing?](https://youtu.be/Ilg3gGewQ5U)
  * [and how does backprop use calculus?](https://youtu.be/tIeHLnjs5U8)
* How can the accuracy found above ```0.9254``` be improved to closer to ```0.95``` or ```0.99```?
   * To what extent does changing the ```learning_rate``` or ```training_iteration``` or ```batch_size``` affect the accuracy? 
     * I don't think batch_size should have any effect. 
     * with ```training_iteration=30``` and ```learning_rate=0.01``` the algorithm ran in less than a few minutes and achieved ```0.9254```. Perhaps allowing it to train for several hours would boost the accuracy?
       * [relevant quora question](http://qr.ae/TUGJid)
* How long would it take a human toddler to "classify" digits (0-9)? An hour or two? maybe less? Of course you would need to hold their attention to the task, haha! 
* What would happen to the accuracy if I modified the test data or the training data to include **random noise** or even attempt the [**one pixel attack**](https://arxiv.org/abs/1710.08864)
  * [video about one pixel attack](https://youtu.be/SA4YEAWVpbk)
>   "Now, note that this also means that we have to be able to look into the neural network and have access to the confidence values." - [Károly Zsolnai-Fehér](https://youtu.be/SA4YEAWVpbk?t=155)
  *  Would a method for reducing NeuralNet accuracy that only sees output classes, *without accuracy values*, be analogous to humans discovering optical illusions? haha! 😂
  * [it seems some researchers have tried to trick AI that learned on MNIST data](https://arxiv.org/pdf/1801.02612.pdf)
  * [another one](https://arxiv.org/pdf/1608.04644.pdf)
  * [and another one](https://arxiv.org/pdf/1807.10335.pdf)
  * [and more](https://www.google.com/search?safe=off&q=one+pixel+attack+"mnist")
* What would happen to the accuracy if I changed **```tf.nn.softmax```** to **```relu```** or even **```sigmoid```** or **```tanh```**?
  * [learn more about activation functions](https://youtu.be/-7scQpJT7uo)