# Chapter 9 – Up and running with TensorFlow

In [7]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import tensorflow as tf

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "tensorflow"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Creating and running a graph

In [8]:
import tensorflow as tf

reset_graph()

x = tf.Variable(3, name="x")
y = tf.Variable(4, name="y")
f = x*x*y + y + 2

In [9]:
f

<tf.Tensor 'add_1:0' shape=() dtype=int32>

In [10]:
sess = tf.Session()
sess.run(x.initializer)
sess.run(y.initializer)
result = sess.run(f)
print(result)

42


In [11]:
sess.close()

In [12]:
with tf.Session() as sess:
    x.initializer.run()
    y.initializer.run()
    result = f.eval()

In [13]:
result

42

In [14]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    result = f.eval()

In [15]:
result

42

In [16]:
init = tf.global_variables_initializer()

In [17]:
sess = tf.InteractiveSession()
init.run()
result = f.eval()
print(result)

42


# Managing graphs

In [19]:
reset_graph()

x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()

True

In [22]:
graph = tf.Graph()
# temporarily creating a default graph
with graph.as_default():
    x2 = tf.Variable(2)

x2.graph is graph

True

In [23]:
x2.graph is tf.get_default_graph()

False

Note: When you evaluate a node, TensorFlow automatically determines the set of nodes that it depends on and it evaluates these nodes first.

# Linear Regression

## Using the Normal Equation

In [30]:
import numpy as np
from sklearn.datasets import fetch_california_housing

reset_graph()

"""
The fetch_california_housing function returns:
    -------
    dataset : dict-like object with the following attributes:
    dataset.data : ndarray, shape [20640, 8]
        Each row corresponding to the 8 feature values in order.
    dataset.target : numpy array of shape (20640,)
        Each value corresponds to the average house value in units of 100,000.
    dataset.feature_names : array of length 8
        Array of ordered feature names used in the dataset.
    dataset.DESCR : string
        Description of the California housing dataset.
"""
housing = fetch_california_housing()

m, n = housing.data.shape
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data]

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

with tf.Session() as sess:
    theta_value = theta.eval()

In [33]:
theta_value

array([[-3.7185181e+01],
       [ 4.3633747e-01],
       [ 9.3952334e-03],
       [-1.0711310e-01],
       [ 6.4479220e-01],
       [-4.0338000e-06],
       [-3.7813708e-03],
       [-4.2348403e-01],
       [-4.3721911e-01]], dtype=float32)

Compare with Scikit-Learn

In [35]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing.data, housing.target.reshape(-1, 1))

print(np.r_[lin_reg.intercept_.reshape(-1, 1), lin_reg.coef_.T])

[[-3.69419202e+01]
 [ 4.36693293e-01]
 [ 9.43577803e-03]
 [-1.07322041e-01]
 [ 6.45065694e-01]
 [-3.97638942e-06]
 [-3.78654265e-03]
 [-4.21314378e-01]
 [-4.34513755e-01]]


## Using Batch Gradient Descent

Gradient Descent requires scaling the feature vectors first. We could do this using TF, but let's just use Scikit-Learn for now.

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_housing_data = scaler.fit_transform(housing.data)
scaled_housing_data_plus_bias = np.c_[np.ones((m, 1)), scaled_housing_data]

In [38]:
print(scaled_housing_data_plus_bias.mean(axis=0))
print(scaled_housing_data_plus_bias.mean(axis=1))
print(scaled_housing_data_plus_bias.mean())
print(scaled_housing_data_plus_bias.shape)

[ 1.00000000e+00  6.60969987e-17  5.50808322e-18  6.60969987e-17
 -1.06030602e-16 -1.10161664e-17  3.44255201e-18 -1.07958431e-15
 -8.52651283e-15]
[ 0.38915536  0.36424355  0.5116157  ... -0.06612179 -0.06360587
  0.01359031]
0.11111111111111005
(20640, 9)


### Manually computing the gradients

In [59]:
reset_graph()

n_epochs = 2000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
gradients = 2/m * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)

Epoch 0 MSE = 9.161543
Epoch 100 MSE = 0.71450067
Epoch 200 MSE = 0.5667049
Epoch 300 MSE = 0.5555719
Epoch 400 MSE = 0.5488112
Epoch 500 MSE = 0.5436362
Epoch 600 MSE = 0.5396294
Epoch 700 MSE = 0.53650916
Epoch 800 MSE = 0.5340678
Epoch 900 MSE = 0.5321474
Epoch 1000 MSE = 0.5306294
Epoch 1100 MSE = 0.5294236
Epoch 1200 MSE = 0.5284622
Epoch 1300 MSE = 0.5276914
Epoch 1400 MSE = 0.5270721
Epoch 1500 MSE = 0.5265715
Epoch 1600 MSE = 0.5261663
Epoch 1700 MSE = 0.5258372
Epoch 1800 MSE = 0.5255686
Epoch 1900 MSE = 0.5253492
Best theta:
[[ 2.0685523e+00]
 [ 8.5735834e-01]
 [ 1.2673114e-01]
 [-3.1273839e-01]
 [ 3.4245053e-01]
 [-1.9602410e-03]
 [-4.0592730e-02]
 [-8.1566954e-01]
 [-7.8930187e-01]]


### Using autodiff

Same as above except for the gradients = ... line:

In [53]:
reset_graph()

n_epochs = 2000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [54]:
"""
The gradients() function takes an op (in this case mse) and a list of variables (in this case just theta), and it 
creates a list of ops (one per variable) to compute the gradients of the op with regards to each variable.
"""
gradients = tf.gradients(mse, [theta])[0]

We can compare the training with gradients computed using autodiff vs the above manual computation

In [58]:
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)

Epoch 0 MSE = 9.161543
Epoch 100 MSE = 0.7145006
Epoch 200 MSE = 0.566705
Epoch 300 MSE = 0.5555719
Epoch 400 MSE = 0.5488112
Epoch 500 MSE = 0.5436362
Epoch 600 MSE = 0.5396294
Epoch 700 MSE = 0.5365092
Epoch 800 MSE = 0.5340678
Epoch 900 MSE = 0.5321474
Epoch 1000 MSE = 0.53062946
Epoch 1100 MSE = 0.5294236
Epoch 1200 MSE = 0.5284622
Epoch 1300 MSE = 0.5276914
Epoch 1400 MSE = 0.5270721
Epoch 1500 MSE = 0.5265715
Epoch 1600 MSE = 0.5261664
Epoch 1700 MSE = 0.5258372
Epoch 1800 MSE = 0.52556866
Epoch 1900 MSE = 0.5253492
Best theta:
[[ 2.0685525e+00]
 [ 8.5735834e-01]
 [ 1.2673113e-01]
 [-3.1273845e-01]
 [ 3.4245059e-01]
 [-1.9602366e-03]
 [-4.0592730e-02]
 [-8.1566942e-01]
 [-7.8930181e-01]]


More on audodiff: How could you find the partial derivatives of the following function with regards to a and b?

In [60]:
def my_func(a, b):
    z = 0
    for i in range(100):
        z = a * np.cos(z + i) + z * np.sin(b - i)
    return z

In [61]:
my_func(0.2, 0.3)

-0.21253923284754916

In [63]:
reset_graph()

a = tf.Variable(0.2, name="a")
b = tf.Variable(0.3, name="b")
z = tf.constant(0.0, name="z0")
for i in range(100):
    z = a * tf.cos(z + i) + z * tf.sin(b - i)

grads = tf.gradients(z, [a, b])
init = tf.global_variables_initializer()

In [64]:
with tf.Session() as sess:
    init.run()
    print(z.eval())
    print(sess.run(grads))

-0.21253741
[-1.1388494, 0.19671395]


### Using a GradientDescentOptimizer

TensowFlow also provides a number of optimizers out of the box, including a Graidient Descent optimizer. You can simply replace the preceding `graidents = ...` and `raining_op = ...` lines with the following code:
```
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
```

In [68]:
reset_graph()

n_epochs = 2000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [69]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

In [72]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)

Epoch 0 MSE = 9.161543
Epoch 100 MSE = 0.7145006
Epoch 200 MSE = 0.566705
Epoch 300 MSE = 0.5555719
Epoch 400 MSE = 0.5488112
Epoch 500 MSE = 0.5436362
Epoch 600 MSE = 0.5396294
Epoch 700 MSE = 0.5365092
Epoch 800 MSE = 0.5340678
Epoch 900 MSE = 0.5321474
Epoch 1000 MSE = 0.53062946
Epoch 1100 MSE = 0.5294236
Epoch 1200 MSE = 0.5284622
Epoch 1300 MSE = 0.5276914
Epoch 1400 MSE = 0.5270721
Epoch 1500 MSE = 0.5265715
Epoch 1600 MSE = 0.5261664
Epoch 1700 MSE = 0.5258372
Epoch 1800 MSE = 0.52556866
Epoch 1900 MSE = 0.5253492
Best theta:
[[ 2.0685525e+00]
 [ 8.5735834e-01]
 [ 1.2673113e-01]
 [-3.1273845e-01]
 [ 3.4245059e-01]
 [-1.9602366e-03]
 [-4.0592730e-02]
 [-8.1566942e-01]
 [-7.8930181e-01]]


### Using a momentum optimizer

If you want to use a different type of optimizer, you just need to change one line, For example, you can use a momentum optimizer (which converges much faster than Graident Descent):
```
optimizer = tf.train.MomentumOptimizer(leraning_rate=learning_rate, momentum=0.9)
```

In [79]:
reset_graph()

n_epochs = 2000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [80]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=0.9)
training_op = optimizer.minimize(mse)

In [81]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)

Epoch 0 MSE = 9.161543
Epoch 100 MSE = 0.53056407
Epoch 200 MSE = 0.5250113
Epoch 300 MSE = 0.52441096
Epoch 400 MSE = 0.52433306
Epoch 500 MSE = 0.52432257
Epoch 600 MSE = 0.52432126
Epoch 700 MSE = 0.52432096
Epoch 800 MSE = 0.52432096
Epoch 900 MSE = 0.52432096
Epoch 1000 MSE = 0.52432096
Epoch 1100 MSE = 0.52432084
Epoch 1200 MSE = 0.52432096
Epoch 1300 MSE = 0.5243209
Epoch 1400 MSE = 0.5243209
Epoch 1500 MSE = 0.5243209
Epoch 1600 MSE = 0.5243209
Epoch 1700 MSE = 0.5243209
Epoch 1800 MSE = 0.5243209
Epoch 1900 MSE = 0.5243209
Best theta:
[[ 2.068558  ]
 [ 0.82962054]
 [ 0.11875187]
 [-0.26552895]
 [ 0.30569792]
 [-0.00450293]
 [-0.03932633]
 [-0.8998828 ]
 [-0.8705383 ]]


#### Note: an optimizer is essentially a wrapper for gradient computation and model weights updating.

# Feeding data to the training algorithm

## Placeholder nodes

We now modify the previous code to implement Mini-batch Gradient Descent. For this, we need a way to replace `X` and `y` at every iteration with the next mini-batch. The simplest way to do this is to use placeholder nodes, These nodes are special because they do not perform any computation, they just output the data you tell them to output at runtime. They are typically used to pass the training data to TensorFlow during training. If you don't specify a value at runtime for a placehoder, you get an exception.

In [84]:
reset_graph()

A = tf.placeholder(tf.float32, shape=(None, 3)) # A is a 2D tensor with 3 columns but arbitraily number of rows
B = A + 5
with tf.Session() as sess:
    B_val_1 = B.eval(feed_dict={A: [[1, 2, 3]]})
    B_val_2 = B.eval(feed_dict={A: [[4, 5, 6], [7, 8, 9]]})

print(B_val_1)

[[6. 7. 8.]]


In [85]:
print(B_val_2)

[[ 9. 10. 11.]
 [12. 13. 14.]]


## Mini-batch Gradient Descent

In [100]:
reset_graph()

n_epochs = 2000
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [101]:
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

In [102]:
def fetch_batch(epoch, batch_index, batch_size):
    np.random.seed(epoch * n_batches + batch_index)  # not shown in the book
    indices = np.random.randint(m, size=batch_size)  # not shown
    X_batch = scaled_housing_data_plus_bias[indices] # not shown
    y_batch = housing.target.reshape(-1, 1)[indices] # not shown
    return X_batch, y_batch

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval(feed_dict={X: X_batch, y: y_batch}))
            
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        
    best_theta = theta.eval()
    
print("Best theta:")
print(best_theta)

Epoch 0 MSE = 7.6678786
Epoch 100 MSE = 0.41995794
Epoch 200 MSE = 0.5047168
Epoch 300 MSE = 0.37059167
Epoch 400 MSE = 0.4730612
Epoch 500 MSE = 0.4555484
Epoch 600 MSE = 0.45037347
Epoch 700 MSE = 0.51029336
Epoch 800 MSE = 0.5903121
Epoch 900 MSE = 0.8080311
Epoch 1000 MSE = 0.4821706
Epoch 1100 MSE = 0.3496415
Epoch 1200 MSE = 0.6035093
Epoch 1300 MSE = 0.39144334
Epoch 1400 MSE = 0.45811677
Epoch 1500 MSE = 0.39785823
Epoch 1600 MSE = 0.6045395
Epoch 1700 MSE = 0.3359803
Epoch 1800 MSE = 0.39687693
Epoch 1900 MSE = 0.48503765
Best theta:
[[ 2.049128  ]
 [ 0.82035035]
 [ 0.11519123]
 [-0.2725699 ]
 [ 0.2420951 ]
 [-0.01339601]
 [-0.06191554]
 [-0.89969516]
 [-0.87075526]]


# Visualizing the graph

## Inside Jupyter

In [103]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = b"<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [104]:
show_graph(tf.get_default_graph())

## Using TensorBoard

In [106]:
reset_graph()

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

In [109]:
n_epochs = 2000
learning_rate = 0.01
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [111]:
mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [114]:
with tf.Session() as sess:
    sess.run(init) 

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval(feed_dict={X: X_batch, y: y_batch}))

        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            if batch_index % 10 == 0:
                summary_str = mse_summary.eval(feed_dict={X: X_batch, y: y_batch})
                step = epoch * n_batches + batch_index
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

    best_theta = theta.eval()

Epoch 0 MSE = 9.445263
Epoch 100 MSE = 0.41995794
Epoch 200 MSE = 0.5047168
Epoch 300 MSE = 0.37059167
Epoch 400 MSE = 0.4730612
Epoch 500 MSE = 0.4555484
Epoch 600 MSE = 0.45037347
Epoch 700 MSE = 0.51029336
Epoch 800 MSE = 0.5903121
Epoch 900 MSE = 0.8080311
Epoch 1000 MSE = 0.4821706
Epoch 1100 MSE = 0.3496415
Epoch 1200 MSE = 0.6035093
Epoch 1300 MSE = 0.39144334
Epoch 1400 MSE = 0.45811677
Epoch 1500 MSE = 0.39785823
Epoch 1600 MSE = 0.6045395
Epoch 1700 MSE = 0.3359803
Epoch 1800 MSE = 0.39687693
Epoch 1900 MSE = 0.48503765


In [115]:
file_writer.close()

# Name scopes

In [126]:
reset_graph()

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

In [128]:
n_epochs = 2000
learning_rate = 0.01
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")

# place error and mse into the loss scope
with tf.name_scope("loss") as scope:
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name="mse")

In [119]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [120]:
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval(feed_dict={X: X_batch, y: y_batch}))

        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            if batch_index % 10 == 0:
                summary_str = mse_summary.eval(feed_dict={X: X_batch, y: y_batch})
                step = epoch * n_batches + batch_index
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

    best_theta = theta.eval()

file_writer.flush()
file_writer.close()
print("Best theta:")
print(best_theta)

Epoch 0 MSE = 9.445263
Epoch 100 MSE = 0.41995794
Epoch 200 MSE = 0.5047168
Epoch 300 MSE = 0.37059167
Epoch 400 MSE = 0.4730612
Epoch 500 MSE = 0.4555484
Epoch 600 MSE = 0.45037347
Epoch 700 MSE = 0.51029336
Epoch 800 MSE = 0.5903121
Epoch 900 MSE = 0.8080311
Epoch 1000 MSE = 0.4821706
Epoch 1100 MSE = 0.3496415
Epoch 1200 MSE = 0.6035093
Epoch 1300 MSE = 0.39144334
Epoch 1400 MSE = 0.45811677
Epoch 1500 MSE = 0.39785823
Epoch 1600 MSE = 0.6045395
Epoch 1700 MSE = 0.3359803
Epoch 1800 MSE = 0.39687693
Epoch 1900 MSE = 0.48503765
Best theta:
[[ 2.049128  ]
 [ 0.82035035]
 [ 0.11519123]
 [-0.2725699 ]
 [ 0.2420951 ]
 [-0.01339601]
 [-0.06191554]
 [-0.89969516]
 [-0.87075526]]


In [129]:
print(error.op.name)

loss/sub


In [130]:
print(mse.op.name)

loss/mse


# Modularity

An ugly flat code:

In [132]:
reset_graph()

n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")

w1 = tf.Variable(tf.random_normal((n_features, 1)), name="weights1")
w2 = tf.Variable(tf.random_normal((n_features, 1)), name="weights2")
b1 = tf.Variable(0.0, name="bias1")
b2 = tf.Variable(0.0, name="bias2")

z1 = tf.add(tf.matmul(X, w1), b1, name="z1")
z2 = tf.add(tf.matmul(X, w2), b2, name="z2")

relu1 = tf.maximum(z1, 0., name="relu1")
relu2 = tf.maximum(z1, 0., name="relu2")  # Oops, cut&paste error! Did you spot it?

output = tf.add(relu1, relu2, name="output")

Much better, using a function to build the ReLUs:

In [133]:
reset_graph()

def relu(X):
    w_shape = (int(X.get_shape()[1]), 1)
    w = tf.Variable(tf.random_normal(w_shape), name="weights")
    b = tf.Variable(0.0, name="bias")
    z = tf.add(tf.matmul(X, w), b, name="z")
    return tf.maximum(z, 0., name="relu")

n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]

In [134]:
file_writer = tf.summary.FileWriter("logs/relu1", tf.get_default_graph())

Even better using name scopes:

In [135]:
reset_graph()

def relu(X):
    with tf.name_scope("relu"):
        w_shape = (int(X.get_shape()[1]), 1)                          # not shown in the book
        w = tf.Variable(tf.random_normal(w_shape), name="weights")    # not shown
        b = tf.Variable(0.0, name="bias")                             # not shown
        z = tf.add(tf.matmul(X, w), b, name="z")                      # not shown
        return tf.maximum(z, 0., name="max")         

In [136]:
n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")

file_writer = tf.summary.FileWriter("logs/relu2", tf.get_default_graph())
file_writer.close()