Dropout is arguably the most popular regularization technique in deep learning. Let's check again how it works.

In [2]:
# Notebook authorship metadata; none of the cells shown here read these values.
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [10]:
import numpy as np
import tensorflow as tf

In [6]:
# Display the installed TensorFlow version; the cells below use the TF 1.x
# graph/session API (tf.Session, tf.train.GradientDescentOptimizer).
tf.__version__

'1.5.0'

In [7]:
class Graph:
    """Tiny 1-3-1 network used to show how dropout changes activations and gradients.

    The input (1.0), target (2.0) and initial weights are hard-coded so the
    forward and backward passes can be checked by hand.

    Attributes:
        h: Hidden activations after ReLU and dropout, shape (1, 3).
        pred: Network output, shape (1, 1).
        loss: Mean squared error between `pred` and the target.
        grads_and_vars: [(grad, weight1), (grad, weight2)], always in this order.
        train_op: Op applying one gradient-descent step (learning rate 0.001).
    """

    def __init__(self, keep_prob=1.):
        """Build the ops in the current default graph.

        Args:
            keep_prob: Probability of keeping each hidden unit; 1.0 disables dropout.
        """
        # Inputs: a single example, expanded from shape (1,) to (1, 1)
        x = tf.expand_dims(tf.convert_to_tensor([1.], tf.float32), 1)
        y = tf.expand_dims(tf.convert_to_tensor([2.], tf.float32), 1)

        # Variables
        w1 = tf.Variable([[0.1, -0.1, 0.2]], dtype=tf.float32, name="weight1")

        # fully connected layer (a.k.a. dense layer)
        h = tf.nn.relu(tf.matmul(x, w1))
        # Kept units are scaled by 1 / keep_prob, dropped units are set to 0.
        self.h = tf.nn.dropout(h, keep_prob=keep_prob)

        # Readout layer
        w2 = tf.Variable([[0.2], [0.1], [-0.1]], dtype=tf.float32, name="weight2")
        self.pred = tf.matmul(self.h, w2)

        # Loss
        self.loss = tf.reduce_mean(tf.square(self.pred - y)) # L2 loss

        # Training scheme
        optimizer = tf.train.GradientDescentOptimizer(0.001)
        # Restrict to this model's own weights. The default var_list is every
        # trainable variable in the graph, so variables left over from an
        # earlier Graph in the same default graph would show up here with a
        # None gradient and shift the (weight1, weight2) ordering.
        self.grads_and_vars = optimizer.compute_gradients(self.loss, var_list=[w1, w2])
        self.train_op = optimizer.apply_gradients(self.grads_and_vars)



In [8]:
def run(keep_prob=1.):
    """Build the toy network and execute one forward/backward training step.

    The model is built in its own private `tf.Graph`, so repeated calls do not
    pile variables up in the global default graph and no
    `tf.reset_default_graph()` is needed between calls.

    Args:
        keep_prob: Probability of keeping each hidden unit; 1.0 disables dropout.

    Returns:
        Tuple `(loss, hidden_units, output, grad1, grad2)` of NumPy values:
        the loss, the hidden activations after dropout, the prediction, and the
        gradients taken from the first and second entries of
        `Graph.grads_and_vars` (weight1 and weight2 respectively).
    """
    graph = tf.Graph()
    with graph.as_default():
        g = Graph(keep_prob=keep_prob)
        # Create the initializer inside the same graph as the variables.
        init_op = tf.global_variables_initializer()

    with tf.Session(graph=graph) as sess:
        sess.run(init_op)

        # feed-forward and back-prop for getting gradients
        # (everything is fetched in a single call so all values come from the
        # same dropout mask)
        loss, hidden_units, output, _, _grads_and_vars = sess.run(
            [g.loss, g.h, g.pred, g.train_op, g.grads_and_vars])
        grad1 = _grads_and_vars[0][0]
        grad2 = _grads_and_vars[1][0]

    return loss, hidden_units, output, grad1, grad2


In [9]:
# Start from an empty default graph so this cell can be re-run safely; the
# dropout experiment further down does the same before its own run() call.
tf.reset_default_graph()
loss, hidden_units, output, grad1, grad2 = run()

### Results without dropout

In [18]:
# Report the baseline (no-dropout) results, one labelled value per line.
for label, value in (
    ("loss=", loss),
    ("hidden units=", hidden_units),
    ("y_hat=", output),
    ("grad1=", grad1),
    ("grad2=", grad2),
):
    print(label, value)

loss= 4.0
hidden units= [[0.1 0.  0.2]]
y_hat= [[0.]]
grad1= [[-0.8  0.   0.4]]
grad2= [[-0.4]
 [ 0. ]
 [-0.8]]


<img src="no-dropout.png" width=500. align="left">

The second hidden unit is zeroed by the ReLU (its pre-activation is -0.1), so gradients flow back only through the first and third units.

### Results with dropout (keep_prob = 0.5)

In [24]:
# Clear the default graph before building the model again, this time with
# dropout enabled (each hidden unit is kept with probability 0.5).
tf.reset_default_graph()
_loss, _hidden_units, _output, _grad1, _grad2 = run(keep_prob=.5)

In [25]:
# Report the dropout results, one labelled value per line.
for label, value in (
    ("loss=", _loss),
    ("hidden units=", _hidden_units),
    ("y_hat=", _output),
    ("grad1=", _grad1),
    ("grad2=", _grad2),
):
    print(label, value)

loss= 3.8416002
hidden units= [[0.2 0.  0. ]]
y_hat= [[0.04]]
grad1= [[-1.57  0.    0.  ]]
grad2= [[-0.78]
 [ 0.  ]
 [ 0.  ]]


<img src="dropout.png" width=500. align="left">

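In this run the third unit was dropped (its activation became zero), while the surviving first unit was scaled by 1/keep_prob = 2 (0.1 → 0.2), so the gradient flows back through only the first unit. Because the dropout mask is sampled at random, re-running the cell may drop a different unit.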