In [46]:
import tensorflow as tf
import numpy as np 
from sklearn.datasets import fetch_california_housing
from IPython.display import clear_output, Image, display, HTML
from scipy.spatial import distance
import math

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Implement Data Preprocess here ######
housing = fetch_california_housing()
print("Shape of dataset:", housing.data.shape)
print("Shape of label:", housing.target.shape)

# Split training and test set
training_X = housing.data[:int(housing.data.shape[0]*0.9)]
training_y = housing.target[:int(housing.target.shape[0]*0.9)]
testing_X = housing.data[int(housing.data.shape[0]*0.9):]
testing_y = housing.target[int(housing.target.shape[0]*0.9):]

# Preprocessing: Distances of the houses to the center of them
center_training = np.reshape(np.asarray([np.mean(training_X[:,6]), np.mean(training_X[:,7])]), (1,2))
center_testing = np.reshape(np.asarray([np.mean(testing_X[:,6]), np.mean(testing_X[:,7])]), (1,2))
training_X = np.concatenate((training_X, np.transpose(distance.cdist(center_training ,training_X[:,6:8]))), axis = 1)
testing_X = np.concatenate((testing_X, np.transpose(distance.cdist(center_testing ,testing_X[:,6:8]))), axis = 1)

# Padding ones for the bias
training_X = np.concatenate((training_X, np.ones((training_X.shape[0], 1))), axis = 1)
testing_X = np.concatenate((testing_X, np.ones((testing_X.shape[0], 1))), axis = 1)

# Reshape y
training_y = np.reshape(training_y, (training_y.shape[0], 1))
testing_y = np.reshape(testing_y, (testing_y.shape[0], 1))

###### Implement Data Preprocess here ######

# Getting params
n_samples = training_X.shape[0]
n_dim = training_X.shape[1]

# Reset graph for each run of the code
tf.reset_default_graph()

# Declare input placeholders
X_train = tf.placeholder(tf.float64, [None, n_dim], name="X_train")
y_train = tf.placeholder(tf.float64, [None, 1], name="y_train")

X_test = tf.placeholder(tf.float64, [None, n_dim], name="X_test")
y_test = tf.placeholder(tf.float64, [None, 1], name="y_test")


# Calculating the weight using only training set
w = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(tf.matrix_transpose(X_train), X_train, name='XtX')), tf.matrix_transpose(X_train)), y_train, name='w')

# Prediction result
y_hat_train = tf.matmul(X_train, w, name='y_hat')
y_hat_test = tf.matmul(X_test, w, name='y_hat')

# Error rate
error_rate_train = tf.reduce_mean(tf.abs(y_hat_train - y_train)/y_train)
error_rate_test = tf.reduce_mean(tf.abs(y_hat_test - y_test)/y_test)

init = tf.global_variables_initializer()


###### Start TF session ######
with tf.Session() as sess:
    
    sess.run(init)
    
    show_graph(tf.get_default_graph().as_graph_def())
    
    # Compute w
    w_res, e_train, e_test = sess.run([w, error_rate_train, error_rate_test], feed_dict = {X_train:training_X, y_train: training_y, X_test: testing_X, y_test: testing_y})
    
    # Compute error rates on training and testing sets
    print('Result weights:', w_res)
    print('Error rate in training: ', e_train)
    print('Error rate in testing: ', e_test)

###### Start TF session ######

Shape of dataset: (20640, 8)
Shape of label: (20640,)


Result weights: [[  4.42512311e-01]
 [  9.98963619e-03]
 [ -1.14150750e-01]
 [  6.81827449e-01]
 [ -8.02837235e-07]
 [ -5.56203579e-03]
 [ -4.41519710e-01]
 [ -4.48136602e-01]
 [  2.79209185e-02]
 [ -3.79591098e+01]]
Error rate in training:  0.314927762991
Error rate in testing:  0.336110580797


# Graph Explanation

In this graph, we used 2 sets of data/label input placeholders to separate the error rate of training and testing sets.

(If we use the same input placeholders and run the same linear regression model, the weights will be modified in the testing set.)

Before the data is fed into the model, we first add another dimension to the features which contains the constant 1. By this way, we can have an extra dimension in the weight that is always multiplied by 1 and can be treated as the **bias**.

The model is constructed based on this formula:

$\mathbf{w} = (\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top\mathbf{y}$

The left part of the graph is the training part of the model. **X_train** is transposed and multiplied by **X_train** itself. The result is inversed and multiplied by first the inverted **X_train** then **y_train**. The result is the weights of each features that we desire: **w**.

Wtih **w** calculated, we can use it to do the linear regression by multiplying it to **X_train** (**X_test** in the testing part) and get the predicted values **y_hat**.

The error rate of the regression is calcuated with the formula:

$error = reduce\_mean(|\mathbf{\hat{y}}-\mathbf{y}|/\mathbf{y})$

In [2]:
housing.data[0]

array([   8.3252    ,   41.        ,    6.98412698,    1.02380952,
        322.        ,    2.55555556,   37.88      , -122.23      ])

In [3]:
housing.target[0]

4.5259999999999998

In [4]:
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [13]:
n_samples

18576

In [10]:
training_X[10]

array([   3.2031    ,   52.        ,    5.47761194,    1.07960199,
        910.        ,    2.26368159,   37.85      , -122.26      ,    1.        ])

In [5]:
training_X.shape

(18576, 9)

In [46]:
testing_y.shape

(2064, 8)

In [20]:
training_X[:,:6]

array([[  8.32520000e+00,   4.10000000e+01,   6.98412698e+00,
          1.02380952e+00,   3.22000000e+02,   2.55555556e+00],
       [  8.30140000e+00,   2.10000000e+01,   6.23813708e+00,
          9.71880492e-01,   2.40100000e+03,   2.10984183e+00],
       [  7.25740000e+00,   5.20000000e+01,   8.28813559e+00,
          1.07344633e+00,   4.96000000e+02,   2.80225989e+00],
       ..., 
       [  1.96090000e+00,   2.30000000e+01,   3.39361702e+00,
          1.16223404e+00,   1.35900000e+03,   3.61436170e+00],
       [  2.55990000e+00,   4.20000000e+01,   2.82644628e+00,
          9.66942149e-01,   2.31200000e+03,   4.77685950e+00],
       [  2.83480000e+00,   8.00000000e+00,   3.78133705e+00,
          1.04456825e+00,   2.58000000e+03,   3.59331476e+00]])

In [23]:
np.linalg.norm(training_X[:,6:8]-center_training)

390.82799553293393

In [17]:
center_training

array([  35.43037145, -119.40900624])

In [19]:
center_testing

array([  37.44527132, -121.01598837])

In [40]:
from scipy.spatial import distance
np.transpose(distance.cdist(np.reshape(center_training, (1,2)),training_X[:,6:8])).shape

(18576, 1)

In [39]:
training_X.shape

(18576, 9)