In [18]:
%matplotlib widget

from sklearn.datasets import load_diabetes
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

In [19]:
# Load the diabetes dataset with scikit-learn's built-in feature scaling enabled.
diabetes = load_diabetes(scaled=True)
data = diabetes.data  # input matrix, (m x n): examples x features

# Disease-progression target, turned into an (m x 1) column vector.
severity = diabetes.target.reshape(-1, 1)

# Quick look at what was loaded.
print(data.dtype, severity.dtype)  # datatypes of input and output
print(data.shape)                  # shape of the input
print(diabetes.feature_names)      # feature names associated with the input
print(data)                        # the data itself

float64 float64
(442, 10)
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990749
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06833155
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286131
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04688253
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452873
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00422151
   0.00306441]]


The goal here is to show that linear regression can be solved with an analytical solution. This means that instead of running gradient descent, we compute the solution directly using math. There are also a few tricks that simplify the problem a lot. 


In [20]:
# Append a constant-1 "bias" feature as the last column of the input matrix.
# The matrix is rebuilt from the untouched `diabetes.data` rather than from
# `data` itself, so re-running this cell never stacks a second column of ones.
onesarray = np.ones((diabetes.data.shape[0], 1))  # (m x 1) column of ones
data = np.hstack([diabetes.data, onesarray])  # add bias feature to the data
print(data)

[[ 0.03807591  0.05068012  0.06169621 ...  0.01990749 -0.01764613
   1.        ]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.06833155 -0.09220405
   1.        ]
 [ 0.08529891  0.05068012  0.04445121 ...  0.00286131 -0.02593034
   1.        ]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.04688253  0.01549073
   1.        ]
 [-0.04547248 -0.04464164  0.03906215 ...  0.04452873 -0.02593034
   1.        ]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.00422151  0.00306441
   1.        ]]


One of the main tricks used here is to add 1 as a feature to every example. Instead of having a separate bias term in the equations, if we add 1 as a feature and remove the bias term from the equation, then the weight associated with that 1 feature becomes the bias. This is a clever trick used in machine learning: it greatly simplifies the equations involved while not changing the functionality of the model at all. 

In [21]:
def predict(w, x):
    """Return the linear model's predictions for the examples in ``x``.

    The result is the matrix product of ``x`` and ``w``. No bias is added
    here: the model relies on ``x`` carrying a constant-1 feature whose
    weight in ``w`` plays the role of the bias.
    """
    return np.matmul(x, w)

In [22]:
def mse_cost(w,x,y):
    """Half mean-squared-error cost of the linear model ``x @ w`` against ``y``.

    Computes ``sum((x @ w - y) ** 2) / (2 * m)``, where ``m`` is the number of
    rows (training examples) in ``x``. No bias term is added; the model relies
    on ``x`` carrying a constant-1 feature whose weight acts as the bias.

    Parameters
    ----------
    w : weights, one per column of ``x`` (1-D or a column vector).
    x : input matrix with one row per training example.
    y : targets, one per training example (1-D or a column vector).

    Returns
    -------
    The scalar cost.
    """
    m = x.shape[0] # number of training examples

    # Flatten both sides before subtracting. Without this, an (m, 1) prediction
    # minus a 1-D target of length m would broadcast to an (m, m) matrix and
    # silently yield a wrong cost.
    prediction = np.ravel(np.matmul(x, w)) # no need to add bias term here
    error = prediction - np.ravel(y)

    return np.dot(error, error) / (2*m)

Since we no longer have a separate bias term, our predict and cost functions become easier to implement. One of the fundamental ideas of optimization in calculus is that a function's minimum occurs where its derivatives are zero, so we set the partial derivatives of the cost to 0 and solve for the weights. If we first rewrite everything with matrix math, this simplifies heavily. 

$$
\frac{dJ}{dw_j}= \frac{1}{m}\sum_{i=1}^{m}(f_w(X^i)-Y^i)X^i_j
$$
$$
f_w(x)=w_1*x_1+w_2*x_2+w_3*x_3+...+w_n*x_n
$$

Notice how there's no bias term in the actual prediction function. It's all part of the weights. The goal now is to rewrite these equations in matrix form so we can remove the summation and reduce everything to just a few matrix operations. 

$$
\frac{dJ}{dw_j}= \frac{1}{m}\sum_{i=1}^{m}(X^i \cdot w-Y^i)X^i_j
$$

$$
\frac{dJ}{dw} = \frac{1}{m}X^T(Xw-Y)
$$

The last equation is a compact way to write all of the summations at once: row $j$ of $X^T$ holds feature $j$ of every example, so multiplying $X^T$ by the error vector $(Xw-Y)$ produces the sum for every $w_j$ in one step. Now for the calculus: we set the partial derivatives to 0 and do matrix manipulations to find the weights.

$$
\frac{dJ}{dw} = 0
$$

$$
\frac{1}{m}X^T(Xw-Y) = 0
$$

$$
X^T(Xw-Y) = 0
$$

$$
X^TXw-X^TY = 0
$$

$$
X^TXw=X^TY
$$

Now, to isolate $w$, we left-multiply both sides by the inverse of $X^TX$ (assuming that $X^TX$ is invertible). 

$$
\cancel{(X^TX)^{-1}X^TX}w=(X^TX)^{-1}X^TY
$$

$$
w=(X^TX)^{-1}X^TY
$$

And boom, we have now isolated $w$. We can use this equation to fully solve the linear regression system.

In [23]:
def solve_weights(X, Y):
    """Return the least-squares weights ``w`` that minimize ``||X w - Y||^2``.

    When ``X^T X`` is invertible this is exactly the normal-equation solution
    ``w = (X^T X)^{-1} X^T Y``. The weights are obtained with
    ``np.linalg.lstsq`` instead of forming the inverse explicitly: inverting
    ``X^T X`` amplifies rounding error when features are strongly correlated
    and raises ``LinAlgError`` outright when they are exactly collinear,
    whereas ``lstsq`` stays stable and returns the minimum-norm solution in
    the rank-deficient case.

    Parameters
    ----------
    X : 2-D input matrix, one row per example (include a column of ones if a
        bias weight is wanted).
    Y : targets, one row per example (1-D or a column vector).

    Returns
    -------
    The weights, with one entry per column of ``X`` and the same trailing
    shape as ``Y``.
    """
    # rcond=None selects NumPy's machine-precision-based cutoff for small
    # singular values.
    weights, _residuals, _rank, _singular_values = np.linalg.lstsq(X, Y, rcond=None)

    return weights

In [26]:
# Fit the weights in closed form on the full dataset. `data` already carries
# the constant-1 column as its last feature, so the last weight is the bias.
w = solve_weights(data, severity)
print("w:", w) # weights now include the bias term inside of it

w: [[ -10.0098663 ]
 [-239.81564367]
 [ 519.84592005]
 [ 324.3846455 ]
 [-792.17563855]
 [ 476.73902101]
 [ 101.04326794]
 [ 177.06323767]
 [ 751.27369956]
 [  67.62669218]
 [ 152.13348416]]


In [28]:
# Evaluate the closed-form weights on the same data they were fitted on.
cost = mse_cost(w,data,severity) # cost (squared error / 2m) of the weights we computed
print("Cost:", cost)

Cost: 1429.8481737933748


Let's use scikit-learn's gradient-descent regressor to verify the solution we just got.

In [37]:
# Fit scikit-learn's stochastic-gradient-descent regressor as an independent check.
# SGD shuffles the training data, so random_state is pinned to make the reported
# cost identical on every Restart & Run All.
sgd = SGDRegressor(max_iter=100, penalty=None, random_state=42)
reg = make_pipeline(StandardScaler(), sgd)
reg.fit(data, severity.reshape((-1))) # fit against a flat (1-D) target
sgdout = reg.predict(data).reshape((-1,1)) # back to (m x 1) to line up with severity

sgderror = (sgdout - severity).reshape((-1)) # compute errors from sgd predictions and actual values
sgdcost = np.dot(sgderror, sgderror) / (2*data.shape[0]) # compute minimized final cost (squared error / 2m)
print("SGD Cost:", sgdcost)

SGD Cost: 1438.5400797446343


Look at that, very cool. The two costs are close, and the analytical cost is slightly lower than the SGD cost. That is what we should expect: the analytical solution is the exact minimizer of the cost, while gradient descent only approaches it. So the analytical solution does work. This solution is great, but it has drawbacks. It's important to use the analytical solution only when the number of features is small. If the input had many more features, it would be much slower than gradient descent, because inverting $X^TX$ takes on the order of cubic time in the number of features. For such large matrices, it's better to use gradient descent. Otherwise, if the matrix is small, we can get away with computing the solution directly. 