## Linear regression
$$ Input(X,y) \mapsto Generalization(\mathcal{w}X = y) \mapsto Prediction(\hat{y}) $$
- Loss function:
$$ \mathcal{L}(\mathcal{w}) = \frac{1}{2}\sum_{i=1}^n(y_i -x_i\mathcal{w})$$
$$ \mathcal{w}^* = \underset{\mathcal{w}}{\arg\min}(\mathcal{L}(\mathcal{w}))$$
- Find W:
> - Normal Equation: $$ \mathcal{w}^* = (X^T.X)^{-1}.X^T.y $$
> - Gradient Descent:(modification: mini-batch, stochastic)
$$\theta_{t+1} = \theta_{t} - \eta \nabla_{\mathbf{w}}\mathcal{L}(\mathbf{w}))$$
$$\nabla_{\mathbf{w}}\mathcal{L}(\mathbf{w}) = 
\frac{1}{N}\mathbf{\bar{X}}^T \mathbf{(\bar{X}w - y)}$$
___
- Prevent overfit:
> - Ridge Regression: penalizes large weights
> - Lasso Regression: yield sparse models
> - Elastic Net: l1 + l2
> - Early Stop: stop before error of validation increase
___
- Note:
___
- Resources:
> - Book: Hands_on_machine_learning
> - Blog: [Machinelearningcoban](https://machinelearningcoban.com/2016/12/28/linearregression/)
> - CS229 note

# Linear regression

In [1]:
# createa a dataset
#---------------------------------------------------#

import numpy as np

np.random.seed(1993)
X = 2 * np.random.rand(100,1)
y = 4 + 3 * X +np.random.rand(100,1)
X_new = [[0], [3]]

>### Use Linearegression from sklearn

In [2]:
# use Linearegression from sklearn
#---------------------------------------------------#

from sklearn.linear_model import LinearRegression

lng = LinearRegression(normalize = True)
lng.fit(X,y)
print("intercept: {}".format(lng.intercept_),
     "coef: {}".format(lng.coef_),
     "score: {}".format(lng.score(X, y)))

intercept: [ 4.48409354] coef: [[ 3.02083152]] score: 0.9717836808399598


___
score return coef of R^2: (1 - u/v) (bestscore is 1.0)
- u: ((y_true - y_pred) ** 2).sum()
- v: ((y_true - y_true.mean()) ** 2).sum()

In [3]:
lng.predict(X_new)

array([[  4.48409354],
       [ 13.54658809]])

>### Normal equation

In [4]:
def linear(X, y):
    X_b = np.c_[np.ones(np.shape(X)), X]
    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
    return theta

def lng_predict(X, y, X_test):
    theta = linear(X, y)
    X_test = np.c_[np.ones(np.shape(X_test)), X_test]
    y_predict = X_test.dot(theta)
    return y_predict

In [5]:
linear(X, y)

array([[ 4.48409354],
       [ 3.02083152]])

>### Use GD

In [6]:
# find theta by gd:
#---------------------------------------------------#

def find_theta(X, y, n):
    '''n is n_interation'''
    m = np.shape(X)[0]
    number_of_feature = np.shape(X)[1] + 1
    theta = np.zeros((number_of_feature, 1))
    eta = 0.1
    X_b = np.c_[np.ones((np.shape(X)[0], 1)), X]
    for i in range(n):
        garidents = 1 / m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * garidents
    return theta
# use grid search to find good learning rate (eta)

In [7]:
find_theta(X, y, 1000)

array([[ 4.48409229],
       [ 3.02083255]])

>### Usr SGD

In [8]:
# Stochastic GD
#---------------------------------------------------#

def SGD(X, y, n):
    '''n is number of epoch'''
    eta = 0.1
    m = np.shape(X)[0]
    theta = np.zeros((np.shape(X)[1] + 1, 1))
    X_b = np.c_[np.ones((np.shape(X)[0],1)), X]
    for epoch in range(n):
        list_index = np.random.permutation(range(m))
        for i in list_index:
            x_i = X_b[i:i+1]
            y_i = y[i:i+1]
            gradient = 2 * x_i.T.dot(x_i.dot(theta) - y_i)
            theta = theta - eta * gradient
    return theta

In [9]:
SGD(X, y, 50)

array([[ 4.46781139],
       [ 2.99121631]])

>### Use SGDregressor in sklearn

In [10]:
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(n_iter = 50, penalty = None, eta0 = 0.1)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.intercept_, sgd_reg.coef_)

[ 4.49453729] [ 3.03111343]




In [12]:
y_predict = lng_predict(X, y, X_new)
print(y_predict)

[[  4.48409354]
 [ 13.54658809]]


# Polynomial Regression

In [14]:
# make a data set
#---------------------------------------------------#

m = 100
X = np.random.rand(m,1)
y = 0.5 * X**2 + X + 2 +np.random.rand(m, 1)

In [15]:
# poly transform
#---------------------------------------------------#

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, include_bias = False)
X_poly = poly.fit_transform(X)

In [16]:
# use linearregression to fit data
#---------------------------------------------------#

lng.fit(X_poly, y)
print(lng.intercept_, lng.coef_)

[ 2.49850833] [[ 0.70756463  0.74012577]]


 ___
 - To prevent overfilling, stop when the increase of degree just change a little bit in error

# Regularized linnear model

> ### Early stop

In [20]:
from sklearn.base import clone

def early_stop(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    sgd_reg = SGDRegressor(n_iter = 1, eta0 = 0.0005, penalty = None,
                          warm_start = True, learning_rate = 'constant')
    #n_iter =1 but warm_start = True, sgd will countinues where it left off

    minimun_val_error = float('inf')
    best_epoch = None
    best_model = None
    for epoch in range(1000):
        sgd_reg.fit(X_train, y_train)
        y_predict_sgd = sgd_reg.predict(X_test)
        val_error = mean_squared_error(y_predict_sgd, y_test)
        if val_error < minimun_val_error:
            minimun_val_error = val_error
            best_epoch = epoch
            best_model = clone(sgd_reg)
    return minimun_val_error, best_epoch, best_model

val_error, best_epoch, best_model = early_stop(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [21]:
print(val_error, best_epoch)

0.106337309489 999
