In [78]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    sample a few random elements and only return numerical
    in this dimensions
    """

    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])

        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h  # increment by h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # reset

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (
            abs(grad_numerical) + abs(grad_analytic)
        )
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_error)
        )
        assert rel_error < error

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

# Linear regression

### Formulas:
- Loss function: $$ RSS(w) = \dfrac{1}{n} \sum_{i=0}^{N}(y_i - \bar{y_i})^2 $$
- Partial derivative in w:
$$ D_w = \dfrac{-2}{n} \sum_{i=0}^{N}x_i(y_i -  \bar{y_i}) $$
- Partial derivative in b:
$$ D_b = \dfrac{-2}{n} \sum_{i=0}^{N}(y_i -  \bar{y_i}) $$

In [79]:
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001

In [102]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    MSE loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    # YOUR CODE HERE
    sum_dw = 0
    sum_db = 0
    sum_loss = 0
    n = len(X)
    for i in range(len(X)):
        y_i_pred = w @ X[i] + b
        sum_dw = sum_dw + X[i]*(y[i] - y_i_pred)
        sum_db = sum_db + (y[i] - y_i_pred)
        sum_loss = sum_loss + (y[i]    - (w @ X[i] + b))**2
    dw = -2 * sum_dw / n + 2*alpha*w
    db = -2 * sum_db / n 
    loss = sum_loss / n +  alpha*(w.T@w)
    
    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [103]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  1.876890730921511e-16
Gradient check w
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -4.296088 analytic: -4.296087, relative error: 9.928392e-08
numerical: 2.892062 analytic: 2.892060, relative error: 2.621463e-07
numerical: -3.234125 analytic: -3.234125, relative error: 6.958758e-08
numerical: -4.145423 analytic: -4.145424, relative error: 1.018838e-07
numerical: -1.376396 analytic: -1.376394, relative error: 7.331243e-07
numerical: -4.145423 analytic: -4.145424, relative error: 1.018838e-07
numerical: -3.234125 analytic: -3.234125, relative error: 6.958758e-08
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -1.553186 analytic: -1.553188, relative error: 4.707326e-07
numerical: -1.553186 analytic: -1.553188, relative error: 4.707326e-07
numerical: -1.376396 analytic: -1.376394, relative error: 7.331243e-07
numerical: -1.275044 analytic: -1.275044, relative error: 1.153266e-07
numerical: -1.376396 analy

## Naive Ridge regression loss

In [104]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: 2.892370 analytic: 2.892369, relative error: 2.546077e-07
numerical: -3.234194 analytic: -3.234194, relative error: 6.220639e-08
numerical: -3.234194 analytic: -3.234194, relative error: 6.220639e-08
numerical: -1.275160 analytic: -1.275160, relative error: 1.011536e-07
numerical: -1.275160 analytic: -1.275160, relative error: 1.011536e-07
numerical: -1.376473 analytic: -1.376471, relative error: 7.260322e-07
numerical: -3.153157 analytic: -3.153157, relative error: 3.002676e-08
numerical: -3.153157 analytic: -3.153157, relative error: 3.002676e-08
numerical: -1.376473 analytic: -1.376471, relative error: 7.260322e-07
numerical: -1.553389 analytic: -1.553391, relative error: 4.835459e-07
numerical: -3.153157 analytic: -3.153157, relative error: 3.002676e-08
numerical: -3.153157 analytic: -3.153157, relative error: 3.002676e-08
numerical: -1.376473 analytic: -1.376471, relative error: 7.260322e-07
numerical: -3.234194 analytic: -3.234194, relative error: 6.22

### Formulas:
- Loss function: $$ RSS(w) = \dfrac{1}{n} \sum_{i=0}^{N}(y_i - \bar{y_i})^2 $$
- Partial derivative in w:
$$ D_w = \dfrac{-2}{n} \sum_{i=0}^{N}x_i(y_i -  \bar{y_i}) $$
- Partial derivative in b:
$$ D_b = \dfrac{-2}{n} \sum_{i=0}^{N}(y_i -  \bar{y_i}) $$

In [107]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    MSE loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    # YOUR CODE HERE
    n = len(X)
    dw = -2 * (y - (X@w + b)) @ X/n + 2*alpha*w
    db = -2 * np.sum((y -(X@w + b)), axis=0)/n
    loss = (y - (X@w + b)).T @ (y - (X@w + b))/n + alpha*(w.T@w)
    return loss, dw, np.array(db).reshape(1,)

## Vectorised Linear regression loss

In [108]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  6.256302436405037e-17
Gradient check w
numerical: -4.145425 analytic: -4.145424, relative error: 5.169430e-08
numerical: -3.153317 analytic: -3.153317, relative error: 1.438681e-08
numerical: 2.892061 analytic: 2.892060, relative error: 1.678024e-07
numerical: -1.275043 analytic: -1.275044, relative error: 2.413257e-07
numerical: -0.315454 analytic: -0.315454, relative error: 1.136560e-08
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -1.275043 analytic: -1.275044, relative error: 2.413257e-07
numerical: -4.145425 analytic: -4.145424, relative error: 5.169430e-08
numerical: -1.275043 analytic: -1.275044, relative error: 2.413257e-07
numerical: -1.553188 analytic: -1.553188, relative error: 2.279255e-09
numerical: -4.145425 analytic: -4.145424, relative error: 5.169430e-08
numerical: -3.234125 analytic: -3.234125, relative error: 1.334395e-08
numerical: -4.296088 analy

In [109]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  6.256302436405037e-17
Gradient check w
numerical: -0.315454 analytic: -0.315454, relative error: 1.136560e-08
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -3.234125 analytic: -3.234125, relative error: 1.334395e-08
numerical: -3.234125 analytic: -3.234125, relative error: 1.334395e-08
numerical: -4.296088 analytic: -4.296087, relative error: 1.460272e-08
numerical: -1.275043 analytic: -1.275044, relative error: 2.413257e-07
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -4.296088 analytic: -4.296087, relative error: 1.460272e-08
numerical: -1.376395 analytic: -1.376394, relative error: 1.384222e-07
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -3.153317 analytic: -3.153317, relative error: 1.438681e-08
numerical: -2.801914 analytic: -2.801914, relative error: 3.056307e-08
numerical: -4.145425 analytic: -4.145424, relative error: 5.169430e-08
numerical: -1.553188 ana

## Vectorized ridge regression loss

In [110]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -0.315485 analytic: -0.315485, relative error: 1.334629e-07
numerical: -4.296081 analytic: -4.296081, relative error: 9.291373e-09
numerical: -4.145219 analytic: -4.145218, relative error: 5.395556e-08
numerical: 2.892370 analytic: 2.892369, relative error: 1.602739e-07
numerical: -3.234194 analytic: -3.234194, relative error: 5.963953e-09
numerical: -4.296081 analytic: -4.296081, relative error: 9.291373e-09
numerical: -3.153157 analytic: -3.153157, relative error: 1.182815e-09
numerical: -2.802019 analytic: -2.802019, relative error: 3.359216e-08
numerical: 2.892370 analytic: 2.892369, relative error: 1.602739e-07
numerical: -4.145219 analytic: -4.145218, relative error: 5.395556e-08
numerical: 2.892370 analytic: 2.892369, relative error: 1.602739e-07
numerical: -4.145219 analytic: -4.145218, relative error: 5.395556e-08
numerical: -1.275159 analytic: -1.275160, relative error: 2.554663e-07
numerical: -1.553391 analytic: -1.553391, relative error: 1.515378

# Logistic regression

### Formulas for logistic regression:
- Loss for logistic regression $$Loss(w) = -\frac{1}{N}[y\log(\sigma(wx + b)) + (1 - y)\log(1 - \sigma(wx + b))]$$
- Partial derivative in w$$ \dfrac {dLoss(w)}{dw_j} = \frac{1}{N}[\sigma(wx + b) - y]x_j $$
- Partial derivative in b$$ \dfrac {dLoss(w)}{db} = \frac{1}{N}[\sigma(wx + b) - y] $$

In [87]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001

In [88]:
import math
def log_loss_naive(w, b, X, y, alpha=0):
    """
    log loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0
    n = len(X)
    for i in range(len(X)):
        loss = loss - (y[i] * math.log(sigmoid(X[i] @ w + b)) + (1-y[i]) * math.log(1-sigmoid(X[i] @ w + b)) )/n + alpha * (w.T @ w)
        dw = dw + (sigmoid(X[i] @ w + b) - y[i] )*X[i]/n + 2 * alpha * w
        db = db + (sigmoid(X[i] @ w + b) - y[i] )/n
        
    return loss, dw, np.array(db).reshape(1,)

In [89]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  3.181128844370586e-16
Gradient check w
numerical: 0.027126 analytic: 0.027126, relative error: 8.518583e-10
numerical: -0.000969 analytic: -0.000969, relative error: 6.822252e-09
numerical: -0.007812 analytic: -0.007812, relative error: 2.533635e-09
numerical: 0.011255 analytic: 0.011255, relative error: 1.485902e-09
numerical: 0.027126 analytic: 0.027126, relative error: 8.518583e-10
numerical: -0.396888 analytic: -0.396888, relative error: 1.591155e-07
numerical: 0.011255 analytic: 0.011255, relative error: 1.485902e-09
numerical: -0.017990 analytic: -0.017990, relative error: 6.032011e-10
numerical: -0.000231 analytic: -0.000231, relative error: 5.077697e-08
numerical: 0.033353 analytic: 0.033353, relative error: 5.648010e-10
numerical: -0.176035 analytic: -0.176035, relative error: 1.097467e-09
numerical: -0.007812 analytic: -0.007812, relative error: 2.533635e-09
numerical: -0.000969 analytic: -0.000969, relative error: 6.822252e-09
numerical: 0.008658 analytic: 0.00

# Naive with regulariztion

In [90]:
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: 0.284840 analytic: 0.284840, relative error: 8.546001e-10
numerical: -0.567309 analytic: -0.567309, relative error: 3.600676e-10
numerical: 0.085603 analytic: 0.085603, relative error: 9.074147e-10
numerical: -0.063277 analytic: -0.063277, relative error: 2.276913e-09
numerical: 0.025017 analytic: 0.025017, relative error: 2.830086e-09
numerical: -0.063277 analytic: -0.063277, relative error: 2.276913e-09
numerical: 41.080936 analytic: 41.081001, relative error: 7.984152e-07
numerical: -0.026063 analytic: -0.026063, relative error: 3.744631e-09
numerical: -0.127078 analytic: -0.127078, relative error: 2.807716e-09
numerical: 0.049020 analytic: 0.049020, relative error: 1.752950e-09
numerical: 0.169846 analytic: 0.169846, relative error: 9.455659e-10
numerical: 0.026036 analytic: 0.026036, relative error: 3.751112e-11
numerical: 0.111200 analytic: 0.111200, relative error: 4.796508e-10
numerical: 0.025017 analytic: 0.025017, relative error: 2.830086e-09
numer

# Vectorized

### Formulas for logistic regression:
- Loss for logistic regression $$Loss(w) = -\frac{1}{N}[y\log(\sigma(wx + b)) + (1 - y)\log(1 - \sigma(wx + b))]$$
- Partial derivative in w$$ \dfrac {dLoss(w)}{dw_j} = \frac{1}{N}[\sigma(wx + b) - y]x_j $$
- Partial derivative in b$$ \dfrac {dLoss(w)}{db} = \frac{1}{N}[\sigma(wx + b) - y] $$

In [91]:
def log_loss_vectorized(w, b,X, y, alpha=0):
    """
    log loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0
    n = len(X)
    # YOUR CODE HERE
    loss = -(y @ np.log(sigmoid(X@w + b)) + (1 - y)@np.log(1 - sigmoid(X@w + b)))/n + alpha * (w.T @ w)
    dw = ((sigmoid(X@w + b) - y) @ X)/n + 2 * alpha * w
    db = np.sum((sigmoid(X@w + b) - y))/n
    return loss, dw, np.array(db).reshape(1,)

In [92]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  7.952822110926462e-17
Gradient check w
numerical: 0.027126 analytic: 0.027126, relative error: 1.356172e-10
numerical: -0.152612 analytic: -0.152612, relative error: 2.319178e-11
numerical: 0.008658 analytic: 0.008658, relative error: 1.362717e-10
numerical: -0.396888 analytic: -0.396888, relative error: 1.591854e-07
numerical: -0.002567 analytic: -0.002567, relative error: 1.092139e-10
numerical: -0.152612 analytic: -0.152612, relative error: 2.319178e-11
numerical: -0.007542 analytic: -0.007542, relative error: 4.277628e-10
numerical: -0.396888 analytic: -0.396888, relative error: 1.591854e-07
numerical: -0.000211 analytic: -0.000211, relative error: 5.570658e-09
numerical: 0.013807 analytic: 0.013807, relative error: 3.394843e-11
numerical: -0.017990 analytic: -0.017990, relative error: 1.393700e-11
numerical: -1.806660 analytic: -1.806660, relative error: 2.346240e-10
numerical: -0.007812 analytic: -0.007812, relative error: 3.086581e-10
numerical: -1.514292 analytic:

# Vectorized with regularization

In [93]:
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: 0.013828 analytic: 0.013828, relative error: 1.930527e-11
numerical: -1.514153 analytic: -1.514153, relative error: 1.117638e-10
numerical: -0.011517 analytic: -0.011517, relative error: 2.638843e-10
numerical: 0.013828 analytic: 0.013828, relative error: 1.930527e-11
numerical: 0.008600 analytic: 0.008600, relative error: 3.398372e-10
numerical: -0.000031 analytic: -0.000031, relative error: 7.759122e-08
numerical: 0.013828 analytic: 0.013828, relative error: 1.930527e-11
numerical: -2.551054 analytic: -2.551054, relative error: 1.223369e-08
numerical: 41.121829 analytic: 41.121895, relative error: 7.976202e-07
numerical: -0.001148 analytic: -0.001148, relative error: 2.658985e-09
numerical: -0.000239 analytic: -0.000239, relative error: 1.247142e-08
numerical: 95.768081 analytic: 95.768340, relative error: 1.351411e-06
numerical: 0.013828 analytic: 0.013828, relative error: 1.930527e-11
numerical: -0.007485 analytic: -0.007485, relative error: 3.022149e-10

# Gradient descent for Linear models

In [100]:
class LinearModel():
    def __init__(self):
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,) 
            # Hint: Use np.random.choice to generate indices
            # YOUR CODE HERE
            n = len(X)
            index = np.random.choice(range(n), size=batch_size)
            X_batch = X[index]
            y_batch = y[index]

            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # perform parameter update                                                                
            # Update the weights w and bias b using the gradient and the learning rate. 
            # YOUR CODE HERE
            self.w = self.w - learning_rate * dw
            self.b = self.b - learning_rate * db
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))
        
        print("Last loss = ", loss )
        return loss_history

    def predict(self, X):
        pass

    def loss(self, X_batch, y_batch, reg):
        pass

class LinearRegressor(LinearModel):
    """ Linear regression """

    def loss(self, X_batch, y_batch, alpha):
        return mse_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        # YOUR CODE HERE
        y_predict = X@self.w + self.b
        return y_predict

class LogisticRegressor(LinearModel):
    """ Linear regression """

    def loss(self, X_batch, y_batch, alpha):
        return log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        # YOUR CODE HERE
        proba_predict = sigmoid(self.w@X.T + self.b)
        proba_predict[proba_predict>0.5] = 1
        proba_predict[proba_predict<=0.5] = 0
        return proba_predict

## Linear regression with gradient descent

In [98]:
from sklearn.linear_model import LinearRegression

sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
sk_mse = mean_squared_error(sk_pred, y_train1)

model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
assert mse - sk_mse < 100

iteration 0 / 75000: loss 24195.788156
iteration 10000 / 75000: loss 3706.175791
iteration 20000 / 75000: loss 3892.324140
iteration 30000 / 75000: loss 3830.015595
iteration 40000 / 75000: loss 3465.152712
iteration 50000 / 75000: loss 2669.135149
iteration 60000 / 75000: loss 2184.448164
iteration 70000 / 75000: loss 2687.917981
Last loss =  3405.3924988387635
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2884.3613792931287


## Logistc regression with gradient descent

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
sk_log_loss = log_loss(sk_pred, y_train2)

model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_log_loss = log_loss(pred, y_train2)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.694033
iteration 10000 / 75000: loss 0.076443
iteration 20000 / 75000: loss 0.106687
iteration 30000 / 75000: loss 0.048231
iteration 40000 / 75000: loss 0.077529
iteration 50000 / 75000: loss 0.087962
iteration 60000 / 75000: loss 0.165647
iteration 70000 / 75000: loss 0.036883
Last loss =  0.1092023058226832
Log-loss scikit-learn: 0.4249086712816093
Log-loss gradiet descent model : 0.4249086712816093
Error : 0.0
