In [None]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    Spot-check an analytic gradient against a centered finite difference.

    Random coordinates of x are sampled (with replacement); for each one the
    numerical derivative (f(x + h) - f(x - h)) / (2 * h) is compared with the
    matching entry of analytic_grad, and the comparison is printed.

    Inputs:
    - f: callable taking x and returning the loss value
    - x: numpy array of parameters; perturbed in place during a check and
      always restored afterwards
    - analytic_grad: array indexed like x holding the gradient to verify
    - num_checks: number of random coordinates to test
    - h: finite-difference step
    - error: largest relative error accepted

    Raises AssertionError at the first coordinate whose relative error is
    not below error. Returns nothing.
    """

    for _ in range(num_checks):
        ix = tuple(randrange(m) for m in x.shape)

        oldval = x[ix]
        try:
            x[ix] = oldval + h  # step forward by h
            fxph = f(x)  # evaluate f(x + h)
            x[ix] = oldval - h  # step backward by h
            fxmh = f(x)  # evaluate f(x - h)
        finally:
            # restore the parameter even when f raises, so x is never left perturbed
            x[ix] = oldval

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]

        denominator = abs(grad_numerical) + abs(grad_analytic)
        if denominator == 0:
            # both gradients are exactly zero: they agree, and 0 / 0 would give NaN
            relative_error = 0.0
        else:
            relative_error = abs(grad_numerical - grad_analytic) / denominator

        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, relative_error)
        )
        assert relative_error < error, "gradient mismatch at index %s" % (ix,)

def rel_error(x, y):
    """ Largest element-wise relative difference between x and y. """
    gap = np.abs(x - y)
    # the 1e-8 floor keeps the ratio finite when both entries are zero
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(gap / scale)

# Linear regression

In [None]:
# Load the scikit-learn diabetes dataset; it is the training set for the
# linear-regression cells below.
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
# Small random starting parameters: one weight per feature column and a
# 1-element bias array.
# NOTE(review): no random seed is set in the visible cells, so w1/b1 (and the
# printed gradient-check numbers) differ between runs.
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001

In [None]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    MSE loss function WITH FOR LOOPs

    loss = (sum_i (y_i - (x_i . w + b))^2 + alpha * w . w) / N

    Inputs:
    - w: weight vector, one entry per column of X
    - b: bias (scalar or 1-element array)
    - X: data matrix, one sample per row
    - y: target vector, one entry per row of X
    - alpha: L2 regularization strength applied to w (not to b)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """

    size_y = len(y)
    y_pred = X.dot(w) + b

    loss = 0.0
    residual_dot_x = np.zeros_like(w, dtype=float)  # sum_i x_i * (y_i - pred_i)
    residual_sum = 0.0  # sum_i (y_i - pred_i)

    # a single pass over the samples accumulates everything the loss and
    # both gradients need
    for i in range(size_y):
        residual = y[i] - y_pred[i]
        loss += residual ** 2
        residual_dot_x += X[i] * residual
        residual_sum += residual

    loss += alpha * w.T.dot(w)
    loss *= (1 / size_y)

    # gradient with respect to weights w: the data term carries the minus sign
    # from d/dw (y - pred)^2, while the penalty alpha * w . w contributes
    # +2 * alpha * w
    dw = (-2 * residual_dot_x + 2 * alpha * w) / size_y

    # gradient with respect to bias b
    db = residual_sum * (-2 / size_y)

    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [None]:
# Loss and analytic gradients of the loop-based MSE at the initial (w1, b1),
# without regularization.
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss against scikit-learn.
# NOTE(review): the predictions are passed as the first argument; scikit-learn
# documents the order as (y_true, y_pred). MSE is symmetric, so the value is
# the same either way.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
# (the lambda parameter shadows the global w1; grad_check_sparse passes in the
# same array it perturbs in place)
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None;
# the check itself happens through the asserts inside the helper.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: -4.145419 analytic: -4.145419, relative error: 5.118220e-08
numerical: -4.296089 analytic: -4.296088, relative error: 1.361039e-07
numerical: -1.275046 analytic: -1.275045, relative error: 3.422841e-07
numerical: -0.315455 analytic: -0.315454, relative error: 9.650044e-07
numerical: -3.153319 analytic: -3.153318, relative error: 1.342542e-07
numerical: -1.275046 analytic: -1.275045, relative error: 3.422841e-07
numerical: -3.153319 analytic: -3.153318, relative error: 1.342542e-07
numerical: -1.376395 analytic: -1.376395, relative error: 2.422274e-07
numerical: -3.234109 analytic: -3.234110, relative error: 2.105461e-07
numerical: -1.553189 analytic: -1.553189, relative error: 2.037040e-07
numerical: -1.275046 analytic: -1.275045, relative error: 3.422841e-07
numerical: -4.296089 analytic: -4.296088, relative error: 1.361039e-07
numerical: -1.275046 analytic: -1.275045, relative error: 3.422841e-07
numerical: -1.275046 analytic: -1.275045, 

## Naive Ridge regression loss

In [None]:
# Same gradient check as the unregularized cell, now with alpha=1 so the
# w . w penalty term is part of the loss.
# NOTE(review): w1 is initialised at a scale of 1e-4, so the penalty's
# contribution to the gradient is tiny here and a 1e-5 tolerance may not
# expose an error in that term.
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: 2.892061 analytic: 2.892060, relative error: 7.955801e-08
numerical: -1.553189 analytic: -1.553188, relative error: 3.085007e-07
numerical: -2.801915 analytic: -2.801914, relative error: 1.962322e-07
numerical: -0.315455 analytic: -0.315455, relative error: 2.500889e-07
numerical: -3.153320 analytic: -3.153318, relative error: 2.750109e-07
numerical: -1.275046 analytic: -1.275044, relative error: 8.196173e-07
numerical: -1.275046 analytic: -1.275044, relative error: 8.196173e-07
numerical: -1.553189 analytic: -1.553188, relative error: 3.085007e-07
numerical: -1.376396 analytic: -1.376394, relative error: 7.002880e-07
numerical: -1.275046 analytic: -1.275044, relative error: 8.196173e-07
numerical: -1.275046 analytic: -1.275044, relative error: 8.196173e-07
numerical: -2.801915 analytic: -2.801914, relative error: 1.962322e-07
numerical: -1.275046 analytic: -1.275044, relative error: 8.196173e-07
numerical: -2.801915 analytic: -2.801914, relative error: 1.96

In [None]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    MSE loss function WITHOUT FOR LOOPs

    loss = ((Xw + b - y) . (Xw + b - y) + alpha * w . w) / N

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n_samples = y.shape[0]

    # signed prediction error, reused by the loss and both gradients
    residual = X.dot(w) + b - y

    # loss: squared error plus the L2 penalty on w, averaged over the samples
    penalized_sse = residual.T @ residual + alpha * (w.T @ w)
    loss = (1 / n_samples) * penalized_sse

    # gradient with respect to weights w
    dw = (alpha * w + X.T @ residual) * (2 / n_samples)

    # gradient with respect to bias b
    db = np.sum(residual) * (2 / n_samples)

    return loss, dw, np.array(db).reshape(1,)

## Vectorised Linear regression loss

In [None]:
# Loss and analytic gradients of the vectorized MSE at the initial (w1, b1),
# without regularization.
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

# Cross-check the loss against scikit-learn.
# NOTE(review): the predictions are passed as the first argument; scikit-learn
# documents the order as (y_true, y_pred). MSE is symmetric, so the value is
# the same either way.
sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  1.2512632702993494e-16
Gradient check w
numerical: -1.376395 analytic: -1.376395, relative error: 8.816271e-08
numerical: -4.145419 analytic: -4.145419, relative error: 7.302691e-09
numerical: -0.315454 analytic: -0.315454, relative error: 7.648710e-07
numerical: -0.315454 analytic: -0.315454, relative error: 7.648710e-07
numerical: -1.553188 analytic: -1.553189, relative error: 3.052239e-08
numerical: -0.315454 analytic: -0.315454, relative error: 7.648710e-07
numerical: 2.892061 analytic: 2.892061, relative error: 5.626271e-08
numerical: -3.153318 analytic: -3.153318, relative error: 1.888435e-08
numerical: 2.892061 analytic: 2.892061, relative error: 5.626271e-08
numerical: -0.315454 analytic: -0.315454, relative error: 7.648710e-07
numerical: -0.315454 analytic: -0.315454, relative error: 7.648710e-07
numerical: -2.801914 analytic: -2.801914, relative error: 5.239719e-08
numerical: -4.296088 analytic: -4.296088, relative error: 9.082135e-09
numerical: -3.153318 analyt

## Vectorized ridge regression loss

In [None]:
# Gradient check of the vectorized MSE with alpha=1, so the w . w penalty
# term is part of the loss.
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -3.153319 analytic: -3.153319, relative error: 2.233997e-08
numerical: -3.234110 analytic: -3.234110, relative error: 2.315559e-08
numerical: -1.553188 analytic: -1.553189, relative error: 7.676247e-08
numerical: -4.145418 analytic: -4.145419, relative error: 9.654628e-09
numerical: -4.145418 analytic: -4.145419, relative error: 9.654628e-09
numerical: -2.801915 analytic: -2.801914, relative error: 3.840119e-08
numerical: -4.296088 analytic: -4.296088, relative error: 4.224163e-09
numerical: -4.296088 analytic: -4.296088, relative error: 4.224163e-09
numerical: -3.153319 analytic: -3.153319, relative error: 2.233997e-08
numerical: -4.145418 analytic: -4.145419, relative error: 9.654628e-09
numerical: -1.553188 analytic: -1.553189, relative error: 7.676247e-08
numerical: -2.801915 analytic: -2.801914, relative error: 3.840119e-08
numerical: -1.275045 analytic: -1.275045, relative error: 2.063795e-07
numerical: -1.553188 analytic: -1.553189, relative error: 7.

# Logistic regression

In [None]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), element-wise for array input.

    Evaluated as exp(-log(1 + exp(-z))) through np.logaddexp, so large
    negative z does not overflow inside exp (the direct formula emits an
    overflow RuntimeWarning there); the result simply saturates to 0 or 1.
    """
    return np.exp(-np.logaddexp(0, -z))

# Load the scikit-learn breast-cancer dataset; it is the training set for the
# logistic-regression cells below. This rebinds `data`, which previously held
# the diabetes dataset.
data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
# Small random starting parameters: one weight per feature column and a
# 1-element bias array.
# NOTE(review): no random seed is set in the visible cells, so w2/b2 differ
# between runs.
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001

# Naive

In [None]:
def logistic(y, m):
    """ Log-likelihood term y * log(m) + (1 - y) * log(1 - m). """
    positive_term = y * np.log(m)
    negative_term = (1 - y) * np.log(1 - m)
    return positive_term + negative_term

In [None]:
def log_loss_naive(w, b, X, y, alpha=0):
    """
    log loss function WITH FOR LOOPs

    loss = (-sum_i [y_i log(s_i) + (1 - y_i) log(1 - s_i)] + alpha * w . w) / N
    with s_i = sigmoid(x_i . w + b)

    Inputs:
    - w: weight vector, one entry per column of X
    - b: bias (scalar or 1-element array)
    - X: data matrix, one sample per row
    - y: vector of 0/1 labels, one entry per row of X
    - alpha: L2 regularization strength applied to w (not to b)

    Returns a tuple of:
    - loss (scalar)
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n_samples = y.shape[0]
    # Work with a scalar bias: a 1-element array would otherwise turn every
    # per-sample quantity into a size-1 array.
    bias = float(np.ravel(b)[0])

    loss = 0.0
    # the penalty's gradient seeds the accumulator; the data terms are added below
    dw = 2 * alpha * np.asarray(w, dtype=float)
    db = 0.0

    # One pass over the samples: the sigmoid of each sample is evaluated once
    # and shared by the loss and both gradients.
    for i in range(n_samples):
        sig = sigmoid(w.T.dot(X[i]) + bias)
        loss -= logistic(y[i], sig)

        err = sig - y[i]
        dw += err * X[i]
        db += err

    loss += alpha * w.T.dot(w)
    loss = (1 / n_samples) * loss

    dw = dw / n_samples
    db = db * (1 / n_samples)

    return loss, dw, np.array(db).reshape(1,)

In [None]:
# Reference loss from scikit-learn: build a two-column probability matrix
# (first column 1 - sigmoid, second column sigmoid) from the current (w2, b2).
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# Loss and analytic gradients of the loop-based log loss, without regularization.
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
# (note the looser 1e-4 tolerance for w compared with 1e-5 for the bias)
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  4.0125104342358195e-16
Gradient check w
numerical: -0.010376 analytic: -0.010376, relative error: 1.247357e-09
numerical: -0.646736 analytic: -0.646736, relative error: 1.572244e-10
numerical: -0.000405 analytic: -0.000405, relative error: 8.698469e-08
numerical: -2.057303 analytic: -2.057303, relative error: 2.221650e-10
numerical: -0.008380 analytic: -0.008380, relative error: 1.112005e-09
numerical: -2.057303 analytic: -2.057303, relative error: 2.221650e-10
numerical: -0.000361 analytic: -0.000361, relative error: 1.349820e-08
numerical: 0.021346 analytic: 0.021346, relative error: 3.417589e-10
numerical: -0.010376 analytic: -0.010376, relative error: 1.247357e-09
numerical: 0.007945 analytic: 0.007945, relative error: 1.414640e-09
numerical: 0.021346 analytic: 0.021346, relative error: 3.417589e-10
numerical: 0.009822 analytic: 0.009822, relative error: 9.605254e-10
numerical: 0.014823 analytic: 0.014823, relative error: 1.395529e-09
numerical: -1.703354 analytic: -1

# Naive with regularization

In [None]:
# Same gradient check as the unregularized cell, now with alpha=1 so the
# w . w penalty term is part of the loss.
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: 0.014823 analytic: 0.014823, relative error: 1.424336e-09
numerical: -2.057303 analytic: -2.057303, relative error: 2.233022e-10
numerical: -0.000404 analytic: -0.000404, relative error: 7.878113e-08
numerical: -0.163838 analytic: -0.163838, relative error: 9.669251e-11
numerical: -0.163838 analytic: -0.163838, relative error: 9.669251e-11
numerical: -0.364052 analytic: -0.364052, relative error: 5.654486e-10
numerical: -1.703354 analytic: -1.703354, relative error: 1.273177e-10
numerical: -0.646736 analytic: -0.646736, relative error: 1.557427e-10
numerical: -0.010376 analytic: -0.010376, relative error: 1.316386e-09
numerical: -2.057303 analytic: -2.057303, relative error: 2.233022e-10
numerical: -1.649472 analytic: -1.649473, relative error: 3.665622e-08
numerical: 0.014823 analytic: 0.014823, relative error: 1.424336e-09
numerical: -0.000404 analytic: -0.000404, relative error: 7.878113e-08
numerical: 0.021345 analytic: 0.021345, relative error: 2.868659

# Vectorized

In [None]:
def log_loss_vectorized(w, b, X, y, alpha=0):
    """
    log loss function WITHOUT FOR LOOPs

    loss = (-sum_i [y_i log(s_i) + (1 - y_i) log(1 - s_i)] + alpha * w . w) / N
    with s = sigmoid(Xw + b)

    Returns a tuple of:
    - loss
    - gradient with respect to weights w
    - gradient with respect to bias b, as an array of shape (1,)
    """
    n_samples = y.shape[0]

    sig = sigmoid(X.dot(w) + b)
    # predicted probability minus label, shared by both gradients
    err = sig - y

    # loss: np.sum reduces inside NumPy instead of iterating in Python
    log_likelihood = np.sum(y * np.log(sig) + (1 - y) * np.log(1 - sig))
    loss = (-log_likelihood + alpha * w.T.dot(w)) / n_samples

    # gradient with respect to weights w
    dw = (2 * alpha * w + X.T.dot(err)) / n_samples

    # gradient with respect to bias b
    db = np.sum(err) / n_samples

    return loss, dw, np.array(db).reshape(1,)

In [None]:
# Reference loss from scikit-learn: build a two-column probability matrix
# (first column 1 - sigmoid, second column sigmoid) from the current (w2, b2).
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

# Loss and analytic gradients of the vectorized log loss, without regularization.
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
# (note the looser 1e-4 tolerance for w compared with 1e-5 for the bias)
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  3.210008347388655e-16
Gradient check w
numerical: -1.703354 analytic: -1.703354, relative error: 1.282674e-10
numerical: -3.588433 analytic: -3.588433, relative error: 9.748116e-09
numerical: -0.163838 analytic: -0.163838, relative error: 7.738657e-11
numerical: 0.010976 analytic: 0.010976, relative error: 1.048020e-09
numerical: 0.021346 analytic: 0.021346, relative error: 3.417581e-10
numerical: 0.009822 analytic: 0.009822, relative error: 9.605254e-10
numerical: -0.000867 analytic: -0.000867, relative error: 5.226022e-09
numerical: -0.000405 analytic: -0.000405, relative error: 8.012872e-08
numerical: -0.019708 analytic: -0.019708, relative error: 3.571989e-10
numerical: 0.014823 analytic: 0.014823, relative error: 1.582774e-09
numerical: -0.008380 analytic: -0.008380, relative error: 1.112005e-09
numerical: -0.019708 analytic: -0.019708, relative error: 3.571989e-10
numerical: 0.021346 analytic: 0.021346, relative error: 3.417581e-10
numerical: 6.546044 analytic: 6.54

# Vectorized with regularization

In [None]:
# Same gradient check as the unregularized cell, now with alpha=1 so the
# w . w penalty term is part of the loss.
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
# grad_check_sparse has no return statement, so grad_numerical ends up None.
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: 0.007945 analytic: 0.007945, relative error: 6.964529e-10
numerical: 0.156493 analytic: 0.156493, relative error: 8.214316e-11
numerical: -1.649472 analytic: -1.649473, relative error: 3.665622e-08
numerical: 0.021345 analytic: 0.021345, relative error: 2.868651e-10
numerical: -0.163838 analytic: -0.163838, relative error: 7.975107e-11
numerical: -0.001032 analytic: -0.001032, relative error: 4.649319e-09
numerical: -3.588432 analytic: -3.588432, relative error: 9.748412e-09
numerical: -0.026106 analytic: -0.026106, relative error: 8.136052e-10
numerical: -0.000404 analytic: -0.000404, relative error: 8.564436e-08
numerical: 6.546044 analytic: 6.546044, relative error: 1.416425e-08
numerical: -0.163838 analytic: -0.163838, relative error: 7.975107e-11
numerical: -0.019707 analytic: -0.019707, relative error: 3.651731e-10
numerical: -0.163838 analytic: -0.163838, relative error: 7.975107e-11
numerical: -0.000866 analytic: -0.000866, relative error: 4.650089e-

# Gradient descent for Linear models

In [None]:
class LinearModel():
    """
    Base class for linear models trained with mini-batch gradient descent.

    Subclasses supply loss (returning loss, dw, db) and predict; train only
    handles parameter initialization, batching and the update rule.
    """

    def __init__(self):
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False):
        """
        Fit w and b with stochastic gradient descent.

        Inputs:
        - X: data matrix of shape (N, d)
        - y: target vector with N entries
        - learning_rate: step size of each update
        - alpha: regularization strength forwarded to self.loss
        - num_iters: number of update steps
        - batch_size: rows sampled (without replacement) per step; capped at N
        - verbose: when True, print the loss every 10000 iterations

        Returns the list of batch losses, one entry per iteration.
        """
        N, d = X.shape

        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Sampling without replacement cannot draw more rows than exist, so a
        # batch_size above N (e.g. the default 200 on a small dataset) falls
        # back to full-batch updates instead of raising ValueError.
        batch_size = min(batch_size, N)

        # Run stochastic gradient descent to optimize w
        loss_history = []
        for it in range(num_iters):
            # Sample batch_size rows: X_batch is (batch_size, d), y_batch is (batch_size,)
            rand = np.random.choice(N, batch_size, replace=False)
            X_batch = X[rand, :]
            y_batch = y[rand]

            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # perform parameter update
            self.w -= learning_rate * dw
            self.b -= learning_rate * db

            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """ Return predictions for the rows of X; implemented by subclasses. """
        raise NotImplementedError("subclasses must implement predict")

    def loss(self, X_batch, y_batch, reg):
        """ Return (loss, dw, db) for one batch; implemented by subclasses. """
        raise NotImplementedError("subclasses must implement loss")

class LinearRegressor(LinearModel):
    """ Linear model fitted with the (optionally L2-regularized) MSE loss """

    def loss(self, X_batch, y_batch, alpha):
        """ Batch MSE loss and its gradients at the current parameters. """
        weights, bias = self.w, self.b
        return mse_loss_vectorized(weights, bias, X_batch, y_batch, alpha)

    def predict(self, X):
        """ Return the affine prediction X . w + b for every row of X. """
        return X.dot(self.w) + self.b

class LogisticRegressor(LinearModel):
    """ Logistic regression (binary classifier trained with the log loss) """

    def loss(self, X_batch, y_batch, alpha):
        """ Batch log loss and its gradients at the current parameters. """
        return log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)

    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        scores = X.dot(self.w) + self.b
        # A positive score is the same as sigmoid(score) > 0.5, so the sigmoid
        # is not needed for hard labels. The labels keep the dtype of the scores.
        return (scores > 0).astype(scores.dtype)

## Linear regression with gradient descent

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: scikit-learn's LinearRegression fitted and scored on the same
# training data.
sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
# NOTE(review): predictions are passed as the first argument; scikit-learn
# documents the order as (y_true, y_pred). MSE is symmetric, so the value is
# the same either way.
sk_mse = mean_squared_error(sk_pred, y_train1)

# Our model: 75000 mini-batch gradient-descent steps of 64 samples each.
model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
# Accept the gradient-descent fit if its training MSE is less than 100 above
# the scikit-learn baseline.
assert mse - sk_mse < 100

iteration 0 / 75000: loss 25977.313070
iteration 10000 / 75000: loss 3818.439766
iteration 20000 / 75000: loss 2990.781528
iteration 30000 / 75000: loss 2259.751556
iteration 40000 / 75000: loss 3564.815899
iteration 50000 / 75000: loss 3599.266503
iteration 60000 / 75000: loss 3402.548861
iteration 70000 / 75000: loss 2694.861284
MSE scikit-learn: 2859.69634758675
MSE gradient descent model : 2884.4364378534956


## Logistic regression with gradient descent

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardize the features before fitting; X_train2 is rebound to the scaled copy.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

# Baseline: scikit-learn's LogisticRegression fitted and scored on the same
# training data.
sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
# log_loss takes (y_true, y_pred). Both models are scored on hard 0/1
# predictions here, so the comparison is about label agreement rather than
# predicted probabilities.
sk_log_loss = log_loss(y_train2, sk_pred)

# Our model: 75000 mini-batch gradient-descent steps of 64 samples each.
model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_log_loss = log_loss(y_train2, pred)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.694113
iteration 10000 / 75000: loss 0.088341
iteration 20000 / 75000: loss 0.030155
iteration 30000 / 75000: loss 0.138437
iteration 40000 / 75000: loss 0.037893
iteration 50000 / 75000: loss 0.051293
iteration 60000 / 75000: loss 0.039052
iteration 70000 / 75000: loss 0.097594
Log-loss scikit-learn: 0.44341928598210933
Log-loss gradiet descent model : 0.44341928598210933
Error : 0.0
