In [3]:
from random import randrange
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import load_breast_cancer, load_diabetes


def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5, error=1e-9):
    """
    sample a few random elements and only return numerical
    in this dimensions
    """

    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])

        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h  # increment by h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # reset

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (
            abs(grad_numerical) + abs(grad_analytic)
        )
        print(
            "numerical: %f analytic: %f, relative error: %e"
            % (grad_numerical, grad_analytic, rel_error)
        )
        assert rel_error < error

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

# Linear regression

In [4]:
data = load_diabetes()
X_train1, y_train1 = data.data, data.target
w1 = np.random.randn(X_train1.shape[1]) * 0.0001
b1 = np.random.randn(1) * 0.0001

In [94]:
def mse_loss_naive(w, b, X, y, alpha=0):
    """
    MSE loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    #dw
    for i in range(w.shape[0]):
        dw[i] = 2 * alpha*w[i]
        for j in range(y.shape[0]):
            dw[i] += (w.T@X[j] - y[j] + b)*X[j][i] 
           
    dw = (2/y.shape[0])*dw
    for i in range(w.shape[0]):
        dw[i] += 2*alpha*w[i]
        
    #db
    tmp = 0
    for j in range(y.shape[0]):
        tmp += (w.T@X[j] - y[j]) + b
    db = (2/y.shape[0])*tmp
    
    #loss
    tmp = 0
    for j in range(y.shape[0]):
        loss += (y[j] - (w.T@X[j] + b))**2
        
    loss = (1/y.shape[0])*loss
    
    for i in range(w.shape[0]):
        loss += alpha * w[i]**2
    
    return loss, dw, np.array(db).reshape(1,)

## Naive Linear regression loss

In [95]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  1.8768911494382515e-16
Gradient check w
numerical: -3.234127 analytic: -3.234125, relative error: 2.927403e-07
numerical: -2.801915 analytic: -2.801913, relative error: 3.603976e-07
numerical: -0.315454 analytic: -0.315454, relative error: 4.125827e-07
numerical: 2.892059 analytic: 2.892060, relative error: 1.620784e-07
numerical: -1.553188 analytic: -1.553187, relative error: 2.317481e-07
numerical: -1.553188 analytic: -1.553187, relative error: 2.317481e-07
numerical: 2.892059 analytic: 2.892060, relative error: 1.620784e-07
numerical: -3.153316 analytic: -3.153316, relative error: 2.185089e-08
numerical: -1.275045 analytic: -1.275043, relative error: 6.115693e-07
numerical: -4.145424 analytic: -4.145423, relative error: 1.398897e-07
numerical: -3.153316 analytic: -3.153316, relative error: 2.185089e-08
numerical: -0.315454 analytic: -0.315454, relative error: 4.125827e-07
numerical: -1.376393 analytic: -1.376394, relative error: 2.454919e-07
numerical: -0.315454 analyt

## Naive Ridge regression loss

In [96]:
loss, dw1, db1 = mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_naive(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -3.234310 analytic: -3.234309, relative error: 1.800654e-07
numerical: -3.234310 analytic: -3.234309, relative error: 1.800654e-07
numerical: -1.376516 analytic: -1.376517, relative error: 4.678099e-07
numerical: -0.315506 analytic: -0.315505, relative error: 1.215614e-07
numerical: 2.892047 analytic: 2.892048, relative error: 1.420538e-07
numerical: -1.376516 analytic: -1.376517, relative error: 4.678099e-07
numerical: -1.552924 analytic: -1.552922, relative error: 6.240039e-07
numerical: -0.315506 analytic: -0.315505, relative error: 1.215614e-07
numerical: -1.275260 analytic: -1.275260, relative error: 2.650219e-07
numerical: -4.296261 analytic: -4.296259, relative error: 2.531005e-07
numerical: -3.234310 analytic: -3.234309, relative error: 1.800654e-07
numerical: -2.801818 analytic: -2.801816, relative error: 4.538960e-07
numerical: -4.296261 analytic: -4.296259, relative error: 2.531005e-07
numerical: 2.892047 analytic: 2.892048, relative error: 1.4205

In [97]:
def mse_loss_vectorized(w, b, X, y, alpha=0):
    """
    MSE loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    - gradient with respect to bias b
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    #loss
    loss = (1/(y.shape[0])) *(((X@w) - y + b).T @ ((X@w) - y + b)) + alpha * (w.T@w)

    #dw
    dw = 2*alpha*w
    tmp = X.T@((X@w) - y + b)
    dw += (2/y.shape[0])*tmp 
    
    #db
    tmp = ((X@w - y) + b).sum()
    db = (2/y.shape[0]) * tmp

    return loss, dw, np.array(db).reshape(1,)


## Vectorised Linear regression loss

In [98]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)

sk_loss = mean_squared_error(X_train1 @ w1 + b1, y_train1)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Loss error :  6.256303831460837e-17
Gradient check w
numerical: -1.275043 analytic: -1.275043, relative error: 4.092561e-08
numerical: -4.296087 analytic: -4.296087, relative error: 2.130632e-08
numerical: -1.376394 analytic: -1.376394, relative error: 4.725760e-08
numerical: -1.275043 analytic: -1.275043, relative error: 4.092561e-08
numerical: -3.234125 analytic: -3.234125, relative error: 1.152227e-08
numerical: -1.553187 analytic: -1.553187, relative error: 6.103518e-08
numerical: -3.234125 analytic: -3.234125, relative error: 1.152227e-08
numerical: -3.153316 analytic: -3.153316, relative error: 6.991591e-09
numerical: -3.234125 analytic: -3.234125, relative error: 1.152227e-08
numerical: 2.892060 analytic: 2.892060, relative error: 4.838398e-09
numerical: -4.296087 analytic: -4.296087, relative error: 2.130632e-08
numerical: -1.275043 analytic: -1.275043, relative error: 4.092561e-08
numerical: -1.376394 analytic: -1.376394, relative error: 4.725760e-08
numerical: -1.275043 analy

## Vectorized ridge regression loss

In [99]:
loss, dw1, db1 = mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w1, dw1, 15,  error=1e-5)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b1: mse_loss_vectorized(w1, b1, X_train1, y_train1, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b1, db1, 15,  error=1e-5)

Gradient check w
numerical: -0.315505 analytic: -0.315505, relative error: 1.994834e-07
numerical: -0.315505 analytic: -0.315505, relative error: 1.994834e-07
numerical: -2.801816 analytic: -2.801816, relative error: 5.122305e-08
numerical: 2.892048 analytic: 2.892048, relative error: 2.541763e-08
numerical: -4.145108 analytic: -4.145108, relative error: 8.256742e-09
numerical: -1.275259 analytic: -1.275259, relative error: 7.672142e-08
numerical: -1.275259 analytic: -1.275259, relative error: 7.672142e-08
numerical: -3.234308 analytic: -3.234308, relative error: 1.575207e-09
numerical: -1.376516 analytic: -1.376516, relative error: 6.888701e-08
numerical: -2.801816 analytic: -2.801816, relative error: 5.122305e-08
numerical: -0.315505 analytic: -0.315505, relative error: 1.994834e-07
numerical: -1.552923 analytic: -1.552923, relative error: 5.312583e-08
numerical: -4.296258 analytic: -4.296258, relative error: 1.666341e-08
numerical: 2.892048 analytic: 2.892048, relative error: 2.5417

# Logistic regression

In [100]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

data = load_breast_cancer()
X_train2, y_train2 = data.data, data.target
w2 = np.random.randn(X_train2.shape[1]) * 0.0001
b2 = np.random.randn(1) * 0.0001

# Naive

In [101]:
def log_loss_naive(w, b, X, y, alpha=0):
    """
    log loss function WITH FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    # dw
    for i in range(w.shape[0]):
        #dw[i] = 2/y.shape[0]*alpha*w[i]
        dw[i] = 2*alpha*w[i]
        tmp = 0
        for j in range(y.shape[0]):
            tmp += (sigmoid(w.T@X[j] + b) - y[j])*X[j][i]

        dw[i] += 1/y.shape[0]*tmp
    
    # loss
    for i in range(y.shape[0]):
        loss -= y[i]*np.log(sigmoid((w.T@X[i]) +b)) + (1-y[i])*np.log(1-sigmoid((w.T@X[i]) + b)) 
        
    loss = 1/y.shape[0]*loss
    
    for i in range(w.shape[0]):
        loss += alpha*(w[i]**2)
    
    
    # db
    tmp = 0
    for j in range(y.shape[0]):
        tmp += sigmoid((w.T@X[j]) + b) -y[j]
    db = (1/y.shape[0])*tmp
    
    return loss, dw, np.array(db).reshape(1,)

In [102]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  1.6025524267140178e-16
Gradient check w
numerical: -0.783596 analytic: -0.783596, relative error: 3.474829e-10
numerical: 25.461410 analytic: 25.461285, relative error: 2.453587e-06
numerical: 0.000254 analytic: 0.000254, relative error: 3.102982e-08
numerical: -0.000862 analytic: -0.000862, relative error: 1.385546e-08
numerical: 0.017422 analytic: 0.017422, relative error: 8.363594e-10
numerical: 25.461410 analytic: 25.461285, relative error: 2.453587e-06
numerical: -0.000862 analytic: -0.000862, relative error: 1.385546e-08
numerical: 6.132527 analytic: 6.132527, relative error: 1.785566e-08
numerical: -0.009087 analytic: -0.009087, relative error: 9.546370e-10
numerical: 6.132527 analytic: 6.132527, relative error: 1.785566e-08
numerical: 73.683412 analytic: 73.683019, relative error: 2.669651e-06
numerical: -0.011252 analytic: -0.011252, relative error: 1.351789e-09
numerical: -0.002936 analytic: -0.002936, relative error: 2.049920e-09
numerical: 73.683412 analytic: 

# Naive with regulariztion

In [103]:
loss, dw2, db2 = log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_naive(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.523745 analytic: -0.523745, relative error: 9.775764e-10
numerical: -0.783390 analytic: -0.783390, relative error: 3.492226e-10
numerical: -0.014601 analytic: -0.014601, relative error: 1.532692e-09
numerical: -0.001086 analytic: -0.001086, relative error: 1.594146e-08
numerical: -0.000068 analytic: -0.000068, relative error: 1.200627e-07
numerical: -0.028808 analytic: -0.028808, relative error: 7.353882e-11
numerical: -2.706662 analytic: -2.706663, relative error: 5.139840e-08
numerical: -0.011717 analytic: -0.011717, relative error: 1.191723e-09
numerical: -0.003331 analytic: -0.003331, relative error: 2.245939e-09
numerical: -0.014601 analytic: -0.014601, relative error: 1.532692e-09
numerical: 0.016921 analytic: 0.016921, relative error: 8.170188e-10
numerical: -4.482691 analytic: -4.482691, relative error: 1.759585e-08
numerical: -0.174697 analytic: -0.174697, relative error: 1.680508e-11
numerical: -1.882964 analytic: -1.882964, relative error: 2.87

# Vectorized

In [104]:
def log_loss_vectorized(w, b,X, y, alpha=0):
    """
    log loss function WITHOUT FOR LOOPs
    
    Returns a tuple of:
    - loss 
    - gradient with respect to weights w
    """
    loss = 0.0
    dw = np.zeros_like(w)
    db = 0.0
    
    # YOUR CODE HERE
    # loss
    loss -= ( np.multiply(y,np.log(sigmoid((X@w) + b))) + np.multiply(1-y , np.log(1-sigmoid((X@w) + b)))).sum()
    loss = (1/y.shape[0]) * loss
    loss += alpha*(w.T@w)
    
    # dw
    dw = 2*alpha*w
    tmp = X.T@(sigmoid(X@w + b) - y)
    dw += (1/y.shape[0])*tmp 
    
    #db
    db = (sigmoid(X@w +b) - y).sum()
    db = 1/y.shape[0]*db
    return loss, dw, np.array(db).reshape(1,)


In [105]:
y_pred_0 = sigmoid(X_train2 @ w2 + b2)
y_pred = np.vstack([1-y_pred_0, y_pred_0]).T
sk_loss = log_loss(y_train2, y_pred)

loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)
assert rel_error(loss, sk_loss) < 1e-9
print("Loss error : ",rel_error(loss, sk_loss))

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=0)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Loss error :  0.0
Gradient check w
numerical: 0.027196 analytic: 0.027196, relative error: 4.679166e-11
numerical: 0.013890 analytic: 0.013890, relative error: 5.502943e-11
numerical: -2.297823 analytic: -2.297823, relative error: 5.624153e-10
numerical: -2.297823 analytic: -2.297823, relative error: 5.624153e-10
numerical: 0.008644 analytic: 0.008644, relative error: 7.879709e-12
numerical: 0.027196 analytic: 0.027196, relative error: 4.679166e-11
numerical: 0.007423 analytic: 0.007423, relative error: 1.512329e-11
numerical: 0.027196 analytic: 0.027196, relative error: 4.679166e-11
numerical: 0.008644 analytic: 0.008644, relative error: 7.879709e-12
numerical: -0.028781 analytic: -0.028781, relative error: 2.034777e-10
numerical: -0.009087 analytic: -0.009087, relative error: 2.671376e-10
numerical: -2.297823 analytic: -2.297823, relative error: 5.624153e-10
numerical: -0.011252 analytic: -0.011252, relative error: 1.184730e-10
numerical: -0.783596 analytic: -0.783596, relative error

# Vectorized with regularization

In [106]:
loss, dw2, db2 = log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)

print("Gradient check w")
# Check with numerical gradient w
f = lambda w2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f, w2, dw2, 15, error=1e-4)

print("Gradient check bias")
# Check with numerical gradient b
f2 = lambda b2: log_loss_vectorized(w2, b2, X_train2, y_train2, alpha=1)[0]
grad_numerical = grad_check_sparse(f2, b2, db2, 15,  error=1e-5)

Gradient check w
numerical: -0.001086 analytic: -0.001086, relative error: 6.009283e-10
numerical: -0.008996 analytic: -0.008996, relative error: 1.986248e-11
numerical: -0.000358 analytic: -0.000358, relative error: 7.566356e-10
numerical: -0.001254 analytic: -0.001254, relative error: 2.971292e-11
numerical: -0.523745 analytic: -0.523745, relative error: 8.927854e-10
numerical: -0.000593 analytic: -0.000593, relative error: 1.538205e-09
numerical: -1.882964 analytic: -1.882964, relative error: 2.875395e-10
numerical: -0.000675 analytic: -0.000675, relative error: 7.328812e-10
numerical: 0.016921 analytic: 0.016921, relative error: 1.608892e-10
numerical: 0.007760 analytic: 0.007760, relative error: 1.096509e-10
numerical: 73.683441 analytic: 73.683048, relative error: 2.669649e-06
numerical: -1.882964 analytic: -1.882964, relative error: 2.875395e-10
numerical: -0.008996 analytic: -0.008996, relative error: 1.986248e-11
numerical: 6.132781 analytic: 6.132781, relative error: 1.785560

# Gradient descent for Linear models

In [125]:
class LinearModel():
    def __init__(self):
        self.w = None
        self.b = None

    def train(self, X, y, learning_rate=1e-3, alpha=0, num_iters=100, batch_size=200, verbose=False):
        N, d = X.shape
        
        if self.w is None: # Initialization
            self.w = 0.001 * np.random.randn(d)
            self.b = 0.0

        # Run stochastic gradient descent to optimize w
        
        loss_history = []
        for it in range(num_iters):
            X_batch = None
            y_batch = None
                                                               
            # Sample batch_size elements in X_batch and y_batch
            # X_batch shape is  (batch_size, d) and y_batch shape is (batch_size,)                                                                                          
            # Hint: Use np.random.choice to generate indices
            c = np.random.choice(N,batch_size,replace=False)
            X_batch = X[c,:]
            y_batch = y[c]
            # evaluate loss and gradient
            loss, dw, db = self.loss(X_batch, y_batch, alpha)
            loss_history.append(loss)

            # perform parameter update                                                                
            # Update the weights w and bias b using the gradient and the learning rate.          
            self.w -= learning_rate*dw
            self.b -= learning_rate*db
            
            if verbose and it % 10000 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))
                
        return loss_history

    def predict(self, X):
        pass

    def loss(self, X_batch, y_batch, reg):
        pass

class LinearRegressor(LinearModel):
    """ Linear regression """

    def loss(self, X_batch, y_batch, alpha):
        return mse_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        return X@self.w + self.b

class LogisticRegressor(LinearModel):
    """ Linear regression """

    def loss(self, X_batch, y_batch, alpha):
        return log_loss_vectorized(self.w, self.b, X_batch, y_batch, alpha)
    
    def predict(self, X):
        """ Return prediction labels vector of 0 or 1 """
        prob = X@self.w + self.b
        prob[prob>0] = 1
        prob[prob<=0] = 0
        return prob


## Linear regression with gradient descent

In [120]:
from sklearn.linear_model import LinearRegression

sk_model = LinearRegression(fit_intercept=True)
sk_model.fit(X_train1, y_train1)
sk_pred = sk_model.predict(X_train1)
sk_mse = mean_squared_error(sk_pred, y_train1)

model = LinearRegressor()
model.train(X_train1, y_train1, num_iters=75000, batch_size=64, learning_rate=1e-2, verbose=True)
pred = model.predict(X_train1)
mse = mean_squared_error(pred, y_train1)

print("MSE scikit-learn:", sk_mse)
print("MSE gradient descent model :", mse)
assert mse - sk_mse < 100

iteration 0 / 75000: loss 35326.999353
iteration 10000 / 75000: loss 3709.200718
iteration 20000 / 75000: loss 3194.399948
iteration 30000 / 75000: loss 3549.434611
iteration 40000 / 75000: loss 2854.799561
iteration 50000 / 75000: loss 3511.380522
iteration 60000 / 75000: loss 3144.369632
iteration 70000 / 75000: loss 2763.857816
MSE scikit-learn: 2859.6903987680657
MSE gradient descent model : 2884.281794602913


## Logistc regression with gradient descent

In [121]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)

sk_model = LogisticRegression(fit_intercept=True)
sk_model.fit(X_train2, y_train2)
sk_pred = sk_model.predict(X_train2)
sk_log_loss = log_loss(sk_pred, y_train2)

model = LogisticRegressor()
model.train(X_train2, y_train2, num_iters=75000, batch_size=64, learning_rate=1e-3, verbose=True)
pred = model.predict(X_train2)
model_log_loss = log_loss(pred, y_train2)

print("Log-loss scikit-learn:", sk_log_loss)
print("Log-loss gradiet descent model :", model_log_loss)
print("Error :", rel_error(sk_log_loss, model_log_loss))
assert rel_error(sk_log_loss, model_log_loss) < 1e-7

iteration 0 / 75000: loss 0.692918
iteration 10000 / 75000: loss 0.154828
iteration 20000 / 75000: loss 0.042210
iteration 30000 / 75000: loss 0.078774
iteration 40000 / 75000: loss 0.103919
iteration 50000 / 75000: loss 0.069588
iteration 60000 / 75000: loss 0.046657
iteration 70000 / 75000: loss 0.124945
Log-loss scikit-learn: 0.4249086712816093
Log-loss gradiet descent model : 0.4249086712816093
Error : 0.0
