In [1]:
import numpy as np

In [21]:
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    return 1 / (1 + np.exp(-z))

# Gradient Descent function
Implement gradient descent function.
* The number of iterations 'num_iters" is the number of times that you'll use the entire training set.
* For each iteration, you'll calculate the cost function using all training examples (there are 'm' training examples), and for all features.
* Instead of updating a single weight $\theta_i$ at a time, we can update all the weights in the column vector:  
$$\mathbf{\theta} = \begin{pmatrix}
\theta_0
\\
\theta_1
\\ 
\theta_2 
\\ 
\vdots
\\ 
\theta_n
\end{pmatrix}$$
* $\mathbf{\theta}$ has dimensions (n+1, 1), where 'n' is the number of features, and there is one more element for the bias term $\theta_0$ (note that the corresponding feature value $\mathbf{x_0}$ is 1).
* The 'logits', 'z', are calculated by multiplying the feature matrix 'x' with the weight vector 'theta'.  $z = \mathbf{x}\mathbf{\theta}$
    * $\mathbf{x}$ has dimensions (m, n+1) 
    * $\mathbf{\theta}$: has dimensions (n+1, 1)
    * $\mathbf{z}$: has dimensions (m, 1)
* The prediction 'h', is calculated by applying the sigmoid to each element in 'z': $h(z) = sigmoid(z)$, and has dimensions (m,1).
* The cost function $J$ is calculated by taking the dot product of the vectors 'y' and 'log(h)'.  Since both 'y' and 'h' are column vectors (m,1), transpose the vector to the left, so that matrix multiplication of a row vector with column vector performs the dot product.
$$J = \frac{-1}{m} \times \left(\mathbf{y}^T \cdot log(\mathbf{h}) + \mathbf{(1-y)}^T \cdot log(\mathbf{1-h}) \right)$$
* The update of theta is also vectorized.  Because the dimensions of $\mathbf{x}$ are (m, n+1), and both $\mathbf{h}$ and $\mathbf{y}$ are (m, 1), we need to transpose the $\mathbf{x}$ and place it on the left in order to perform matrix multiplication, which then yields the (n+1, 1) answer we need:
$$\mathbf{\theta} = \mathbf{\theta} - \frac{\alpha}{m} \times \left( \mathbf{x}^T \cdot \left( \mathbf{h-y} \right) \right)$$

In [25]:
def gradientDescent(X, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''
    m = len(y)
    
    for i in range(num_iters):
        
        z = np.dot(X, theta) # dot product of X and theta
        
        yhat = sigmoid(z)
        
        J = -1/m * np.sum(yhat*np.log(yhat) + (1-y)*np.log(1-yhat)) # cost function
        print(J)

        theta = theta - (alpha/m * np.dot(X.T, yhat-y)) # update the weights theta

    J = float(J)
    return J, theta

In [29]:
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
J, theta = gradientDescent(X, y, np.zeros((3, 1)), 1e-8, 9999)
print(f"\nFinal cost after training: {J:.8f}.")
print(f"Resulting vector of weights: {[round(t, 8) for t in np.squeeze(theta)]}")

0.6238324625039509
0.6239503492468924
0.6240677392757112
0.6241846347083789
0.6243010376522955
0.6244169502043517
0.6245323744509914
0.624647312468273
0.6247617663219309
0.6248757380674368
0.6249892297500597
0.6251022434049269
0.6252147810570837
0.6253268447215532
0.6254384364033949
0.6255495580977642
0.625660211789971
0.6257703994555377
0.625880123060257
0.6259893845602491
0.62609818590202
0.6262065290225167
0.6263144158491851
0.6264218483000252
0.6265288282836473
0.6266353576993271
0.6267414384370613
0.6268470723776218
0.6269522613926105
0.6270570073445132
0.6271613120867532
0.627265177463745
0.6273686053109471
0.627471597454915
0.6275741557133532
0.6276762818951674
0.6277779778005171
0.6278792452208651
0.6279800859390305
0.6280805017292383
0.6281804943571706
0.6282800655800157
0.6283792171465188
0.6284779507970314
0.6285762682635596
0.6286741712698141
0.6287716615312576
0.6288687407551539
0.6289654106406154
0.6290616728786506
0.6291575291522115
0.6292529811362402
0.6293480304977161
