In [1]:
import numpy as np

#### Data

In [2]:
# Toy training set for a one-feature regression:
# X holds the inputs, T the respective true answers.
X = np.asarray((2, 4, 7, 9))
T = np.asarray((3, 5, 5, 6))

#### MSE

In [3]:
def mse(T, Y):
    """
    Returns mean squared error.
    
    Y is a vector of predictions
    T is a vector of true answers
    
    Both arguments may be any array-like (NumPy arrays, lists, tuples):
    they are converted with np.asarray before the arithmetic, so plain
    Python sequences work too.
    
    The result is the sum of squared differences divided by len(Y).
    """
    
    # We could instead do:
    # from sklearn.metrics import mean_squared_error as mse
    
    # Plain lists do not support elementwise subtraction, so convert first
    Y = np.asarray(Y)
    T = np.asarray(T)
    return ((Y - T) ** 2).sum() / len(Y)

# Batch Gradient Descent

In [4]:
def BGD(X, T, epochs=100, lr=0.01, seed=None):
    """
    Batch gradient descent for a linear regression model y = kx + b
    with only one input feature.
    
    X is a vector of all training set inputs
    T is a vector of all respective true answers
    lr stands for learning rate
    epochs is a number of iterations over all samples of training set
    seed is an optional seed for the random initial k and b; when given,
        a private np.random.RandomState(seed) is used so the run is
        reproducible, otherwise the global np.random state is used
    
    X and T may be any array-like; they are converted to float arrays.
    
    Prints the MSE obtained with the final parameters and returns
    the tuple (k, b).
    
    Raises ValueError if X and T have different shapes or are empty.
    """
    
    # Accept lists/tuples as well: with a plain list `2 * X` would
    # repeat the list instead of scaling its elements
    X = np.asarray(X, dtype=float)
    T = np.asarray(T, dtype=float)
    if X.shape != T.shape:
        raise ValueError(
            f'X and T must have the same shape, got {X.shape} and {T.shape}'
        )
    if X.size == 0:
        raise ValueError('X and T must not be empty')
    
    # The np.random module and a RandomState instance expose the same
    # random_sample method, so one code path serves both cases
    rng = np.random if seed is None else np.random.RandomState(seed)
    
    # Initializing slope and intercept with random values from [0; 1)
    k, b = rng.random_sample(2)
    # Number of training samples
    n = len(X)

    # Training the model
    for _ in range(epochs):
        # Calculating our linear regression prediction vector
        # so that we could later use it when finding the gradient
        # of the cost function over all training samples
        Y = k * X + b
        residual = Y - T
        
        # Computing gradient itself
        delta_k = (2 * X * residual).sum() / n
        delta_b = (2 * residual).sum() / n
        
        # Updating slope and intercept by subtracting gradient
        # multiplied by learning rate
        k -= lr * delta_k
        b -= lr * delta_b
    
    # We pass k*X+b instead of just passing Y because we want
    # to see the error calculated for the latest parameters k and b
    # And Y is computed before updating parameters
    print(f'MSE: {mse(T, k*X+b)}')
    
    return k, b

In [5]:
# Fit on the toy data using ten times the default number of epochs.
BGD(X=X, T=T, epochs=1000)

MSE: 0.23734691216242063


(0.3676579468340696, 2.720706302376377)

# Stochastic Gradient Descent

In [6]:
def SGD(X, T, epochs=100, lr=0.01, batch_size=1, seed=None):
    """
    Stochastic (mini-batch) gradient descent for a linear regression
    model y = kx + b with only one input feature.
    
    X is a vector of all training set inputs
    T is a vector of all respective true answers
    epochs is a number of parameter updates (one random batch each)
    lr stands for learning rate
    batch_size is a number of samples drawn for every update
    seed is an optional seed for the initial k, b and the batch sampling;
        when given, a private np.random.RandomState(seed) is used so the
        run is reproducible, otherwise the global np.random state is used
    
    X and T may be any array-like; they are converted to float arrays.
    
    Prints the MSE over the whole training set obtained with the final
    parameters and returns the tuple (k, b).
    
    Raises ValueError if X and T have different shapes or are empty,
    or if batch_size is less than 1.
    
    Note that for every iteration we randomly choose a batch of samples
    and move on to next iteration. We don't choose consecutive batches
    from training set till it ends. That's why we could train our linear
    regression model more on some samples and less on the others.
    
    Changing batch_size to the value other than 1 turns SGD
    into mini-batch gradient descent.
    
    If we were to increase batch_size up to the length of our training set,
    that wouldn't mean SGD turns into BGD because indexes are chosen randomly
    and the same index can appear multiple times in our array of indexes.
    That's why we might train our model multiple times on the same samples
    during a single iteration.
    
    Moreover we could increase batch_size to the value bigger than the length
    of our training set.
    
    Since we're updating our weights for each training sample, the output
    of our model changes for each sample as well (because it uses new k
    and b values). Hence each sample cost's gradient is calculated with different
    model's output y in contrast to BGD where each sample cost's gradient before
    it's summed is calculated with the same output y of the model.
    
    That's why even if we were to consecutively take inputs from training set
    till the whole set is used we wouldn't get the same gradient for the set
    as we do when we use BGD (because of updating weights for each sample which
    causes our model's output to change before we use this output in the next
    sample cost's gradient calculation).
    """
    
    # Accept lists/tuples as well as NumPy arrays
    X = np.asarray(X, dtype=float)
    T = np.asarray(T, dtype=float)
    if X.shape != T.shape:
        raise ValueError(
            f'X and T must have the same shape, got {X.shape} and {T.shape}'
        )
    if X.size == 0:
        raise ValueError('X and T must not be empty')
    # An empty batch would make the gradient 0/0 = nan and silently
    # poison k and b
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')
    
    # The np.random module and a RandomState instance expose the same
    # random_sample / randint methods, so one code path serves both cases
    rng = np.random if seed is None else np.random.RandomState(seed)
    
    # Initializing slope and intercept with random values from [0; 1)
    k, b = rng.random_sample(2)
    # Number of training samples
    n = len(X)

    # Training the model
    for _ in range(epochs):
        # Forming an array of batch_size length of random indexes
        # where each index is in interval [0; n)
        indexes = rng.randint(0, n, batch_size)
        
        # Choosing a batch of random inputs
        batch = np.take(X, indexes)
        # Choosing respective answers
        answers = np.take(T, indexes)
        
        # Calculating our linear regression prediction vector
        # so that we could later use it when finding the gradient
        # of the cost function over our batch samples
        Y = k * batch + b
        residual = Y - answers
        
        # Computing gradient itself
        delta_k = (2 * batch * residual).sum() / batch_size
        delta_b = (2 * residual).sum() / batch_size
        
        # Updating slope and intercept by subtracting gradient
        # multiplied by learning rate
        k -= lr * delta_k
        b -= lr * delta_b
    
    # The error is reported over the whole training set,
    # not only over the last batch
    print(f'MSE: {mse(T, k*X+b)}')
    
    return k, b

In [7]:
# Mini-batch run: two randomly drawn samples per update, 1000 updates.
SGD(X=X, T=T, epochs=1000, batch_size=2)

MSE: 0.2898347910172201


(0.4134318648443603, 2.6595348211929317)