In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random generator so the notebook's random draws
# are repeatable (genX and gen_data reseed with the same value themselves).
np.random.seed(7)

In [2]:
def genX(n):
    """Generate n rows of synthetic features with a fixed random seed.

    The global NumPy RNG is reseeded with 7 on every call, so the same n
    always yields the same draws.

    Parameters
    ----------
    n : int
        Number of observations to generate.

    Returns
    -------
    tuple
        (x0, x1, x2, df) where x0 is a vector of ones (intercept column),
        x1 is uniform on [-5, 5), x2 is uniform on [-2, 2), and df is a
        DataFrame holding the three vectors as columns 'x0', 'x1', 'x2'.
    """
    np.random.seed(7)

    intercept = np.ones(n)
    # Draw order matters for reproducibility: x1 first, then x2.
    feature_1 = np.random.uniform(low=-5, high=5, size=n)
    feature_2 = np.random.uniform(low=-2, high=2, size=n)

    columns = {'x0': intercept, 'x1': feature_1, 'x2': feature_2}
    return intercept, feature_1, feature_2, pd.DataFrame(columns)

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)).

    Works on scalars and, through NumPy, elementwise on arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [3]:
def gen_data(n):
    """Simulate n binary outcomes from a logistic model on the genX features.

    The linear predictor is 3*x0 + 1*x1 - 2*x2 plus Gaussian noise with
    standard deviation 0.05; each label is a Bernoulli draw whose success
    probability is the sigmoid of that predictor.

    Parameters
    ----------
    n : int
        Number of observations to simulate.

    Returns
    -------
    tuple
        (features, labels): the DataFrame produced by genX and an array of
        0/1 outcomes of length n.
    """
    np.random.seed(7)  # genX reseeds with the same value before it draws
    intercept, feature_1, feature_2, features = genX(n)

    # Noise is drawn after the features and before the labels, which fixes
    # the sequence of values taken from the global RNG.
    noise = np.random.normal(0, 0.05, n)
    logits = 3 * intercept + 1 * feature_1 - 2 * feature_2 + noise

    labels = np.random.binomial(1, sigmoid(logits))

    return features, labels

In [4]:
def predict_row(row, coefficients):
    """Predicted probability for one observation under a logistic model.

    Parameters
    ----------
    row : array-like
        Feature values of a single observation.
    coefficients : array-like
        Model coefficients, aligned position by position with row.

    Returns
    -------
    float
        Sigmoid of the sum of the elementwise products row * coefficients.
    """
    logit = np.sum(np.multiply(row, coefficients))
    probability = 1 / (1 + np.exp(-logit))
    return probability

In [5]:
def mse(y, yhat):
    """Mean squared error between observed values y and predictions yhat."""
    residuals = y - yhat
    return np.mean(residuals ** 2)

def nll(y, yhat, eps=1e-15):
    """Mean negative log-likelihood (binary cross-entropy) of predictions.

    Parameters
    ----------
    y : array-like
        Observed 0/1 outcomes.
    yhat : array-like
        Predicted probabilities of the outcome being 1.
    eps : float, optional
        Predictions are clipped to [eps, 1 - eps] before taking logs.
        Without clipping, a prediction of exactly 0 or 1 produces log(0),
        and 0 * log(0) turns the whole mean into nan. Predictions already
        inside the interval are left untouched, so their result is the
        same as the unclipped formula.

    Returns
    -------
    float
        -mean(y * log(yhat) + (1 - y) * log(1 - yhat)), always finite for
        finite inputs.
    """
    clipped = np.clip(yhat, eps, 1 - eps)
    return -np.mean(y * np.log(clipped) + (1 - y) * np.log(1 - clipped))

In [6]:
# Sample size; also read as a global by true_model() and the prediction loops.
n = 1000
# X: DataFrame with columns x0, x1, x2; y: array of simulated 0/1 outcomes.
X, y = gen_data(n)

# Random guess

In [7]:
def true_model():
    """Print the MSE and NLL obtained with the data-generating coefficients.

    Scores the coefficients [3, 1, -2] -- the ones used in gen_data's linear
    predictor -- against the notebook globals X and y (n rows).
    """
    print('True model:')
    true_coefs = np.array([3, 1, -2])
    y_pred = np.array(
        [predict_row(X.iloc[idx].to_numpy(), true_coefs) for idx in range(n)]
    )

    print(f'MSE: {mse(y, y_pred)}')
    print(f'NLL: {nll(y, y_pred)}')

In [8]:
# Score an arbitrary coefficient guess, then the true model for comparison.
guess_coefs = np.array([0, 0.5, 1])

y_pred = np.zeros(n)
for row in range(n):
    y_pred[row] = predict_row(X.iloc[row].to_numpy(), guess_coefs)

print(f'MSE: {mse(y, y_pred)}')
print(f'NLL: {nll(y, y_pred)}')

print()
true_model()

MSE: 0.2996565695865238
NLL: 0.9081213170722178

True model:
MSE: 0.06831581573421616
NLL: 0.21844118029674345


# Logistic Regression

For a single observation with features $x$, outcome $y$ and prediction $\hat{y}$, the partial derivative of the NLL with respect to a coefficient $\beta_k$ is given by:

$$
\frac{\partial \text{NLL}}{\partial \beta_k} = (\hat{y} - y) \cdot x_k
$$


This derivative gives the direction and size of the change in $\beta_k$ that increases the error, so we move $\beta_k$ in the opposite direction to reduce it.

##### Update Rule:
Using the learning rate ($\eta$), the update rule for the coefficients is:

$$
\beta_k \gets \beta_k - \eta \cdot \frac{\partial \text{NLL}}{\partial \beta_k}
$$

Substituting the derivative:

$$
\beta_k \gets \beta_k - \eta \cdot (\hat{y} - y) \cdot x_k
$$


In [9]:
# Scratch values for trying a single update step by hand
i = 1  # index of the observation used for the trial update
coefs = np.zeros(3) # (beta_0, beta_1, beta_2), all starting at zero
l_rate = 0.01  # learning rate (eta in the update rule above)

In [10]:
# One gradient step on observation i: predict with the current coefficients,
# then move every coefficient against its share of the error.
x_i = X.iloc[i].to_numpy()
yhat_i = predict_row(x_i, coefs)
residual = yhat_i - y[i]

for k in range(len(coefs)):
    coefs[k] -= l_rate * residual * x_i[k]

In [11]:
# Show the coefficients after the single update step
coefs

array([0.005     , 0.01399594, 0.00052306])

# Stochastic gradient descent

In [16]:
coefs = np.zeros(3) # (beta_0, beta_1, beta_2)
l_rate = 0.01
epochs = 10

# Convert the features once; a pandas .iloc lookup per sample (and per
# coefficient) is far slower than indexing a NumPy array.
X_values = X.to_numpy()

# The loop variable is named `epoch` rather than `_` because assigning `_`
# shadows IPython's "last output" variable. The shuffled indices live in
# `order` rather than `i`, so the scalar row index `i` defined for the
# single-step demo keeps its meaning if that cell is re-run.
for epoch in range(epochs):
    # Visit every observation once per epoch, in a fresh random order
    order = np.arange(n)
    np.random.shuffle(order)
    for index in order:
        x_row = X_values[index]

        # Predict the outcome for this row with the current coefficients
        yhat_i = predict_row(x_row, coefs)

        # Update all coefficients at once: beta_k <- beta_k - eta * (yhat - y) * x_k.
        # yhat_i is computed before the update, so this matches updating the
        # coefficients one at a time.
        coefs -= l_rate * (yhat_i - y[index]) * x_row

coefs

array([ 2.87705294,  1.11426409, -1.80516914])

# Convert our estimator into a function!