# Logistic Regression <a class="tocSkip">

In [None]:
# Import statements
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

%matplotlib inline
%load_ext nb_black

# Motivation

In [None]:
# Load the iris measurements and reduce them to a two-class, two-feature
# problem: keep the label plus both sepal columns, discard the "virginica"
# rows, and finally remove any rows that still contain missing values.
iris = pd.read_csv(
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
)
iris = iris[["species", "sepal_length", "sepal_width"]]
iris = iris[iris.species != "virginica"].dropna()

In [None]:
# Scatter plot using pandas: one call per species, all drawn on the same axes.
# The first call receives ax=None (the default), so pandas creates the axes;
# the second call reuses the axes returned by the first.
ax = None
for species_name, marker_color in (("setosa", "red"), ("versicolor", "green")):
    ax = iris[iris.species == species_name].plot.scatter(
        x="sepal_length",
        y="sepal_width",
        color=marker_color,
        label=species_name,
        ax=ax,
    )
ax.set_title("Scatter Plot")
plt.show()

The goal is to classify between the two species "setosa" and "versicolor".

# Logistic Regression viewed as a Perceptron

Logistic regression has two trainable parameters: a weight $W$ and a bias $b$. For a vector of features $X$, the prediction of logistic regression is given by

$$
f(X) = \frac{1}{1 + \exp(-[XW + b])} = \sigma(h(X))
$$
where $\sigma(z) = \frac{1}{1 + \exp(-z)}$ and $h(X)=XW + b$.

Parameters $W$ and $b$ are fitted by maximizing the log-likelihood (or minimizing the negative log-likelihood) of the model on the training data. For a training subset $\{X_j, Y_j\}_{j=1}^N$ the normalized negative log likelihood (NLL) is given by 

$$
\mathcal{L} = -\frac{1}{N}\sum_j \log\Big[ f(X_j)^{Y_j} \cdot (1-f(X_j))^{1-Y_j}\Big]
= -\frac{1}{N}\sum_j \Big[ Y_j\log f(X_j) + (1-Y_j)\log(1-f(X_j))\Big]
$$

The following algorithm is used for the **forward** pass:

1. Linear mapping: $h=XW + b$
2. Sigmoid activation function: $f=\sigma(h)$
3. Calculation of NLL: $\mathcal{L} = -\frac{1}{N}\sum_j \Big[ Y_j\log f_j + (1-Y_j)\log(1-f_j)\Big]$

In order to fit $W$ and $b$ we perform gradient descent. We choose a small learning rate $\gamma$ and, after each forward pass, update the parameters in the direction of the negative gradient:

$$W_{\text{new}} = W_{\text{old}} - \gamma \frac{\partial \mathcal{L}}{\partial W}$$

$$b_{\text{new}} = b_{\text{old}} - \gamma \frac{\partial \mathcal{L}}{\partial b}$$

We use the backpropagation method, i.e. the chain rule, to calculate the partial derivatives of the loss function with respect to the parameters of the model.

$$
\frac{\partial\mathcal{L}}{\partial W} = 
\frac{\partial\mathcal{L}}{\partial h} \frac{\partial h}{\partial W} =
\frac{\partial\mathcal{L}}{\partial f} \frac{\partial f}{\partial h} \frac{\partial h}{\partial W}
$$

$$
\frac{\partial\mathcal{L}}{\partial b} = 
\frac{\partial\mathcal{L}}{\partial h} \frac{\partial h}{\partial b} =
\frac{\partial\mathcal{L}}{\partial f} \frac{\partial f}{\partial h} \frac{\partial h}{\partial b}
$$

In [None]:
def linear_forward(x_input, W, b):
    """Apply the affine map ``h = x_input W + b``.

    Args:
        x_input: Input batch; must be matrix-multipliable with ``W``
            (presumably shape ``(n_samples, n_in)`` -- confirm with caller).
        W: Weight matrix.
        b: Bias, broadcast-added to the matrix product.

    Returns:
        The result of ``np.matmul(x_input, W) + b``.
    """
    projected = np.matmul(x_input, W)
    return projected + b

In [None]:
def linear_grad_W(x_input, grad_output, W, b):
    """Gradient of the loss with respect to the weights of the linear map.

    Args:
        x_input: Input batch that was fed to the linear map (needs ``.T``).
        grad_output: Gradient of the loss with respect to the linear output.
        W: Unused; kept so all linear helpers share one signature.
        b: Unused; kept so all linear helpers share one signature.

    Returns:
        ``x_input.T`` matrix-multiplied by ``grad_output``.
    """
    inputs_transposed = x_input.T
    return np.matmul(inputs_transposed, grad_output)

In [None]:
def linear_grad_b(x_input, grad_output, W, b):
    """Gradient of the loss with respect to the bias of the linear map.

    Args:
        x_input: Unused; kept so all linear helpers share one signature.
        grad_output: Gradient of the loss with respect to the linear output.
        W: Unused; kept so all linear helpers share one signature.
        b: Unused; kept so all linear helpers share one signature.

    Returns:
        A scalar: the sum over every entry of ``grad_output``.
    """
    transposed = grad_output.T
    return transposed.sum()

In [None]:
def sigmoid_forward(x_input):
    """Evaluate the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x_input: Pre-activation values (scalar or NumPy array).

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x_input``.
    """
    decay = np.exp(-x_input)
    return 1 / (1 + decay)

We now calculate the partial derivative of the loss function with respect to the input $h$ of the sigmoid:

$$
\frac{\partial \mathcal{L}}{\partial h} = 
\frac{\partial \mathcal{L}}{\partial f}
\frac{\partial f}{\partial h} 
$$

The tensor $\frac{\partial \mathcal{L}}{\partial f}$ comes from the loss function. Let's calculate $\frac{\partial f}{\partial h}$:

$$
\frac{\partial f}{\partial h} = 
\frac{\partial \sigma(h)}{\partial h} =
\frac{\partial}{\partial h} \Big(\frac{1}{1 + e^{-h}}\Big)
= \frac{e^{-h}}{(1 + e^{-h})^2}
= \frac{1}{1 + e^{-h}} \frac{e^{-h}}{1 + e^{-h}}
= \sigma(h) (1 - \sigma(h)) = f (1 - f)
$$

In [None]:
def sigmoid_grad_input(x_input, grad_output):
    """Back-propagate a gradient through the sigmoid activation.

    Uses the identity ``d sigma(h) / dh = sigma(h) * (1 - sigma(h))``.

    Args:
        x_input: Pre-activation values that were fed to the sigmoid.
        grad_output: Gradient of the loss with respect to the sigmoid output.

    Returns:
        Gradient of the loss with respect to ``x_input``.
    """
    # Evaluate the sigmoid once and reuse it; it involves an exp over the
    # whole batch and was previously computed twice per call.
    activation = sigmoid_forward(x_input)
    grad_input = activation * (1 - activation) * grad_output
    return grad_input

In [None]:
def nll_forward(target_pred, target_true):
    """Mean negative log-likelihood (binary cross-entropy) of predictions.

    Computes ``-1/N * sum(y * log(f) + (1 - y) * log(1 - f))`` where ``N`` is
    ``len(target_true)``.

    Args:
        target_pred: Predicted probabilities ``f``, one row per sample.
        target_true: Ground-truth labels ``y`` (0 or 1), one row per sample.
            Any dtype convertible to float is accepted (including object
            arrays holding Python ints).

    Returns:
        The normalized negative log-likelihood as a float scalar.

    Raises:
        AssertionError: If the two inputs have different lengths.
        ZeroDivisionError: If the inputs are empty.
    """
    n_samples = len(target_true)
    assert len(target_pred) == n_samples, "prediction/target length mismatch"

    pred = np.asarray(target_pred, dtype=float)
    true = np.asarray(target_true, dtype=float)
    # Samples are paired along the first axis. Promote 1-D inputs to column
    # vectors so an (N,) array next to an (N, 1) array is still matched
    # sample-by-sample instead of broadcasting to an (N, N) grid.
    if pred.ndim == 1:
        pred = pred[:, np.newaxis]
    if true.ndim == 1:
        true = true[:, np.newaxis]

    # Keep probabilities strictly inside (0, 1): a saturated prediction would
    # otherwise produce log(0) = -inf and 0 * -inf = nan. Values already
    # inside the interval are left untouched.
    eps = np.finfo(float).eps
    pred = np.clip(pred, eps, 1 - eps)

    s = np.sum(true * np.log(pred) + (1 - true) * np.log(1 - pred))
    output = -1 / n_samples * s
    return output

In [None]:
def nll_grad_input(target_pred, target_true):
    """Gradient of the mean negative log-likelihood w.r.t. the predictions.

    For each sample this is ``(f - y) / (N * f * (1 - f))`` where ``N`` is
    ``len(target_pred)``.

    Args:
        target_pred: Predicted probabilities ``f``.
        target_true: Ground-truth labels ``y`` (0 or 1). Any dtype
            convertible to float is accepted (including object arrays).

    Returns:
        A float array holding the gradient for every prediction.

    Raises:
        ZeroDivisionError: If ``target_pred`` is empty.
    """
    scale = 1 / len(target_pred)

    # Cast to float so an object-dtype label array does not turn the gradient
    # (and, downstream, the model weights) into object arrays.
    pred = np.asarray(target_pred, dtype=float)
    true = np.asarray(target_true, dtype=float)

    # Keep probabilities strictly inside (0, 1) so the denominator
    # f * (1 - f) never becomes zero for saturated predictions. Values
    # already inside the interval are left untouched.
    eps = np.finfo(float).eps
    pred = np.clip(pred, eps, 1 - eps)

    grad_input = scale * (pred - true) / (pred * (1 - pred))
    return grad_input

In [None]:
class LogsticRegressionGD:
    """Single-output logistic regression trained by gradient descent.

    Attributes are plain NumPy arrays:
      * ``W`` -- weights of shape ``(n_in, 1)``, drawn from a standard normal.
      * ``b`` -- bias of shape ``(1,)``, initialised to zero.
      * ``h`` -- pre-activation cached by the most recent ``forward`` call;
        it does not exist until ``forward`` has been called once.
    """

    def __init__(self, n_in, lr=0.05):
        """Create the model.

        Args:
            n_in: Number of input features.
            lr: Learning rate used by ``update_params``.
        """
        self.lr = lr
        self.b = np.zeros(1)
        self.W = np.random.randn(n_in, 1)

    def forward(self, x):
        """Return the sigmoid of the linear map of ``x`` and cache ``self.h``."""
        pre_activation = linear_forward(x, self.W, self.b)
        # Remember the pre-activation: update_params needs it for backprop.
        self.h = pre_activation
        return sigmoid_forward(pre_activation)

    def update_params(self, x, nll_grad):
        """Take one gradient-descent step on ``W`` and ``b``.

        Args:
            x: The batch that was passed to the preceding ``forward`` call
                (the cached ``self.h`` is reused, not recomputed from ``x``).
            nll_grad: Gradient of the loss with respect to the model output.
        """
        # Backward pass: loss -> sigmoid input -> linear parameters.
        d_h = sigmoid_grad_input(self.h, nll_grad)
        d_W = linear_grad_W(x, d_h, self.W, self.b)
        d_b = linear_grad_b(x, d_h, self.W, self.b)
        # Descent step; fresh arrays are bound rather than mutated in place.
        self.W = self.W - self.lr * d_W
        self.b = self.b - self.lr * d_b

# Applying the model to the Example

In [None]:
# Encode species names as integer class labels, overwriting the column
# in place: setosa == 1, versicolor == 0.
for species_name, class_label in (("setosa", 1), ("versicolor", 0)):
    iris.loc[iris["species"] == species_name, "species"] = class_label

In [None]:
# Prepare the data
# Feature matrix: every column except the label, as a float array.
X = iris.drop(columns=["species"]).to_numpy(dtype=float)
# Label column vector of shape (n_samples, 1). The explicit float dtype
# matters: the species column may still be object dtype after the in-place
# label encoding, and object arrays would propagate into the gradients and
# weights. This also avoids np.matrix, which NumPy no longer recommends.
y = iris["species"].to_numpy(dtype=float).reshape(-1, 1)

In [None]:
# Train with full-batch gradient descent: every step runs a forward pass on
# the whole of X, evaluates the loss gradient and updates the parameters.
# Model with 2 input features and learning rate 0.05.
model = LogsticRegressionGD(2, 0.05)
number_of_itterations = 30000

for step in range(number_of_itterations):
    # Forward pass; this also caches the pre-activation inside the model.
    y_pred = model.forward(X)

    # Monitoring values. They are recomputed on every step but neither
    # printed nor stored in this cell; after the loop they hold the values
    # from the last forward pass, i.e. from before the final update.
    loss_value = nll_forward(y_pred, y)
    # Fraction of samples whose prediction, thresholded at 0.5, equals y.
    accuracy = ((y_pred > 0.5) == y).mean()
    # Backward pass and gradient-descent step.
    loss_grad = nll_grad_input(y_pred, y)
    model.update_params(X, loss_grad)

In [None]:
# Plotting the result
def plot_model_prediction(prediction_func, X, Y):
    """Draw the decision regions of a binary classifier over the data.

    A 100 x 100 grid spanning the data range (padded by 1 on every side) is
    classified with ``prediction_func`` and drawn as a faint background; the
    data points are drawn on top, coloured by their label.

    Args:
        prediction_func: Callable taking an array of shape ``(n_points, 2)``
            and returning one score per point; scores above 0.5 are treated
            as the positive class.
        X: Array whose first two columns are the plotted coordinates.
        Y: Labels for the rows of ``X``; flattened and converted to float
            before being used as colours.
    """
    u_min = X[:, 0].min() - 1
    u_max = X[:, 0].max() + 1
    v_min = X[:, 1].min() - 1
    v_max = X[:, 1].max() + 1

    U, V = np.meshgrid(np.linspace(u_min, u_max, 100), np.linspace(v_min, v_max, 100))
    UV = np.stack([U.ravel(), V.ravel()]).T
    c = prediction_func(UV).ravel() > 0.5
    # Numeric colours: an object-dtype label array is not a reliable input
    # for matplotlib's colour mapping.
    point_colors = np.asarray(Y, dtype=float).ravel()

    plt.scatter(UV[:, 0], UV[:, 1], c=c, edgecolors="none", alpha=0.15)
    plt.scatter(X[:, 0], X[:, 1], c=point_colors, edgecolors="black")
    plt.xlim(left=u_min, right=u_max)
    plt.ylim(bottom=v_min, top=v_max)
    # plt.gca() returns the axes that were just drawn on. A bare plt.axes()
    # adds a brand-new, empty axes to the figure in current Matplotlib, so
    # the aspect ratio would be set on an overlay instead of on this plot.
    plt.gca().set_aspect("equal")
    plt.show()


# The bound method is already a callable taking the grid points.
plot_model_prediction(model.forward, X, y)