### The sigmoid function
$$\boxed{\phi(z) = \frac{1}{1+ e^{-z}}}$$

In [1]:
import matplotlib.pyplot as plt 
import numpy as np
def sigmoid(z):
    """Return the logistic sigmoid phi(z) = 1 / (1 + exp(-z)).

    `z` may be a scalar or a NumPy array; the function is applied
    element-wise.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator
# Evaluate the sigmoid on a grid over [-7, 7) and plot the resulting curve.
z = np.arange(-7, 7, 0.1)
phi_z = sigmoid(z)
plt.plot(z, phi_z)
plt.axvline(0.0, color='k')  # vertical line at z = 0, where phi(z) = 0.5
plt.ylim(-0.1, 1.1)
plt.xlabel('z')
# Raw string: '\p' is not a valid escape sequence, so a plain string literal
# here triggers an invalid-escape warning on recent Python versions.
plt.ylabel(r'$\phi (z)$')
plt.yticks([0.0, 0.5, 1.0])  # ticks at the asymptotes and the midpoint
ax = plt.gca()
ax.yaxis.grid(True)  # horizontal grid lines at the y ticks
plt.show()

<Figure size 640x480 with 1 Axes>

### Logistic regression
Logistic regression is a classification model. Assume that we are dealing with a binary classification problem where all data points are in $\mathbb{R}^d$ and their labels are either $0$ or $1$. Given a weight vector $w \in \mathbb{R}^d$, we can define the probability that an arbitrary data point $x$ belongs to class $1$ by
$$\mathbb{P}(y=1| x,w) = \phi (w^Tx) = \frac{1}{1 + e^{-w^Tx}}.$$
The probability that $x$ belongs to class $0$ is then 
$$\mathbb{P}(y=0| x,w) = 1- \phi (w^Tx) = \frac{1}{1 + e^{w^Tx}}.$$
The outcome of the logistic regression is then 
$$\hat{y} = \underset{k \in \{0, 1\}}{\operatorname{argmax}}\ \mathbb{P}(y=k| x,w).$$
### Learning the weights by minimizing the logistic cost function
In order to find the optimal weight $w^*$ from training data $(x_1, y_1), \dots, (x_n, y_n)$, we write $z_i = w^Tx_i$ and minimize the following cost function
$$ \boxed{J(w) = \sum_{i=1}^{n}\left[-y_i\log(\phi(z_i))- (1-y_i)\log(1-\phi(z_i))\right].}$$
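We compute the gradient and the Hessian of $J$. Using the chain rule $\nabla_w\phi(z_i) = \phi'(z_i)x_i$ and the identity $\phi'(z) = \phi(z)(1-\phi(z))$, we have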
\begin{align*}
\nabla_{w} J(w)& = -\sum_{i = 1}^{n}\left[y_i\frac{1}{\phi(z_i)}\nabla_{w}\phi(z_i)-(1-y_i)\frac{1}{1-\phi(z_i)}\nabla_w\phi(z_i)\right]\\
& = -\sum_{i=1}^{n}\left[y_i\frac{1}{\phi(z_i)}-(1-y_i)\frac{1}{1-\phi(z_i)}\right]\phi'(z_i)x_i\\
& = -\sum_{i=1}^{n}\left[y_i\frac{1}{\phi(z_i)}-(1-y_i)\frac{1}{1-\phi(z_i)}\right]\phi(z_i)(1- \phi(z_i))x_i\\
& = -\sum_{i=1}^n[y_i - \phi(z_i)]x_i.
\end{align*}
Thus the formula for the gradient of $J$ is
$$\boxed{\nabla_w J(w) = \sum_{i=1}^n[\phi(z_i)-y_i]x_i = X^T(\phi-y).}$$
where 
$$
X = \begin{bmatrix}
x_1^T\\
\vdots\\
x_n^T
\end{bmatrix},
\hspace{1cm}
y = \begin{bmatrix}
y_1\\
\vdots\\
y_n
\end{bmatrix},
\hspace{1cm}
\phi = \begin{bmatrix}
\phi(z_1)\\
\vdots\\
\phi(z_n)
\end{bmatrix}.
$$
In a similar way, we obtain the following formula for the Hessian of $J$ 
$$
\boxed{\nabla^2_{w} J(w) = \sum_{i=1}^n\phi(z_i)(1-\phi(z_i))x_ix_i^T = X^T\operatorname{diag}\{\phi\odot(1-\phi)\}X.}
$$
By applying gradient descent with step size $\eta$, starting from an initial guess $\hat{w}$, we obtain the following update rule, where $\phi$ is evaluated at the current iterate $w_t$

\begin{align*}
w_0 &= \hat{w}\\
w_{t+1} &= w_t - \eta X^T(\phi-y).
\end{align*}

In [2]:
from sklearn import datasets
import numpy as np

In [3]:
# Load the iris dataset and keep only feature columns 2 and 3
# (petal length and petal width in scikit-learn's iris feature order).
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [5]:
from sklearn.preprocessing import StandardScaler
# Estimate the scaling statistics on the training split only, then apply the
# same transformation to both splits so no test information leaks into training.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

In [6]:
# Display the standardized training features as the cell output.
X_train_std

array([[-1.33269725, -1.30728421],
       [-1.16537974, -1.30728421],
       [ 0.84243039,  1.44587881],
       [ 1.0655204 ,  1.18367281],
       [-1.44424226, -1.30728421],
       [ 1.0097479 ,  1.57698181],
       [ 1.56747294,  1.18367281],
       [-1.44424226, -1.30728421],
       [ 1.12129291,  0.5281578 ],
       [ 0.45202286,  0.1348488 ],
       [-0.27301968, -0.2584602 ],
       [ 0.06161534,  0.2659518 ],
       [-1.38846976, -1.30728421],
       [ 0.50779537,  0.0037458 ],
       [ 0.11738784,  0.1348488 ],
       [ 0.73088538,  0.92146681],
       [-1.05383474, -1.30728421],
       [-0.16147468, -0.2584602 ],
       [ 0.06161534,  0.0037458 ],
       [-1.22115225, -1.30728421],
       [ 0.56356787,  0.79036381],
       [ 1.73479045,  1.44587881],
       [ 0.39625036,  0.3970548 ],
       [ 0.39625036,  0.1348488 ],
       [ 0.00584283, -0.1273572 ],
       [ 1.0097479 ,  1.57698181],
       [ 0.50779537,  0.2659518 ],
       [ 1.0097479 ,  0.2659518 ],
       [ 1.12129291,

In [7]:
class LogisticRegressionGD:
    """Binary logistic regression classifier trained with batch gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate (step size) of each gradient-descent update.
    n_iter : int
        Number of passes (epochs) over the training set.
    random_state : int
        Seed of the random number generator used to initialize the weights.

    Attributes
    ----------
    w_ : ndarray of shape (1 + n_features,)
        Weights after fitting; ``w_[0]`` is the bias term.
    cost_ : list of float
        Logistic cost of each epoch, evaluated with the weights that were in
        effect at the start of that epoch.
    """

    def __init__(self, eta=0.05, n_iter=100, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the weights to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Binary class labels, each 0 or 1.

        Returns
        -------
        self : LogisticRegressionGD
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        rgen = np.random.RandomState(self.random_state)
        # Small random initial weights; index 0 holds the bias term.
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []

        for _ in range(self.n_iter):
            net_input = self.net_input(X)
            output = self.activation(net_input)
            errors = y - output
            # Step along the negative gradient X^T (phi - y) of the cost.
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            # -y*log(phi(z)) - (1-y)*log(1-phi(z)) equals log(1 + e^z) - y*z.
            # The logaddexp form stays finite when the sigmoid saturates to
            # exactly 0 or 1, where the direct formula produces 0 * log(0) = nan.
            cost = np.sum(np.logaddexp(0.0, net_input) - y * net_input)
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        """Return the linear combination w^T x + bias for every row of X."""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def activation(self, z):
        """Return the logistic sigmoid of z, element-wise.

        The input is clipped to [-250, 250] so that ``np.exp`` cannot overflow.
        """
        return 1. / (1. + np.exp(-np.clip(z, -250, 250)))

    def predict(self, X):
        """Return the class label (0 or 1) for every row of X.

        A non-negative net input corresponds to phi(z) >= 0.5, i.e. class 1.
        """
        return np.where(self.net_input(X) >= 0.0, 1, 0)

In [8]:
# LogisticRegressionGD is a binary classifier, so keep only classes 0 and 1.
binary_mask = (y_train == 0) | (y_train == 1)
# Train on the standardized features, consistent with the rest of the
# notebook: gradient descent with a fixed step size is sensitive to feature scale.
X_train_01_subset = X_train_std[binary_mask]
y_train_01_subset = y_train[binary_mask]
lrgd = LogisticRegressionGD(eta=0.05, n_iter=1000, random_state=1)
lrgd.fit(X_train_01_subset, y_train_01_subset)
y_pre = lrgd.predict(X_train_01_subset)

In [9]:
from sklearn.metrics import accuracy_score
# Training accuracy of the gradient-descent model on the class-0/1 subset.
accuracy_score(y_train_01_subset, y_pre)

1.0

In [10]:
from sklearn.linear_model import LogisticRegression
# Fit scikit-learn's logistic regression on all three classes.
# `multi_class` is not passed: 'auto' is its default and the parameter is
# deprecated in recent scikit-learn releases, so omitting it keeps the same
# model without the deprecation warning.
lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs')
lr.fit(X_train_std, y_train)
y_hat = lr.predict(X_train_std)
# Accuracy on the training split (not a held-out estimate).
accuracy_score(y_true=y_train, y_pred=y_hat)

0.9523809523809523