In [14]:
from __future__ import division
import numpy as np

# Python 2/3 compatibility shim: if the name `xrange` is undefined (the bare
# lookup raises NameError), alias it to `range`.
# NOTE(review): `xrange` is not referenced in the cells visible here -- confirm
# whether this shim is still needed.
try:
    xrange
except NameError:
    xrange = range

def add_intercept(X_):
    """Return a copy of ``X_`` with a leading column of ones (the bias term).

    Args:
        X_: 2-D array of shape (m, n).

    Returns:
        A new float array of shape (m, n + 1) whose first column is all ones
        and whose remaining columns hold the values of ``X_``.
    """
    n_rows, n_cols = X_.shape
    # Start from all ones so column 0 is already the intercept column; the
    # remaining columns are then overwritten with the original features.
    augmented = np.ones((n_rows, n_cols + 1))
    augmented[:, 1:] = X_
    return augmented

def load_data(filename):
    """Read a whitespace-delimited numeric table and split it into X and Y.

    The file is parsed with ``np.loadtxt``. Column 0 is returned as the
    label vector; the remaining columns are the features, to which an
    intercept column is prepended via ``add_intercept``.

    Args:
        filename: path of the text file to load.

    Returns:
        Tuple ``(X, Y)``: the intercept-augmented feature matrix and the
        label vector taken from the first column.
    """
    table = np.loadtxt(filename)
    labels, features = table[:, 0], table[:, 1:]
    return add_intercept(features), labels

def calc_grad(X, Y, theta):
    """Gradient of the average logistic loss for labels in {-1, +1}.

    Computes ``-(1/m) * sum_i y_i * x_i / (1 + exp(y_i * theta^T x_i))``.

    Args:
        X: 2-D array of shape (m, n), one example per row.
        Y: 1-D array of m labels (the notebook's data uses -1 / +1).
        theta: 1-D parameter vector of length n.

    Returns:
        1-D array of length n holding the gradient with respect to theta.
    """
    m, _ = X.shape
    margins = Y * X.dot(theta)
    # 1 / (1 + exp(margins)) evaluated as exp(-log(1 + exp(margins))).
    # np.logaddexp never forms exp(margins) directly, so very large margins
    # (which occur once theta grows) no longer overflow to inf.
    probs = np.exp(-np.logaddexp(0, margins))
    return -(1. / m) * X.T.dot(probs * Y)

def loss(X, Y, theta):
    """Predict labels with ``theta`` and count the misclassified examples.

    Despite its name, this returns the number of prediction errors, not the
    logistic loss value.

    An example is predicted as -1 when its score ``theta^T x`` is negative
    and as +1 otherwise (a zero score maps to +1). This is the same rule as
    comparing sigmoid(score) with sigmoid(-score), but it needs no
    exponentials, so it cannot overflow and does not lose the sign of scores
    very close to zero to rounding.

    Args:
        X: 2-D array of shape (m, n), one example per row.
        Y: 1-D array of m true labels (-1 / +1).
        theta: 1-D parameter vector of length n.

    Returns:
        Tuple ``(y_pred, n_errors)``: the float array of predicted labels and
        the number of entries where ``y_pred`` differs from ``Y``.
    """
    scores = X.dot(theta)
    y_pred = np.where(scores < 0, -1.0, 1.0)
    return y_pred, y_pred[y_pred != Y].size

def logistic_regression(X, Y, learning_rate=10, max_iter=300000, tol=1e-15,
                        log_every=10000):
    """Fit logistic regression by batch gradient descent.

    Starting from theta = 0, repeatedly applies
    ``theta := theta - learning_rate * calc_grad(X, Y, theta)`` until either
    the step length drops below ``tol`` or ``max_iter`` iterations have run.
    The defaults reproduce the constants that were previously hard-coded.

    Args:
        X: 2-D array of shape (m, n), one example per row.
        Y: 1-D array of m labels (-1 / +1).
        learning_rate: step size applied to the gradient.
        max_iter: hard cap on the number of iterations; needed because the
            step length may never fall below ``tol`` on some data.
        tol: stop once the Euclidean norm of the change in theta is below
            this value.
        log_every: print a progress line every this many iterations; a falsy
            value disables progress output.

    Returns:
        The final parameter vector theta (1-D array of length n).
    """
    _, n = X.shape
    theta = np.zeros(n)
    print(Y)
    i = 0
    while True:
        i += 1
        # Safe to alias: theta is rebound to a new array below, not mutated.
        prev_theta = theta
        grad = calc_grad(X, Y, theta)
        theta = theta - learning_rate * grad
        norm = np.linalg.norm(prev_theta - theta)

        if log_every and i % log_every == 0:
            y_pred, los = loss(X, Y, theta)
            print(y_pred)
            print('Finished {0} iterations; Loss: {4}; Diff theta: {1}; theta: {2}; Grad: {3}'.format(
                i, norm, theta, grad, los))
        # The iteration cap is checked before the convergence test, as in the
        # original loop.
        if i >= max_iter:
            break
        if norm < tol:
            print('Converged in %d iterations' % i)
            break
    return theta

In [16]:
def main():
    """Train a logistic-regression model on data set A, then on data set B.

    For each data set, prints a header, loads the file with ``load_data``
    and runs ``logistic_regression`` on it. Training on B was previously
    commented out even though its header was still printed; it is enabled
    here so the output matches the headers.
    """
    datasets = (
        ('A', 'dataset/data_a.txt'),
        ('B', 'dataset/data_b.txt'),
    )
    for label, path in datasets:
        print('==== Training model on data set {0} ===='.format(label))
        X, Y = load_data(path)
        logistic_regression(X, Y)

# Run the experiment only when this code executes as the top-level module
# (i.e. when __name__ == '__main__').
if __name__ == '__main__':
    main()

==== Training model on data set A ====
[-1.  1. -1. -1.  1.  1. -1.  1. -1. -1.  1. -1. -1. -1. -1. -1. -1.  1.
  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1.
  1.  1. -1.  1.  1. -1.  1.  1. -1. -1. -1.  1. -1.  1.  1.  1. -1. -1.
 -1.  1.  1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1. -1.  1.  1.  1.  1.
 -1. -1. -1.  1.  1.  1.  1. -1. -1. -1. -1.  1. -1.  1.  1. -1.  1.  1.
  1. -1.  1.  1.  1.  1. -1.  1.  1. -1.]
[-1.  1. -1. -1.  1.  1. -1.  1. -1. -1.  1. -1. -1. -1. -1. -1. -1. -1.
  1.  1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1.  1.  1.  1. -1. -1.  1.
  1.  1. -1. -1.  1. -1.  1.  1.  1. -1. -1.  1. -1.  1.  1.  1. -1. -1.
 -1.  1.  1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1. -1.  1.  1.  1. -1.
 -1. -1. -1.  1.  1.  1.  1. -1. -1. -1. -1.  1. -1.  1.  1. -1. -1.  1.
  1. -1. -1.  1.  1.  1. -1.  1.  1. -1.]
Finished 10000 iterations; Loss: 8; Diff theta: 7.226491864936692e-07; theta: [-20.81394174  21.45250215  19.85155266]; Grad: [ 4.15154546e-08 -4.2

## 1. Training Stability
Run the given logistic regression code to train two different models on A and B.
#### a. What's the most notable difference in training on datasets A and B:
Training on A converges rapidly, whereas training on B does not converge; the parameters learned on B are also abnormally large, reaching several hundred.
#### b. Find the problem.
---
Note that actually the algorithm used in the code is different from what we learned in the lecture. The update rule for $\theta$ here is
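$$\theta := \theta + \alpha \cdot \frac{1}{m} \sum_{i=1}^{m} \frac{y^{(i)} x^{(i)}}{1 + \exp(y^{(i)} \theta^T x^{(i)})}$$
where $\alpha$ is the learning rate and $m$ is the number of training examples.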
The assumption is stronger than the common logistic regression so the training process is faster.
Given the posterior probability $$p(y|x;\theta) = \frac{1}{1 + \exp(-y\theta^T x)}$$
Proof:
$$\begin{align*}
L(\theta) & = \prod p(y^{(i)}|x^{(i)}; \theta) \\
\ell (\theta) & = - \sum \log(1 + \exp(-y^{(i)} \theta^T x^{(i)})) \\
\frac{\partial}{\partial \theta} \ell(\theta) & = - \sum \frac{1}{1 + \exp(-y^{(i)} \theta^T x^{(i)})} \exp(-y^{(i)} \theta^T x^{(i)}) (-y^{(i)} x^{(i)}) \\
& = \sum \frac{y^{(i)} x^{(i)}}{1 + \exp(y^{(i)} \theta^T x^{(i)})}
\end{align*}$$
After calculating the loss, we see that on dataset B the predictions are already completely correct, yet the gradient is still large; on dataset A, by contrast, some errors always remain, but the gradient becomes small and can eventually be ignored.
It also shows that for a "perfect" (linearly separable) dataset such as B, there are infinitely many solutions to this regression problem, because the likelihood can always be increased by simply scaling $\theta$.

#### c. For each possible modifications, state its feasibility. 
1. Using a different constant learning rate
    - No
2. Decreasing the learning rate over time (e.g. scaling the initial learning rate by $1/t^2$, where $t$ is the number of iterations thus far)
    - Yes
3. Adding a regularization term $||\theta||_2^2$ to the loss function.
    - Yes
4. Linear scaling of the input features.
    - No
5. Adding zero-mean Gaussian noise to the training data or labels.
    - Yes

#### d. Are SVM vulnerable to datasets like B? Why or Why not?
Dataset B won't cause a problem for an SVM, because an SVM maximizes the geometric margin, which is already normalized by $\left|\left|\theta\right|\right|$ and is therefore unaffected by scaling $\theta$.