# Exercise 2: Logistic Regression 

In [3]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pdb

## 1. Data preparation

In [4]:
cancer = datasets.load_breast_cancer()

In [6]:
X = cancer.data
y = cancer.target
m, n = X.shape
print(m, n)

569 30


In [7]:
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [23]:
# feature scaling
def feature_normalize(X):
    X_norm = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])
    
    mu = np.mean(X, axis = 0)
    sigma = np.std(X, axis = 0)
    X_norm = (X - mu) / sigma
    
    return X_norm, mu, sigma

In [24]:
X, mu, sigma = feature_normalize(X)
#print('mean:', mu)
#print('standard deviation:', sigma)

In [25]:
# add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

## 2. Sigmoid Function: $g(z) = \frac{1}{1 + e^{-z}}$

In [26]:
def sigmoid(z):
    z = np.array(z)
    g = 1 / (1 + np.exp(-z))
    return g

In [31]:
sigmoid(36)
sigmoid(37)

1.0

In [30]:
print(sigmoid(-709))
print(sigmoid(-710))

1.216780750623423e-308
0.0


In [29]:
np.log(0)

  """Entry point for launching an IPython kernel.


-inf

## 3. Cost Function: $J(\theta)=\frac{1}{m} \sum_{i=1}^{m}\left[-y^{(i)} \log \left(h_{\theta}\left(x^{(i)}\right)\right)-\left(1-y^{(i)}\right) \log \left(1-h_{\theta}\left(x^{(i)}\right)\right)\right]$

In [33]:
def cost_func(X, y, theta):
    m = y.size
    z = X.dot(theta) 
    h = sigmoid(z)
    # add 1e-4 to deal with log(0)
    loss = (-y.dot(np.log(h + 1e-4)) - (1 - y).dot(np.log(1 - h + 1e-4))) / m   
    #loss = (-y.dot(np.log(h)) - (1 - y).dot(np.log(1 - h))) / m 
    return loss

In [34]:
# initializing parameters
theta = np.zeros(31)
iters = 1000; 
lr = 1;  
# lr = 50, iters = 500000 to get better result

In [35]:
cost_func(X, y ,theta)

0.6929472005572793

## 3. Gradient Descent: $\theta_{j}=\theta_{j}-\alpha \frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)}$

In [36]:
# vectorization implementation of gradient descent
def grad_descent(X, y, theta, lr, iters):
    m = len(y)
    X_T = X.T   
    for i in range(iters):
        h = sigmoid(X.dot(theta))
        delta = X_T.dot(h - y)
        theta -= lr*delta / m
        print(cost_func(X, y, theta))
    return theta

In [37]:
learned_theta = grad_descent(X, y, theta, lr, iters)

0.16940818998606813
0.13526863786207657
0.11913940110266186
0.11150729610714967
0.10668593398160987
0.10297269605410392
0.0999011839099119
0.0972765760127895
0.0949907878315896
0.0929735604903328
0.09117503122734988
0.0895579472458902
0.08809355085085827
0.08675912944451715
0.08553643263380294
0.08441059090258768
0.0833693475070747
0.08240249708686284
0.08150146630591874
0.0806589950670787
0.07986889063045852
0.07912583558338113
0.07842523621690983
0.0777631016303349
0.07713594647944352
0.07654071210911564
0.07597470211662133
0.0754355293405306
0.07492107196818337
0.07442943697435024
0.07395892949467371
0.07350802703441568
0.07307535764054335
0.0726596813409166
0.07225987429110745
0.07187491517657965
0.07150387350254057
0.07114589947093283
0.07080021519767281
0.07046610706632463
0.07014291904918782
0.06983004685500972
0.06952693278556
0.06923306120217002
0.06894795451886225
0.06867116965152642
0.06840229486324129
0.06814094695470886
0.06788676875618033
0.0676394268834736
0.067398609725

0.047071534647691815
0.04706501995838288
0.04705851459530888
0.0470520185298738
0.04704553173361882
0.04703905417822102
0.047032585835493114
0.047026126677381964
0.04701967667596824
0.047013235803465274
0.04700680403221821
0.04700038133470332
0.04699396768352711
0.04698756305142562
0.046981167411263104
0.04697478073603178
0.04696840299885077
0.04696203417296572
0.04695567423174685
0.04694932314868982
0.04694298089741338
0.046936647451659694
0.04693032278529294
0.04692400687229901
0.046917699686784435
0.046911401202975475
0.04690511139521794
0.04689883023797595
0.04689255770583133
0.04688629377348316
0.04688003841574651
0.046873791607552236
0.046867553323945974
0.046861323540087596
0.04685510223125011
0.0468488893728198
0.04684268494029467
0.046836488909284164
0.04683030125550836
0.04682412195479744
0.0468179509830912
0.04681178831643747
0.046805633930992824
0.04679948780302072
0.0467933499088914
0.04678722022508149
0.046781098728172765
0.04677498539485186
0.0467688802019096
0.046762783

In [38]:
learned_theta

array([-0.12948648, -0.26247266, -0.25786444, -0.23355655, -0.42385543,
       -0.19343788,  1.15414035, -1.19203752, -1.41650753,  0.31403422,
        0.35100371, -2.11911777,  0.45748506, -0.98983429, -1.64744384,
       -0.45810669,  1.09954117,  0.18819926, -0.51937292,  0.41483154,
        1.09216663, -1.50987604, -2.04008953, -1.12726968, -1.51412922,
       -0.86908548,  0.1652365 , -1.30017933, -1.35763342, -1.33022925,
       -0.8754259 ])

## 4. Train Accuracy

In [39]:
def predict(X, theta):
    m = X.shape[0]
    p = np.round(sigmoid(X.dot(theta)))
    return p

In [40]:
p = predict(X, learned_theta)
print("Training Accuracy : {:.2f}%".format(np.mean(p == y) * 100))

Training Accuracy : 98.77%


In [41]:
initial_theta = np.zeros(31)
p = predict(X, initial_theta)
print("Training Accuracy : {:.2f}%".format(np.mean(p == y) * 100))

Training Accuracy : 37.26%
