# Classification via logistic regression

In this set of exercises, you are going to implement logistic regression for email spam detection. We'll use the `Spambase` dataset from the UCI Machine Learning Repository; you can find a description of the data at https://archive.ics.uci.edu/ml/datasets/spambase.

# Read the comma-delimited file "spambase.data" into `Data`.
Data = readdlm("spambase.data", ',')
# Show the dimensions of the loaded data as the cell output.
size(Data)

Split the data into two sets: a training set and a testing set.

In [None]:
# Fix the RNG seed so the random train/test split below is reproducible.
srand(1234)

# Features are every column but the last; the last column is used as the label.
X = Data[:, 1:(end-1)]
# Affine map 2*(label - 0.5): a label of 0 becomes -1 and a label of 1 becomes +1.
y = 2 .* (Data[:, end] .- 0.5)
n, d = size(X)

# Random permutation of the row indices; the leading `train_ratio` share of the
# permuted rows forms the training set and the remainder forms the test set.
perm = randperm(n)
train_ratio = 0.8
n_train = round(Int, n * train_ratio)

ytrain = y[perm[1:n_train]]
ytest = y[perm[(n_train+1):end]]
Xtrain = X[perm[1:n_train], :]
Xtest = X[perm[(n_train+1):end], :]
;

Standardize the data: center each feature to zero mean and scale it by its standard deviation.

In [None]:
# Standardize every feature using statistics of the TRAINING rows only, so that
# the centering (mu) and the scaling (sigma) come from the same sample and no
# information about the test rows leaks into the preprocessing.
mu = mean(Xtrain, 1)    # per-column means of the training set
sigma = zeros(d)        # per-column standard deviations of the training set
for i = 1:d
    # Must be computed before column i of Xtrain is overwritten below.
    s = std(Xtrain[:, i])
    # A constant column has zero spread; scale it by 1 to avoid dividing by zero.
    sigma[i] = s > 0 ? s : one(s)
    # The test set is transformed with the training statistics.
    Xtrain[:, i] = (Xtrain[:, i] .- mu[i]) ./ sigma[i]
    Xtest[:, i] = (Xtest[:, i] .- mu[i]) ./ sigma[i]
end

The objective function for logistic regression with squared 2-norm regularization, where the labels are $y_i \in \{-1, +1\}$, can be written as
$$ f(w) = \frac{1}{2} \|w\|^2 + C \sum_{i=1}^n \log(1 + \exp ( -y_i w^T x_i ) ). $$

## Question 1.
Complete the following code that evaluates the objective $f$.

In [None]:
# Obj(Xtrain, ytrain, w, C)
#
# Exercise stub (Question 1): should evaluate the regularized logistic-regression
# objective
#     f(w) = 1/2 * ||w||^2 + C * sum_i log(1 + exp(-y_i * w' * x_i))
# at the weight vector `w`, with regularization trade-off `C`.
# Presumably x_i is the i-th row of `Xtrain` and y_i the i-th entry of `ytrain`
# (confirm against the data-preparation cell).
# NOTE: the body is still empty, so until it is completed the function returns
# `nothing`.
function Obj(Xtrain, ytrain, w, C)
    #COMPLETE THE CODE
end;

## Question 2.
Derive an expression for the gradient $\nabla f(w)$.

## Question 3.
Complete the following code that evaluates the gradient.

In [None]:
# Grad(Xtrain, ytrain, w, C)
#
# Exercise stub (Question 3): should evaluate the gradient of the objective f at
# the weight vector `w`, taking the same arguments as `Obj`.
# The optimizers in this file take the `norm` of the returned value, so it is
# presumably a vector with one entry per component of `w`.
# NOTE: the body is still empty, so until it is completed the function returns
# `nothing`.
function Grad(Xtrain, ytrain, w, C)
    #COMPLETE THE CODE
end;

## Question 4

Complete the function `gradient_descent`.

In [None]:
"""
    gradient_descent(f, ∇f, w0, tol=1e-4, stepsize=1, maxIter=1000)

Skeleton of gradient descent with a backtracking line search, started at `w0`.

# Arguments
- `f`: objective, called as `f(w)`.
- `∇f`: gradient of the objective, called as `∇f(w)`.
- `w0`: starting point; it is copied, so the caller's array is not modified here.
- `tol`: relative tolerance; iteration stops once `norm(g) <= tol * norm(g0)`,
  where `g0` is the gradient at `w0`.
- `stepsize`: initial trial step, intended for the line search to be completed.
- `maxIter`: maximum number of iterations performed (at most `maxIter`).

# Returns
The tuple `(w, iter_list, obj_list)`: the final iterate, the iteration counters
`0, 1, 2, ...`, and the objective value recorded for each counter.

The line-search section is an exercise placeholder. Until it is filled in, `w`,
`g` and `obj_new` are never updated, so the loop only stops at the iteration
cap (or immediately when the initial gradient is zero).
"""
function gradient_descent(f, ∇f, w0, tol=1e-4, stepsize=1, maxIter=1000)

    # Initialization
    w = copy(w0)
    g = g0 = ∇f(w)
    norm_g0 = norm(g0)
    obj_new = pre_obj = f(w)
    iter_list = [0]
    obj_list = [pre_obj]

    # Gradient Descent Iteration
    iter = 0
    while true

        @printf "it = %3d | f = %10.2e | ||∇f||  = %10.2e\n" iter obj_new norm(g)

        # Stop if the norm of the gradient is small relative to its start value.
        # `<=` (not `<`) so that a start with zero gradient terminates at once;
        # `>=` (not `>`) so that at most maxIter iterations are performed.
        if norm(g) <= tol*norm_g0 || iter >= maxIter
            break
        end

        iter += 1


        #COMPLETE BACKTRACK LINE-SEARCH


        push!(iter_list, iter)
        push!(obj_list, obj_new)

    end
    return w, iter_list, obj_list
end;

## Question 5

Train the classifier using $C=1$. Report your test error and plot a figure where the x-axis is the number of iterations and the y-axis is the objective value.

In [None]:
# Run this cell only after Obj, Grad and gradient_descent have been completed.
# C is the weight on the logistic-loss term of the objective.
C = 1

# Wrappers that fix the training data and C, leaving the weight vector as the
# only argument the optimizer has to supply.
function f(x)
    return Obj(Xtrain, ytrain, x, C)
end
function ∇f(x)
    return Grad(Xtrain, ytrain, x, C)
end

# Start from the all-zero weight vector (one weight per feature column).
w0 = zeros(size(Xtrain, 2))
w, iter_list, obj_list = gradient_descent(f, ∇f, w0);

Print the prediction accuracy on the training and testing sets.

In [None]:
# The predicted label is the sign of the linear score X*w; accuracy is the
# fraction of predictions that equal the stored label.
ypred = sign.(Xtest * w)
train_hits = count(sign.(Xtrain * w) .== ytrain)
test_hits = count(ypred .== ytest)
@printf "prediction accuracy on training set: %10.2e \n" train_hits / length(ytrain)
@printf "prediction accuracy on testing  set: %10.2e \n" test_hits / length(ypred)

## Question 6

Derive an expression for the Hessian matrix $\nabla^2 f(w)$.

## Question 7
Complete the following code that evaluates the Hessian matrix.

In [None]:
# Hessian(Xtrain, ytrain, w, C)
#
# Exercise stub (Question 7): should evaluate the Hessian matrix of the
# objective f at the weight vector `w`, taking the same arguments as `Obj` and
# `Grad`.
# NOTE: the body is still empty, so until it is completed the function returns
# `nothing`.
function Hessian(Xtrain, ytrain, w, C)
    #COMPLETE THE CODE
end;

## Question 8

Complete the function `newton_descent`. Note that we use a fixed stepsize. Report your test error and plot a figure where the x-axis is the number of iterations and the y-axis is the objective value. Do the test error and objective value match the results from gradient descent?

In [None]:
"""
    newton_descent(f, ∇f, ∇²f, w0, tol=1e-4, α=1, maxIter=1000)

Skeleton of Newton's method with a fixed stepsize `α`, started at `w0`.

# Arguments
- `f`: objective, called as `f(w)`.
- `∇f`: gradient of the objective, called as `∇f(w)`.
- `∇²f`: Hessian of the objective, called as `∇²f(w)`.
- `w0`: starting point; it is copied, so the caller's array is not modified here.
- `tol`: relative tolerance; iteration stops once `norm(g) <= tol * norm(g0)`,
  where `g0` is the gradient at `w0`.
- `α`: fixed stepsize intended for the update rule to be completed.
- `maxIter`: maximum number of iterations performed (at most `maxIter`).

# Returns
The tuple `(w, iter_list, obj_list)`: the final iterate, the iteration counters
`0, 1, 2, ...`, and the objective value `f(w)` recorded for each counter.

The update rule is an exercise placeholder. Until it is filled in, `w`, `g` and
`H` are never updated, so the loop only stops at the iteration cap (or
immediately when the initial gradient is zero).
"""
function newton_descent(f, ∇f, ∇²f, w0, tol=1e-4, α=1, maxIter = 1000)

    #Initialization
    w = copy(w0)
    g = g0 = ∇f(w)
    H = ∇²f(w)
    norm_g0 = norm(g0)
    obj_value = f(w)
    iter_list = [0]
    obj_list = [obj_value]

    iter = 0
    while true

        @printf "it = %3d | f = %10.2e | ||∇f||  = %10.2e\n" iter obj_value norm(g)

        # Stop if the norm of the gradient is small relative to its start value.
        # `<=` (not `<`) so that a start with zero gradient terminates at once;
        # `>=` (not `>`) so that at most maxIter iterations are performed.
        if norm(g) <= tol*norm_g0 || iter >= maxIter
            break
        end

        iter += 1


        #COMPLETE THE UPDATE RULE


        # Re-evaluate the objective at the (to-be-updated) iterate for the log.
        obj_value = f(w)

        push!(iter_list, iter)
        push!(obj_list, obj_value)

    end

    return w, iter_list, obj_list
end;

In [None]:
# Run this cell only after Obj, Grad, Hessian and newton_descent have been
# completed. C is the weight on the logistic-loss term of the objective.
C = 1

# Wrappers that fix the training data and C, leaving the weight vector as the
# only argument the optimizer has to supply.
function f(x)
    return Obj(Xtrain, ytrain, x, C)
end
function ∇f(x)
    return Grad(Xtrain, ytrain, x, C)
end
function hessian(x)
    return Hessian(Xtrain, ytrain, x, C)
end

# Start from the all-zero weight vector (one weight per feature column).
w0 = zeros(size(Xtrain, 2))
w, iter_list, obj_list = newton_descent(f, ∇f, hessian, w0);

Print the prediction accuracy on the training and testing sets.

In [None]:
# The predicted label is the sign of the linear score X*w; accuracy is the
# fraction of predictions that equal the stored label.
ypred = sign.(Xtest * w);
train_hits = count(sign.(Xtrain * w) .== ytrain)
test_hits = count(ypred .== ytest)
@printf "prediction accuracy on training set: %10.2e \n" train_hits / length(ytrain)
@printf "prediction accuracy on testing  set: %10.2e \n" test_hits / length(ypred)