In [None]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from plots import visualization
from implementations import *
from implementations import compute_loss
from proj1_helpers import *
from helpers import equalize_predictions

%load_ext autoreload
%autoreload 2

In [None]:
from helpers_ex5 import load_data, sample_data

# Toy data set: one height, weight and gender entry per person.
height, weight, gender = load_data()

# Labels become a column vector; height and weight are flattened and placed
# side by side as the two feature columns.
seed = 1
y = np.expand_dims(gender, 1)
X = np.c_[height.reshape(-1), weight.reshape(-1)]

# Keep 500 sampled rows (driven by `seed`), then standardize the features.
y, X = sample_data(y, X, seed, size_samples=500)
x, mean_x, std_x = standardize(X)
y.shape, x.shape

In [None]:
# Degree-1 expansion of the toy features; the labels are reused unchanged.
x_sub = build_poly(x, 1)
y_sub = y
y_sub.shape, x_sub.shape

## Load and clean the boson data

In [None]:
def clean(x_, y_):
    """Pre-process a raw feature matrix; the labels pass through untouched.

    Steps, in order:
      1. `drop_corr_columns(x_, 0.7)` and report how many columns were dropped;
      2. `fill_with_nan_list` with the values 0 and -999;
      3. `standardize`;
      4. `sustitute_nans`, using the NaN-aware mean of each column.

    No polynomial expansion is applied here; callers do that afterwards.

    NOTE(review): every statistic is recomputed from the `x_` passed to this
    call, so two different inputs (e.g. train and test) are presumably not
    guaranteed to keep the same columns or scaling -- confirm.

    Returns:
        tuple: (cleaned features, `y_` exactly as received).
    """
    features, ndropped = drop_corr_columns(x_, 0.7)
    print(ndropped, "columns have been dropped")

    features = fill_with_nan_list(features, nan_values=[0, -999])
    # Only the standardized matrix is needed; the returned mean/std are unused.
    features, _, _ = standardize(features)

    column_means = np.nanmean(features, axis=0)
    tx_ = sustitute_nans(features, substitutions=column_means)
    return tx_, y_


In [None]:
# Read the first `ndata` rows of the boson training set.
data_path = "../dataset/train.csv"
ndata = 250000
y_loaded, x_loaded, col_labels = load_partial_csv_data(data_path, ndata)

# Labels as a column vector, with -1 re-encoded as 0 so that the logistic
# loss below sees 0/1 targets.
y_loaded = np.reshape(y_loaded, (-1, 1))
y_loaded[y_loaded == -1] = 0
y_loaded.shape, x_loaded.shape

In [None]:
# Random, reproducible subsample used for fast experiments.
# The permutation used to be computed and then ignored, so the "subsample" was
# simply the first rows of the file; it is now actually applied, with a fixed
# seed so that Restart & Run All gives the same rows every time.
subsample = 20000
rng = np.random.RandomState(1)
indices = rng.permutation(y_loaded.shape[0])[:subsample]
# Integer-array indexing returns copies, so nothing done to the subsample can
# leak back into x_loaded / y_loaded.
xsub_, ysub_ = x_loaded[indices], y_loaded[indices]
x_, y_ = clean(xsub_, ysub_)
# Defensive: keep the 0/1 label encoding even if the labels were not yet converted.
y_[y_ == -1] = 0

# Same cleaning applied to every loaded row (used later for scoring).
x200_, y200_ = clean(x_loaded, y_loaded)

x_.shape, y_.shape, x200_.shape, y200_.shape

In [None]:
# Sanity check of the cleaned subsample. The polynomial features `tx_` are only
# built in a later cell, so referencing them here fails on a fresh kernel.
y_.shape, x_.shape

## Logistic Regression

Compute your cost by negative log likelihood.

In [None]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise.

    Uses a numerically stable form: only exp(-|t|) is ever evaluated, so large
    negative inputs no longer overflow (and emit a RuntimeWarning) the way the
    naive expression does.

    Args:
        t: scalar or array-like of real numbers.

    Returns:
        numpy.ndarray of floats with the shape of `t` (0-d for a scalar input),
        with values in [0, 1].
    """
    t = np.asarray(t, dtype=float)
    # exp(-|t|) lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(t))
    # t >= 0: 1 / (1 + e^-t);  t < 0: e^t / (1 + e^t) -- the same function.
    return np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

In [None]:
def calculate_loss(y, tx, w):
    """Compute the logistic-regression cost (negative log-likelihood).

    With z = tx @ w the cost is sum(log(1 + exp(z))) - y^T z, which is
    algebraically identical to
    -[y^T log(sigmoid(z)) + (1 - y)^T log(1 - sigmoid(z))]
    but never evaluates log(0), so it stays finite when predictions saturate
    instead of turning into inf or NaN.

    Args:
        y: labels in {0, 1}, one per row of `tx`.
        tx: feature matrix, one sample per row.
        w: weight vector, one entry per column of `tx`.

    Returns:
        The summed (not averaged) loss, squeezed to a 0-d value.
    """
    z = tx @ w
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), computed stably.
    loss = np.sum(np.logaddexp(0, z)) - y.T @ z
    return np.squeeze(loss)

In [None]:
def calculate_gradient(y, tx, w):
    """Gradient of the logistic loss with respect to `w`.

    Evaluates tx^T (sigmoid(tx w) - y); like the loss, it is summed over the
    samples rather than averaged.
    """
    residual = sigmoid(tx @ w) - y
    return tx.T @ residual

### Using Gradient Descent
Implement your function to calculate the gradient for logistic regression.

In [None]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """Do one step of gradient descent using logistic regression.

    Args:
        y: labels in {0, 1}.
        tx: feature matrix.
        w: current weights; left untouched by this call.
        gamma: step size (a scalar, or an array that broadcasts against the
            gradient for per-weight step sizes).

    Returns:
        (loss, w_new): the loss evaluated at the *incoming* `w`, and the
        weights after the step.
    """
    loss = calculate_loss(y, tx, w)
    grad = calculate_gradient(y, tx, w)
    # Build a new array instead of `w -= ...`: the in-place form silently
    # rewrote the caller's array, so any reference kept to earlier weights
    # (e.g. a "best so far" snapshot) changed along with it.
    return loss, w - gamma * grad

Demo!

In [None]:
def logistic_regression_gradient_descent_demo(y, tx, gamma_, max_iter=1500, threshold=1e-8):
    """Train logistic regression with plain gradient descent.

    Args:
        y: labels in {0, 1}, as a column vector.
        tx: feature matrix, one sample per row.
        gamma_: step size handed to every gradient step.
        max_iter: maximum number of gradient steps.
        threshold: stop once two consecutive losses differ by less than this.

    Returns:
        A copy of the weights stored when the lowest loss was seen. If the
        success ratio logged every 250 iterations falls below 0.15, the
        current weights are returned immediately instead.

    Note:
        The loss reported by a step is evaluated before that step's update, so
        the stored weights are one update ahead of the loss they are paired with.
    """
    gamma = gamma_
    losses = []

    # The success-ratio metric is fed -1/1 labels instead of 0/1. Derive them
    # from the `y` argument, not from the notebook-level `y_`.
    y_correct = y.copy()
    y_correct[y_correct == 0] = -1

    w = np.zeros((tx.shape[1], 1))

    lowest_loss = float('Inf')
    # Start from a real weight vector so a non-array sentinel is never returned.
    best_w = w.copy()

    for n_iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)

        if loss < lowest_loss:
            lowest_loss = loss
            # Snapshot: without the copy, `best_w` would alias `w` and follow
            # every later update whenever the step function works in place.
            best_w = w.copy()

        # log info
        if n_iter % 250 == 0:
            succ_ratio = compute_loss(y_correct, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio))
            # Give up early on a run that is clearly going nowhere.
            if succ_ratio < 0.15:
                return w

        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    return best_w

In [None]:
# Choose the polynomial degree and build the expanded feature matrix `tx_`
# from the cleaned subsample; later cells train on `tx_` and reuse `degree`.
degree = 2
tx_ = build_poly(x_, degree)
tx_.shape, y_.shape

In [None]:
ncolumns = x_.shape[1]

# Step size. `gamma` used to be defined only inside commented-out code, so this
# cell raised NameError on a fresh kernel.
# Per-degree step sizes found earlier on a 5000-row subsample, for reference:
#   constant, 1st and 2nd degree: 1e-4 | 3rd: 1e-5 | 4th: 1e-7 | 5th: 1e-10
#   (tried and disabled: 6th 0.6e-11, 7th 0.3e-14, 8th 1e-15, 9th 1e-18, 10th 1e-20)
# A per-weight column vector of step sizes also works, since it is multiplied
# element-wise with the gradient.
# NOTE(review): 1e-4 is the value listed for degree <= 2 on 5000 rows; the
# gradient is a sum over samples, so re-tune if the loss diverges on more data.
gamma = 1e-4

w = logistic_regression_gradient_descent_demo(y_, tx_, gamma)

In [None]:
# Brute-force search over polynomial degree and step size.
# The loop uses its own names (`degree_try`, `tx_try`, `gamma_try`, `w_try`) so
# it no longer overwrites the notebook-level `degree`, `tx_`, `gamma` and `w`
# that later cells read.
y_correct = y_.copy()
y_correct[y_correct == 0] = -1

# Best run seen so far; these were initialised but never updated before.
best_ratio = 0
best_w = -1
gammas = np.logspace(-20, -15, 10)
# Every run whose success ratio exceeds 0.75.
bests = []
for degree_try in range(4, 18):
    tx_try = build_poly(x_, degree_try)
    for gamma_try in gammas:
        w_try = logistic_regression_gradient_descent_demo(y_, tx_try, gamma_try)
        suc_ratio = compute_loss(y_correct, tx_try, w_try, costfunc=CostFunction.SUCCESS_RATIO)
        if suc_ratio > best_ratio:
            best_ratio, best_w = suc_ratio, w_try
        if suc_ratio > 0.75:
            print("degree:", degree_try, ", gamma:", gamma_try, ", weigths:", w_try, ", succ_ratio:", suc_ratio)
            bests.append({"degree": degree_try, "gamma": gamma_try, "weigths": w_try, "succ_ratio": suc_ratio})

In [None]:
# for index, i in enumerate(bests):
#     if i["succ_ratio"] > 0.78:
#         print(index)
# bests[4]

In [None]:
# Score the current weights on the full cleaned data set.
# The success-ratio metric is fed -1/1 labels everywhere else in this notebook,
# so the 0/1 labels are re-encoded on a copy first; that conversion had been
# commented out here.
y_s_ = y200_.copy()
y_s_[y_s_ == 0] = -1
tx200_ = build_poly(x200_, degree)

compute_loss(y_s_, tx200_, w, costfunc=CostFunction.SUCCESS_RATIO)

In [None]:
# Inspect the learned weights: skip the first entry (presumably the constant
# term -- confirm against build_poly) and group the rest in chunks of
# `ncolumns`, i.e. one chunk per polynomial degree if build_poly orders its
# columns that way.
ncolumns = x_.shape[1]
len_ = len(w)
wei = np.array([w[i:i+ncolumns] for i in range(1, len_, ncolumns)])
# The `wei < 0.001` mask used to be evaluated and thrown away; display it too.
w, wei, (wei < 0.001)

### Using penalized logistic regression
Fill in the function below.

In [None]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """Return the L2-penalized logistic loss and its gradient.

    loss     = calculate_loss(y, tx, w)     + lambda_ * w^T w
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w

    The penalty covers every entry of `w`, including the one that multiplies
    the first feature column.

    Args:
        y: labels in {0, 1}.
        tx: feature matrix.
        w: weight column vector.
        lambda_: regularisation strength.

    Returns:
        (loss, gradient)
    """
    penalty = lambda_ * np.squeeze(w.T.dot(w))
    loss = calculate_loss(y, tx, w) + penalty
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
    return loss, gradient

In [None]:
def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """Do one step of gradient descent using the penalized logistic regression.

    Args:
        y: labels in {0, 1}.
        tx: feature matrix.
        w: current weights; left untouched by this call.
        gamma: step size.
        lambda_: regularisation strength.

    Returns:
        (loss, w_new): the penalized loss evaluated at the *incoming* `w`, and
        the weights after the step.
    """
    loss, gradient = penalized_logistic_regression(y, tx, w, lambda_)
    # Out-of-place update: `w -= ...` rewrote the caller's array, which made
    # any reference kept to earlier weights follow the latest ones.
    return loss, w - gamma * gradient

In [None]:
def logistic_regression_penalized_gradient_descent_demo(y, tx, max_iter=1000, lambda_=0.1,
                                                        threshold=1e-18, gamma=1e-7):
    """Train L2-penalized logistic regression with gradient descent.

    Args:
        y: labels in {0, 1}, as a column vector.
        tx: feature matrix, one sample per row.
        max_iter: maximum number of gradient steps.
        lambda_: regularisation strength.
        threshold: stop once two consecutive losses differ by less than this.
        gamma: step size.

    Returns:
        A copy of the weights stored when the lowest penalized loss was seen.

    Note:
        The loss reported by a step is evaluated before that step's update, so
        the stored weights are one update ahead of the loss they are paired with.
    """
    losses = []

    # The success-ratio metric is fed -1/1 labels elsewhere in this notebook;
    # use the same encoding for the progress log instead of the raw 0/1 labels.
    y_correct = y.copy()
    y_correct[y_correct == 0] = -1

    w = np.zeros((tx.shape[1], 1))

    lowest_loss = float('Inf')
    # Start from a real weight vector so a non-array sentinel is never returned.
    best_w = w.copy()

    for n_iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)

        if loss < lowest_loss:
            lowest_loss = loss
            # Snapshot: without the copy, `best_w` would alias `w` and follow
            # every later update whenever the step function works in place.
            best_w = w.copy()

        # log info
        if n_iter % 100 == 0:
            succ_ratio = compute_loss(y_correct, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio))

        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    # Unpenalized loss at the final weights (not necessarily the returned ones).
    print("loss={l}".format(l=calculate_loss(y, tx, w)))

    return best_w
    
# Train the penalized model on the current `tx_`; this replaces the
# notebook-level `w` that the scoring and submission cells below read.
w = logistic_regression_penalized_gradient_descent_demo(y_, tx_)

In [None]:
# Success ratio of the penalized model on the training subsample, computed
# against a -1/1 copy of the 0/1 labels.
y_sub_ = np.where(y_ == 0, -1, y_)
compute_loss(y_sub_, tx_, w, costfunc=CostFunction.SUCCESS_RATIO)

In [None]:
# Shapes of the training labels and of the expanded training features.
y_.shape, tx_.shape

## Create the submission file

In [None]:
data_path = "../dataset/test.csv"
yb, input_data, ids = load_csv_data(data_path, sub_sample=False)

# Same cleaning as the training data, followed by the same polynomial
# expansion: `w` was trained on build_poly(x_, degree), so the cleaned test
# matrix on its own does not have a matching number of columns.
test_x, _ = clean(input_data, yb)
test_tx = build_poly(test_x, degree)

# NOTE(review): clean() chooses which columns to drop from the data it is
# given, so the test set may not end up with the training columns; fail loudly
# here rather than inside predict_labels.
assert test_tx.shape[1] == w.shape[0], (
    "test features have {} columns but w has {} weights".format(test_tx.shape[1], w.shape[0])
)

create_csv_submission(ids, predict_labels(w, test_tx), "trial")

In [None]:
# Compare the weight and test-feature shapes (presumably the number of weights
# must equal the number of test columns for predict_labels -- confirm).
w.shape, test_tx.shape