In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from plots import visualization
from implementations import *
from proj1_helpers import *
from helpers import equalize_predictions

%load_ext autoreload
%autoreload 2

In [2]:
from helpers_ex5 import sample_data, load_data

# Read the raw height / weight / gender arrays.
height, weight, gender = load_data()

# Labels become a column, the two features are placed side by side;
# sample_data is then called with a fixed seed and size_samples=500,
# and the sampled features are passed through standardize.
seed = 1
y = np.expand_dims(gender, axis=1)
X = np.column_stack((height.reshape(-1), weight.reshape(-1)))
y, X = sample_data(y, X, seed, size_samples=500)
x, mean_x, std_x = standardize(X)
y.shape, x.shape

((500, 1), (500, 2))

In [3]:
# Reuse the sampled labels unchanged and expand the standardized features with
# build_poly at degree 1 (the displayed shape shows one column more than x has).
y_sub = y
x_sub = build_poly(x, 1)
y_sub.shape, x_sub.shape

((500, 1), (500, 3))

## 1. Load boson data 

In [4]:
# just load the training dataset
data_path = "../dataset/train.csv"
# NOTE(review): the third return value is stored as ids_te although this is the
# training file; the test-loading cell further down reassigns ids_te.
y_loaded, x_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
# Turn the labels into an explicit column vector.
y_loaded = y_loaded.reshape((-1, 1))
y_loaded.shape, x_loaded.shape

((250000, 1), (250000, 30))

In [5]:
# Look up the position of one feature name, both among the columns kept by the
# cleaning step and among all column labels.
# NOTE(review): keptCols is only assigned by the "Clean data" cell below, so this
# cell raises NameError on a fresh top-to-bottom run (see the recorded output).
name = "PRI_jet_all_pt" #"PRI_jet_num"
np.where(keptCols == name), np.where(column_labels() == name)

NameError: name 'keptCols' is not defined

In [None]:
# Count the rows labelled +1 whose column 29 is exactly 0.
# A boolean row mask is used on purpose: y_loaded is an (N, 1) column, so
# np.where(y_loaded == +1) yields a (row indices, column indices) pair, and using
# that pair as the row index would also look up the all-zero column indices as
# rows, adding one copy of the row-0 comparison per match to the count.
(x_loaded[y_loaded[:, 0] == +1, 29] == 0).sum()

## 2. Clean data

In [None]:
# Maximum correlation tolerated between two columns.
corr = 0.7

# Clean the input features; clean_x also reports which columns it kept.
x_all, keptCols = clean_x(x_loaded, corr, subs_func=np.nanmean, bool_col=True)

# Work on a copy of the labels and replace every -1 by 0.
y_all = y_loaded.copy()
y_all[y_all == -1] = 0

# Draw a training subsample; the fixed RandomState seed yields the same
# permutation on every run.
subsample = 50000
indices = np.random.RandomState(seed=6).permutation(y_all.shape[0])
sub_rows = indices[:subsample]
x_sub, y_sub = x_all[sub_rows], y_all[sub_rows]

x_sub.shape, y_sub.shape, x_all.shape, y_all.shape, keptCols.shape

## 2.5 Possibly load previously obtained weights

In [None]:
# Load previously stored weights (a JSON-encoded nested list) into an array.
file_path = "../miscellanea/best_weights/logistic_regression_17_columns_degree9_bool_col3_mean/weights"
# The context manager closes the file handle as soon as the text has been read.
with open(file_path, 'r', encoding='utf-8') as weights_file:
    obj_text = weights_file.read()
w = np.array(json.loads(obj_text))
w.shape

### 2.9 Choose the degree and set the gamma

In [None]:
# choose degree
degree = 8
# Expand both the training subsample and the full cleaned set with the same
# degree so that weights trained on one can be evaluated on the other.
tx_sub = build_poly(x_sub, degree)
tx_all = build_poly(x_all, degree)
tx_sub.shape, y_sub.shape

In [None]:
ncolumns = x_sub.shape[1]

# 50000 data, 14 columns, 999=mean, 0=mean
# Step size shared by the constant term and the 1st-degree columns.
low_degree_rate = 1e-5
# Step sizes for the 2nd to the 8th degree, one entry per degree.
# (Values once tried for higher degrees: 9th 1e-24, 10th 1e-24, 11th 1e-28.)
higher_degree_rates = [1e-6, 1e-7, 1e-10, 1e-12, 1e-15, 1e-17, 1e-20]

# One block of identical step sizes per degree, stacked into a column vector
# and finally halved.
gamma_blocks = [np.ones(ncolumns + 1) * low_degree_rate]
gamma_blocks.extend(np.ones(ncolumns) * rate for rate in higher_degree_rates)
gamma = np.concatenate(gamma_blocks).reshape((-1, 1)) * 0.5
gamma.shape

## 3. Train the model

### Logistic Regression

In [None]:
def sigmoid(t):
    """Apply the logistic sigmoid 1 / (1 + exp(-t)) element-wise.

    The value is computed without overflow: the exponential is only ever taken
    of -|t|, so strongly negative inputs yield 0.0 instead of triggering an
    overflow warning.

    Args:
        t: scalar or array-like of real values.

    Returns:
        The sigmoid of ``t`` with the same shape as ``t`` (a NumPy scalar when
        ``t`` is a scalar).
    """
    t = np.asarray(t, dtype=float)
    # exp(-|t|) lies in [0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(t))
    result = np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Hand back a scalar rather than a 0-d array when a scalar was passed in.
    return result[()] if result.ndim == 0 else result

In [None]:
def calculate_loss(y, tx, w):
    """Compute the logistic-regression cost (negative log-likelihood).

    With z = tx @ w the cost -[y log s(z) + (1 - y) log(1 - s(z))] summed over
    the samples equals sum(log(1 + exp(z))) - y^T z. That form is evaluated
    with np.logaddexp, so saturated predictions no longer produce log(0) and
    the resulting inf / nan values.

    Args:
        y: labels in {0, 1}, one row per sample.
        tx: feature matrix, one row per sample.
        w: weight vector.

    Returns:
        The cost as a 0-d array / scalar.
    """
    z = tx @ w
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
    log_partition = np.sum(np.logaddexp(0, z))
    return np.squeeze(log_partition - y.T @ z)

In [None]:
def calculate_gradient(y, tx, w):
    """Gradient of the logistic loss: tx^T (sigmoid(tx w) - y)."""
    residual = sigmoid(tx @ w) - y
    return tx.T @ residual

In [None]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.

    Args:
        y: labels, one row per sample.
        tx: feature matrix, one row per sample.
        w: current weights; the array is left untouched.
        gamma: step size (a scalar, or an array multiplied element-wise with
            the gradient).

    Returns:
        (loss, w): ``loss`` is the placeholder -1 because the loss evaluation
        is disabled; ``w`` is a new array holding the updated weights.
    """
    #loss = calculate_loss(y, tx, w)
    loss = -1
    grad = calculate_gradient(y, tx, w)
    # Build a new array instead of updating in place: an in-place update would
    # silently change every other reference to the same array (the caller's
    # starting weights, a previously remembered "best" w, ...).
    w = w - gamma * grad
    return loss, w

In [None]:
def logistic_regression_gradient_descent_demo(y, tx, gamma_, initial_w = np.array([])):
    """Train logistic regression by gradient descent and keep the best weights.

    After every step the weights are scored on the notebook-level ``y_all`` /
    ``tx_all`` with ``CostFunction.SUCCESS_RATIO``; the weights with the
    highest ratio seen so far are remembered.

    Args:
        y: training labels, one row per sample.
        tx: training feature matrix, one row per sample.
        gamma_: step size (scalar, or an array multiplied element-wise with the
            gradient).
        initial_w: starting weights; an empty array means "start from zeros".
            The array passed in is not modified.

    Returns:
        The weights that reached the lowest (1 - success ratio).
    """
    # init parameters
    max_iter = 1000
    gamma = gamma_
    # Placeholder shown by the in-loop log line; the ratio is only stored after
    # the loop.
    succ_ratio_test = -1

    # Work on a private copy so that the caller's array is never modified.
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))
    else:
        w = initial_w.copy()

    lowest_loss = float('Inf')
    # Start from a real weight vector so that an array is always returned.
    best_w = w.copy()

    # start the logistic regression
    for n_iter in range(max_iter):
        # update w (the loss returned by the step helper is a placeholder).
        _, w = learning_by_gradient_descent(y, tx, w, gamma)

        loss = 1 - compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
        if loss < lowest_loss:
            print(1-loss, "!!")
            lowest_loss = loss
            # Snapshot: keeping a plain reference would make best_w follow any
            # later in-place change of w.
            best_w = w.copy()

        # log info
        if n_iter % 100 == 0:
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=1-loss, pred=succ_ratio_test))

    succ_ratio_test = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio_test))

    return best_w

In [None]:
# w, s_te, s_tr 
# Run 100 training rounds, each one warm-started from the weights returned by
# the previous round.
# NOTE(review): w must already exist before the first round (for example from
# the optional weight-loading cell); otherwise this raises NameError.
for i in range(100):
    w = logistic_regression_gradient_descent_demo(y_sub, tx_sub, gamma, w)

In [None]:
# bruteforce: try every (degree, scalar step size) pair and record the good ones.
# NOTE(review): y_ and x_ are not defined anywhere in the visible notebook -
# presumably a label / feature subsample; define them before running this cell.
# NOTE(review): the training function scores on the notebook-level tx_all, which
# is built with the notebook-level degree - confirm it matches the trial degree.
y_correct = y_.copy()
y_correct[y_correct== 0] = -1

gammas = np.logspace(-20, -15, 10)
bests = []
# The loop variables carry a trial_ prefix so that the notebook-level `degree`,
# `gamma` and `w` (used by the later evaluation and submission cells) are not
# overwritten by the search.
for trial_degree in range(4, 18):
    tx_ = build_poly(x_, trial_degree)
    for trial_gamma in gammas:
        trial_w = logistic_regression_gradient_descent_demo(y_, tx_, trial_gamma)
        suc_ratio = compute_loss(y_correct, tx_, trial_w, costfunc=CostFunction.SUCCESS_RATIO)
        if suc_ratio > 0.75:
            print("degree:", trial_degree, ", gamma:", trial_gamma, ", weigths:", trial_w, ", succ_ratio:", suc_ratio)
            bests.append({"degree":trial_degree, "gamma": trial_gamma, "weigths": trial_w, "succ_ratio": suc_ratio})

In [None]:
# for index, i in enumerate(bests):
#     if i["succ_ratio"] > 0.78:
#         print(index)
# bests[4]

In [None]:
# Score the current weights on the full cleaned set with the SUCCESS_RATIO cost function.
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

### logistic regression with cross validation

In [None]:
def log_reg_batch(y, tx, gamma_, initial_w = np.array([])):
    """Logistic regression by gradient descent over rotating slices of the data.

    The rows are split into ``k_sets`` contiguous slices and every iteration
    takes one gradient step on the next slice (round robin).

    Args:
        y: labels, one row per sample.
        tx: feature matrix, one row per sample.
        gamma_: step size (scalar, or an array multiplied element-wise with the
            gradient).
        initial_w: starting weights; an empty array means "start from zeros".
            The array passed in is not modified.

    Returns:
        The weights that reached the lowest slice loss.
    """
    # init parameters
    max_iter = 10000
    threshold = 1e-8
    gamma = gamma_
    losses = []
    k_sets = 5
    # Ceiling division so that the k_sets slices cover every row
    # (50000 rows per slice for 250000 rows).
    batch_size = -(-y.shape[0] // k_sets)

    # Work on a private copy so that the caller's array is never modified.
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))
    else:
        w = initial_w.copy()

    lowest_loss = float('Inf')
    best_w = w.copy()

    # start the logistic regression
    for n_iter in range(max_iter):
        k_curr = n_iter % k_sets
        from_ = k_curr * batch_size
        to_ = from_ + batch_size
        y_batch, tx_batch = y[from_:to_], tx[from_:to_]

        # update w. The step helper only returns a placeholder loss (-1); using
        # it for the convergence test would stop training after two iterations,
        # so the real cost of the current slice is computed here.
        _, w = learning_by_gradient_descent(y_batch, tx_batch, w, gamma)
        loss = calculate_loss(y_batch, tx_batch, w)

        if loss < lowest_loss:
            lowest_loss = loss
            # Snapshot, not a reference, so later updates cannot change it.
            best_w = w.copy()

        # log info
        if n_iter % 100 == 0:
            succ_ratio_test = compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio_test))

        # converge criterion: compare with the loss of the same slice one full
        # pass earlier (losses of different slices are not comparable).
        losses.append(loss)
        if len(losses) > k_sets and np.abs(losses[-1] - losses[-1 - k_sets]) < threshold:
            break

    succ_ratio_test = compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio_test))

    return best_w

In [None]:
# w = logistic_regression_penalized_gradient_descent_demo(y_sub, tx_sub, gamma)# w)
# Train on the full cleaned set; no initial weights are passed (the warm-start
# argument is commented out), so training starts from zeros.
w = log_reg_batch(y_all, tx_all, gamma)#, w)

### Store the currently found weights

In [None]:
# Write the current weights to disk as JSON.
file_path = "weigths"
# The context manager flushes and closes the file even if the dump fails;
# passing an unclosed handle straight to json.dump left that to garbage collection.
with open(file_path, 'w', encoding='utf-8') as weights_file:
    json.dump(w.tolist(), weights_file, separators=(',', ':'), sort_keys=True, indent=4)

### Penalized logistic regression

In [None]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """Return a placeholder loss and the gradient of the L2-penalized logistic loss.

    Args:
        y: labels, one row per sample.
        tx: feature matrix, one row per sample.
        w: current weights.
        lambda_: weight of the penalty term.

    Returns:
        (loss, gradient): ``loss`` is always -1 because its evaluation is
        disabled (see the commented line); ``gradient`` is the logistic
        gradient plus ``2 * lambda_ * w``.
    """
    #loss = calculate_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    loss = -1
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
    return loss, gradient

In [None]:
def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.

    Args:
        y: labels, one row per sample.
        tx: feature matrix, one row per sample.
        w: current weights; the array is left untouched.
        gamma: step size (a scalar, or an array multiplied element-wise with
            the gradient).
        lambda_: weight of the penalty term.

    Returns:
        (loss, w): the loss reported by ``penalized_logistic_regression`` and a
        new array holding the updated weights.
    """
    loss, gradient = penalized_logistic_regression(y, tx, w, lambda_)
    # Build a new array instead of updating in place, so that other references
    # to the incoming weights (e.g. a remembered "best" w) keep their values.
    w = w - gamma * gradient
    return loss, w

In [None]:
def log_reg_pen_batch(y, tx, gamma_, initial_w = np.array([])):
    """Penalized logistic regression trained on random mini-batches.

    Every iteration draws a fresh random permutation of the rows, takes the
    first ``batch_size`` of them and performs one penalized gradient step.
    Progress is logged against the notebook-level ``y_all`` / ``tx_all``.

    Args:
        y: labels, one row per sample.
        tx: feature matrix, one row per sample.
        gamma_: step size (scalar, or an array multiplied element-wise with the
            gradient).
        initial_w: starting weights; an empty array means "start from zeros".
            The array passed in is not modified.

    Returns:
        The weights that reached the lowest penalized mini-batch loss.
    """
    # init parameters
    max_iter = 1000
    lambda_ = 1
    threshold = 1e-18
    batch_size = 50000

    gamma = gamma_

    losses = []
    # Work on a private copy so that the caller's array is never modified.
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))
    else:
        w = initial_w.copy()

    lowest_loss = float('Inf')
    best_w = w.copy()
    # start the logistic regression
    for n_iter in range(max_iter):
        # at each iteration take a new random batch
        shuffle_indices = np.random.permutation(np.arange(y.shape[0]))
        batch_rows = shuffle_indices[:batch_size]
        y_train = y[batch_rows]
        tx_train = tx[batch_rows]

        # update w. The step helper only reports a placeholder loss (-1); using
        # it for the convergence test would stop training after two iterations,
        # so the penalized cost of the batch is evaluated here.
        _, w = learning_by_penalized_gradient(y_train, tx_train, w, gamma, lambda_)
        loss = calculate_loss(y_train, tx_train, w) + lambda_ * np.squeeze(w.T @ w)

        if loss < lowest_loss:
            lowest_loss = loss
            # Snapshot, not a reference, so later updates cannot change it.
            best_w = w.copy()

        # log info
        if n_iter % 100 == 0:
            succ_ratio_test = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio_test))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    succ_ratio_test = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=loss, pred=succ_ratio_test))

    return best_w

In [None]:
def logistic_regression_penalized_gradient_descent_demo(y, tx, gamma_, initial_w = np.array([])):
    """Train penalized logistic regression and keep the best-scoring weights.

    After every step the weights are scored on the notebook-level ``y_all`` /
    ``tx_all`` with ``CostFunction.SUCCESS_RATIO``; training stops early once
    two consecutive scores differ by less than ``threshold``.

    Args:
        y: training labels, one row per sample.
        tx: training feature matrix, one row per sample.
        gamma_: step size (scalar, or an array multiplied element-wise with the
            gradient).
        initial_w: starting weights; an empty array means "start from zeros".
            The array passed in is not modified.

    Returns:
        The weights that reached the lowest (1 - success ratio).
    """
    # init parameters
    max_iter = 1000
    lambda_ = 1
    threshold = 1e-18

    gamma = gamma_
    # Placeholder printed by the log lines; it is never updated in this function.
    succ_ratio_test = -1
    losses = []
    # Work on a private copy so that the caller's array is never modified.
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))
    else:
        w = initial_w.copy()

    lowest_loss = float('Inf')
    best_w = w.copy()
    # start the logistic regression
    for n_iter in range(max_iter):
        # update w (the loss returned by the step helper is a placeholder).
        _, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)

        loss = 1-compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
        if loss < lowest_loss:
            print(1-loss, "!!")
            lowest_loss = loss
            # Snapshot: keeping a plain reference would make best_w follow any
            # later in-place change of w.
            best_w = w.copy()

        # log info
        if n_iter % 100 == 0:
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=1-loss, pred=succ_ratio_test))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    loss = 1-compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=n_iter, l=1-loss, pred=succ_ratio_test))

    return best_w

In [None]:
#w = logistic_regression_penalized_gradient_descent_demo(y_sub, tx_sub, gamma, w)
# Warm-start from the current w and train on rows 50000..99999 of the full cleaned set.
w = logistic_regression_penalized_gradient_descent_demo(y_all[50000:100000], tx_all[50000:100000, :], gamma, w)
# w = log_reg_pen_batch(y_all, tx_all, gamma)# w)

In [None]:
# Restore the weights from the saved snapshot.
# NOTE(review): w_curr_best is only assigned in the next cell, so this cell
# raises NameError on a fresh top-to-bottom run.
w = w_curr_best.copy()

In [None]:
# Snapshot the current weights (a copy, so further training cannot alter it).
w_curr_best = w.copy()

In [None]:
# Score the current weights on the full cleaned set with the SUCCESS_RATIO cost function.
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

## 4. Create submit file

In [None]:
# load test dataset
data_path = "../dataset/test.csv"
# ids_te is reassigned here (it was first set when the training file was loaded)
# and is the value later handed to create_csv_submission.
y_te_loaded, x_te_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
y_te_loaded.shape, x_te_loaded.shape

In [None]:
# clean in the same way of the trained set
x_te, kept_cols = clean_x(x_te_loaded, corr, subs_func=np.nanmean, bool_col=True)
# The trained weights are tied to the columns kept for the training set, so fail
# loudly if cleaning the test set kept a different selection instead of
# predicting with misaligned features.
assert np.array_equal(kept_cols, keptCols), "test-set columns differ from the training columns"
x_te.shape

In [None]:
# # remove also the columns with "equal" distribution
# to_be_removed = np.where(np.isin(kept_cols, ["PRI_tau_phi", "PRI_lep_phi", "PRI_met_phi"])) # PRI_jet_num
# x_te = np.delete(x_te, to_be_removed, axis=1)
# x_te.shape

In [None]:
# Expand the cleaned test features with the notebook-level degree
# (the local override "degree = 8" stays disabled).
tx_te = build_poly(x_te, degree)

# Predict and report the output shape plus how many -1 and +1 labels were produced.
y_te_pred = predict_labels(w, tx_te)
n_negative = (y_te_pred == -1).sum()
n_positive = (y_te_pred == 1).sum()
y_te_pred.shape, n_negative, n_positive
# ((568238, 1), 391856, 176382)

In [None]:
# store the predictions
# Uses the ids loaded together with the test file and the labels predicted above;
# the last argument is the name handed to create_csv_submission.
create_csv_submission(ids_te, y_te_pred, "logistic_regression_pen_18_columns_degree8_bool_col3_mean_PRI_jet_all_pt")