In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from plots import visualization
from implementations import *
from proj1_helpers import *
from helpers import equalize_predictions

%load_ext autoreload
%autoreload 2

In [None]:
from helpers_ex5 import sample_data, load_data 
# load data.
height, weight, gender = load_data()

# build sampled x and y.
seed = 1
y = np.expand_dims(gender, axis=1)
X = np.c_[height.reshape(-1), weight.reshape(-1)]
y, X = sample_data(y, X, seed, size_samples=500)
x, mean_x, std_x = standardize(X)
y.shape, x.shape

In [None]:
y_sub = y
x_sub = build_poly(x, 1)
y_sub.shape, x_sub.shape

## 1. Load boson data 

In [2]:
# just load the training dataset
data_path = "../dataset/train.csv"
y_loaded, x_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
y_loaded = y_loaded.reshape((-1, 1))
y_loaded.shape, x_loaded.shape

((250000, 1), (250000, 30))

## 2. Clean data

### clean_x

In [None]:
# decide the maximum correlation between the columns
corr = 0.7
# clean the input features
x_all, keptCols = clean_x(x_loaded, corr, subs_func=np.nanmean, bool_col=True)
y_all = y_loaded.copy()
y_all[y_all== -1] = 0

# extract a subsample for the training
subsample = 50000
indices = np.random.RandomState(seed = 6).permutation(y_all.shape[0]) # get always the same random array
x_sub, y_sub = x_all[indices[:subsample]], y_all[indices[:subsample]]

x_sub.shape, y_sub.shape, x_all.shape, y_all.shape, keptCols.shape

### clean_x2

In [None]:
# load the correlation matrix that was computed on the whole dataset
file_path = "corr_matrix.json"
obj_text = codecs.open(file_path, 'r', encoding='utf-8').read()
b_new = json.loads(obj_text)
corr_matrix_loaded = np.array(b_new)
corr_matrix_bool = np.abs(corr_matrix_loaded) > 0.85
ncols = 30
# i is surely correlated to itself, drop that (useless) information
for i in range(ncols):
    corr_matrix_bool[i][i] = False


# compute the mapping of correlations
corr = {} 
for i in range(ncols):
    c = np.where(corr_matrix_bool[i])[0].tolist()
    if len(c) > 0: # if it is not correlated to any other column then ignore it
        corr[i] = c
corr

In [None]:
y_all = y_loaded.copy()
y_all[y_all== -1] = 0

x_all = clean_x2(x_loaded, double=False)

subsample = 50000
indices = np.random.RandomState(seed = 6).permutation(y_all.shape[0]) # get always the same random array
x_sub, y_sub = x_all[indices[:subsample]], y_all[indices[:subsample]]
x_sub.shape, y_sub.shape

### clean_x3

In [None]:
np.sum(np.isnan(c))

In [None]:
np.sum(x_loaded[:, 0]==-999)

In [4]:
asd = x_loaded.copy()
asd = fill_with_nan_list(asd, nan_values=[0, -999])
col = 11
c = asd[:, col]
med = np.nanmedian(c)
print(med)
asd = np.insert(asd, -1, c*(c>=med), axis = 1)
asd[:, col] = c*(c<med)

-0.357


In [None]:
np.where(x_loaded[:, 3]>2500) #7343, 7343

In [8]:
a, _, _ = standardize(c*(c>=med))
x_all[:, 10]

array([ 1.27697269,  1.29205176,  1.29205176, ...,  0.55066413,
        1.29205176, -1.07452459])

In [42]:
y_all = y_loaded.copy()
y_all[y_all== -1] = 0

x_all, bool_cols = clean_x3(x_loaded)

x_all.shape, y_all.shape, bool_cols.shape
# 61

7 boolean columns have been created.
6 columns have been removed:  [9, 15, 18, 20, 22, 23]
0 -0.0331691795196 124999 125001
1 -0.0768266999163 124999 125001
2 -0.181979788603 124996 125004
3 -0.305090158949 125000 125000
4 0.0 32434 217566
5 0.0 23922 226078
6 0.0 42490 207510
7 0.151230950719 125000 125000
8 -0.295694700715 124999 125001
9 -0.186577169085 124943 125057
10 -0.190723506948 124989 125011
11 0.0 32114 217886
12 -0.308022848817 124995 125005
13 -0.00990625779749 124983 125017
14 -0.278460962776 124996 125004
15 -0.020148954398 124978 125022
16 -0.210223828937 124996 125004
17 -0.23761545611 124999 125001
18 0.0 75105 174895
19 0.0 74543 175457
20 0.0 25031 224969
21 0.0 36291 213709
22 0.0 36256 213744
23 -0.0331691795196 124999 125001
0 0.0 82316 167684
1 0.0 119315 130685
2 0.0 95838 154162
3 0.0 88788 161212
4 0.0 32434 217566
5 0.0 23922 226078
6 0.0 42490 207510
7 0.0759347970474 125000 125000
8 0.0 110272 139728
9 0.0 101234 148766
10 0.0 119052 130948
11 0.0 32114 2

((250000, 61), (250000, 1), (250000, 7))

In [44]:
# choose degree
degree = 5

# build poly from x_all
tx_all = build_poly(x_all, degree)
# then append the boolean columns
tx_all = np.hstack((bool_cols, tx_all))

# select a subset
subsample = 50000
indices = np.random.RandomState(seed = 6).permutation(y_all.shape[0]) # get always the same random array
tx_sub, y_sub = tx_all[indices[:subsample]], y_all[indices[:subsample]]

tx_all.shape, y_all.shape, tx_sub.shape, y_sub.shape

((250000, 313), (250000, 1), (50000, 313), (50000, 1))

## 2.5 Possibily load previously obtained weights

In [None]:
file_path = "../miscellanea/best_weights/logistic_regression_44_columns_degree8_keep_all_add_columns/weights"
obj_text = codecs.open(file_path, 'r', encoding='utf-8').read()
w = np.array(json.loads(obj_text))
w.shape

### 2.9 Choose the degree and set the gamma

In [None]:
# choose degree
degree = 7
tx_sub = build_poly(x_sub, degree)
tx_all = build_poly(x_all, degree)
tx_sub.shape, y_sub.shape

In [45]:
ncolumns = x_sub.shape[1]

# 50000 data, 14 columns, 999=mean, 0=mean
gamma = np.concatenate([
#      # gamma for constant and 1st degree 
#     np.ones(ncolumns+1)*1e-5,
#     # gammma 2nd degree
#     np.ones(ncolumns)*1e-6, 
#     # gamma for 3rd degree 
#     np.ones(ncolumns)*1e-7,
#     # gamma for 4th degree
#     np.ones(ncolumns)*1e-10,
#     # gamma for 5th degree
#     np.ones(ncolumns)*1e-12,
#     # gamma for 6th degree
#     np.ones(ncolumns)*1e-15,
#     # gamma for 7th degree 
#     np.ones(ncolumns)*1e-17,
#     # gamma for 8th degree 
#     np.ones(ncolumns)*1e-20,
#     # gamma for 9th degree 
#     np.ones(ncolumns)*1e-24,
#     # gamma for 10th degree 
#     np.ones(ncolumns)*1e-24,
#     # gamma for 11th degree 
#     np.ones(ncolumns)*1e-28,
])\
.reshape((-1, 1))*1
gamma.shape

NameError: name 'x_sub' is not defined

In [51]:
# for clean_x3
n_bool_cols = bool_cols.shape[1]
n_other_cols = x_all.shape[1]

# 50000 data, 14 columns, 999=mean, 0=mean
gamma = np.concatenate([
    # gamma for bool cols
    np.ones(n_bool_cols)*1e-5,
     # gamma for constant and 1st degree 
    np.ones(n_other_cols+1)*1e-5,
    # gammma 2nd degree
    np.ones(n_other_cols)*1e-6, 
    # gamma for 3rd degree 
    np.ones(n_other_cols)*1e-7,
    # gamma for 4th degree
    np.ones(n_other_cols)*1e-9,
    # gamma for 5th degree
    np.ones(n_other_cols)*1e-13,
#     # gamma for 6th degree
#     np.ones(n_other_cols)*1e-16,
#     # gamma for 7th degree 
#     np.ones(n_other_cols)*1e-19,
#     # gamma for 8th degree 
#     np.ones(n_other_cols)*1e-20,
#     # gamma for 9th degree 
#     np.ones(n_other_cols)*1e-24,
#     # gamma for 10th degree 
#     np.ones(n_other_cols)*1e-24,
#     # gamma for 11th degree 
#     np.ones(n_other_cols)*1e-28,
])\
.reshape((-1, 1))*0.05

gamma.shape

(313, 1)

## 3. Train the model

### Logistic Regression

In [18]:
def sigmoid(t):
    """apply sigmoid function on t."""
    return 1.0 / (1 + np.exp(-t))

In [19]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx @ w)
    loss = y.T @ (np.log(pred)) + (1 - y).T @ (np.log(1 - pred))
#     print((1 - y).T @ (np.log(1 - pred)))
    return np.squeeze(- loss) 

In [20]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    pred = sigmoid(tx @ w)
    
    grad = tx.T @ (pred - y)
    return grad

In [21]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descen using logistic regression.
    Return the loss and the updated w.
    """
    #loss = calculate_loss(y, tx, w) 
    loss = -1
    grad = calculate_gradient(y, tx, w)
    w -= gamma * grad
    return loss, w

In [53]:
def logistic_regression_gradient_descent_demo(y, tx, gamma_, initial_w = np.array([])):
    # init parameters
    max_iter = 5000
    threshold = 1e-8
    gamma = gamma_
    losses = []

    w = initial_w
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))
    
    highest_ratio = 0
    best_w = -1
    
    # start the logistic regression
    for iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        
        succ_ratio = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
        if succ_ratio > highest_ratio: # loss < lowest_loss
            #print(1-loss, "!!")
            highest_ratio = succ_ratio
            best_w = w
        
        if iter % 1000 == 0:
            gamma = gamma/2
            
        # log info
        if iter % 100 == 0:
            loss = calculate_loss(y, tx, w) 
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio))
            
#         # converge criterion
#         losses.append(loss)
#         if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
#             break
            
    succ_ratio = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    loss = calculate_loss(y, tx, w) 
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio))
    
    # visualization
    # visualization(y, x[:, 1:], mean_x, std_x, w, "classification_by_logistic_regression_gradient_descent")
    #print("loss={l}".format(l=calculate_loss(y, tx, w)))

    return best_w

In [23]:
y_sub.shape, tx_sub.shape

((50000, 1), (50000, 200))

In [54]:
# w, s_te, s_tr 
# for i in range(100):
w = logistic_regression_gradient_descent_demo(y_sub, tx_sub, gamma, w)



Current iteration=0, loss=nan, prediction=0.820396
Current iteration=100, loss=nan, prediction=0.820416
Current iteration=200, loss=nan, prediction=0.820424
Current iteration=300, loss=nan, prediction=0.820432
Current iteration=400, loss=nan, prediction=0.820444
Current iteration=500, loss=nan, prediction=0.820436
Current iteration=600, loss=nan, prediction=0.820428
Current iteration=700, loss=nan, prediction=0.820428
Current iteration=800, loss=nan, prediction=0.820444
Current iteration=900, loss=nan, prediction=0.820436
Current iteration=1000, loss=nan, prediction=0.82044
Current iteration=1100, loss=nan, prediction=0.82044
Current iteration=1200, loss=nan, prediction=0.820448
Current iteration=1300, loss=nan, prediction=0.820464
Current iteration=1400, loss=nan, prediction=0.820464
Current iteration=1500, loss=nan, prediction=0.820464
Current iteration=1600, loss=nan, prediction=0.82046
Current iteration=1700, loss=nan, prediction=0.820468
Current iteration=1800, loss=nan, predictio

In [55]:
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

0.820492

### logistic regression with cross validation

In [None]:
def log_reg_batch(y, tx, gamma_, initial_w = np.array([])):
    # init parameters
    max_iter = 10000
    threshold = 1e-8
    gamma = gamma_
    losses = []
    
    
    w = initial_w
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))

    lowest_loss = float('Inf')
    best_w = -1

    # start the logistic regression
    for iter in range(max_iter):
        k_curr = iter % k_sets
        from_ = k_curr*50000
        to_ = (k_curr+1)*50000
        
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y[from_:to_], tx[from_:to_], w, gamma)
        

#         succ_ratio = 1 - compute_loss(y_correct, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
        if loss < lowest_loss: # loss < lowest_loss
            lowest_loss = loss # loss
            best_w = w
            
        # log info
        if iter % 100 == 0:
            succ_ratio_test = compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio_test))
            
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
            
    succ_ratio_test = compute_loss(y, tx, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio_test))

    return best_w #, succ_ratio_test, succ_ratio_train

In [None]:
# w = logistic_regression_penalized_gradient_descent_demo(y_sub, tx_sub, gamma)# w)
w = log_reg_batch(y_all, tx_all, gamma, w)

### Store the current found weigths

In [None]:
file_path = "weigths"
json.dump(w.tolist(), codecs.open(file_path, 'w', encoding='utf-8'), separators=(',', ':'), sort_keys=True, indent=4)

### Penalized logistic regression

In [None]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """return the loss and gradient."""
    num_samples = y.shape[0]
    #loss = calculate_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    loss = -1
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
    return loss, gradient

In [None]:
def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.
    Return the loss and updated w.
    """
    loss, gradient = penalized_logistic_regression(y, tx, w, lambda_)
    w -= gamma * gradient
    return loss, w

In [None]:
def log_reg_pen_batch(y, tx, gamma_, initial_w = np.array([])):
    # init parameters
    max_iter = 1000
    lambda_ = 1
    threshold = 1e-18
    
    gamma = gamma_
    
#     seed = 2
#     k_sets = 5
#     k_indices = build_k_indices_(y, k_sets)
#     train=[]
#     test=[]
#     for i in range(k_sets):
#         tr, te = get_kth_set(y_all, tx_all, k_indices, i)
#         train.append(tr)
#         test.append(te)
    
    losses = []
    w = initial_w
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))

    highest_ratio = 0
    best_w = -1

    # start the logistic regression
    for iter in range(max_iter):
#         # at each iteration take the next set
#         shuffle_indices = np.random.permutation(np.arange(y.shape[0]))
#         y_train = y[shuffle_indices[:50000]]
#         tx_train = tx[shuffle_indices[:50000]]

        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y_train, tx_train, w, gamma, lambda_) # use test set which is smaller
        
        succ_ratio = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
        if succ_ratio > highest_ratio: # loss < lowest_loss
            #print(1-loss, "!!")
            highest_ratio = succ_ratio
            best_w = w
            
        # log info
        if iter % 100 == 0:
            loss = calculate_loss(y, tx, w) 
            succ_ratio = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    succ_ratio = compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    loss = calculate_loss(y, tx, w) 
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio))
    
    return best_w

In [None]:
def logistic_regression_penalized_gradient_descent_demo(y, tx, gamma_, initial_w = np.array([])):
    # init parameters
    max_iter = 10000
    lambda_ = 1
    threshold = 1e-18
    
    gamma = gamma_
    succ_ratio_test = -1
    losses = []
    w = initial_w
    if initial_w.size == 0:
        w = np.zeros((tx.shape[1], 1))

    lowest_loss = float('Inf')
    best_w = w
    # start the logistic regression
    for iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y, tx, w, gamma, lambda_)
        
        loss = 1-compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
        if loss < lowest_loss:
            print(1-loss, "!!")
            lowest_loss = loss
            best_w = w
            
        # log info
        if iter % 100 == 0:
            print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=loss, pred=succ_ratio_test))
        # converge criterion
        losses.append(loss)
#         if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
#             break

    loss = 1-compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)
    print("Current iteration={i}, loss={l}, prediction={pred}".format(i=iter, l=1-loss, pred=succ_ratio_test))
    
    return best_w

In [None]:
#w = logistic_regression_penalized_gradient_descent_demo(y_sub, tx_sub, gamma, w)
w = logistic_regression_penalized_gradient_descent_demo(y_sub, tx_sub, gamma, w)
# w = log_reg_pen_batch(y_all, tx_all, gamma)# w)

In [None]:
w = w_curr_best.copy()

In [None]:
w_curr_best = w.copy()

In [64]:
compute_loss(y_all, tx_all, w, costfunc=CostFunction.SUCCESS_RATIO)

0.820492

## 4. Create submit file

In [57]:
# load test dataset
data_path = "../dataset/test.csv"
y_te_loaded, x_te_loaded, ids_te = load_csv_data(data_path, sub_sample=False)
y_te_loaded.shape, x_te_loaded.shape

((568238,), (568238, 30))

##### clean_x2

In [None]:
# clean in the same way of the trained set
# x_te, kept_cols = clean_x(x_te_loaded, corr, subs_func=np.nanmean, bool_col=True)
x_te = clean_x2(x_te_loaded)
x_te.shape

In [None]:
# create the poly
# degree = 8
tx_te = build_poly(x_te, degree)

#### clean_x3

In [60]:
x_te, bool_cols_te = clean_x3(x_te_loaded)
x_te.shape, bool_cols_te.shape

7 boolean columns have been created.
6 columns have been removed:  [9, 15, 18, 20, 22, 23]
Dropped 87 equal columns


((568238, 61), (568238, 7))

In [61]:
# create the poly
tx_te = build_poly(x_te, degree) 
tx_te = np.hstack((bool_cols_te, tx_te))

#### Predict and create file

In [62]:
# predict
y_te_pred = predict_labels(w, tx_te)
y_te_pred.shape, (y_te_pred==-1).sum(), (y_te_pred==1).sum()
# ((568238, 1), 391856, 176382)

((568238, 1), 314794, 253444)

In [None]:
# # remove also the columns with "equal" distribution
# to_be_removed = np.where(np.isin(kept_cols, ["PRI_tau_phi", "PRI_lep_phi", "PRI_met_phi"])) # PRI_jet_num
# x_te = np.delete(x_te, to_be_removed, axis=1)
# x_te.shape

In [63]:
# store the predictions
create_csv_submission(ids_te, y_te_pred, "logistic_regression_clean_x3_degree5")