In [2]:
##Ridge regression 

import os
import matplotlib.pyplot as plt
import numpy as np
import scripts.implementations as lib  # Add personal library
import scripts.proj1_helpers as helper  # Add personal library

%matplotlib inline
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)

DATA_FOLDER = 'data'
DATA_TRAIN = os.path.join(DATA_FOLDER, 'train.csv')
DATA_TEST = os.path.join(DATA_FOLDER, 'test.csv')

y, x, ids, header = helper.load_csv_data(DATA_TRAIN)
y_train, x_train,  y_validation, x_validation = lib.sep_valid_train_data(x,y, 0.8);
print(y_train.shape)

(200000,)


In [3]:
x_train[x_train == -999] = np.nan
x_validation[x_validation == -999] = np.nan

In [4]:
for i, feature in enumerate(x_train.T):
    print('Feature {} - {} has range: [{:.4f}, {:.4f}]'.format(
        i+1, header[i], np.nanmin(feature), np.nanmax(feature)))

Feature 1 - DER_mass_MMC has range: [9.0440, 1192.0260]
Feature 2 - DER_mass_transverse_met_lep has range: [0.0000, 595.8190]
Feature 3 - DER_mass_vis has range: [6.4620, 1329.9130]
Feature 4 - DER_pt_h has range: [0.0000, 1053.8070]
Feature 5 - DER_deltaeta_jet_jet has range: [0.0000, 8.5030]
Feature 6 - DER_mass_jet_jet has range: [13.6020, 4974.9790]
Feature 7 - DER_prodeta_jet_jet has range: [-18.0660, 16.6900]
Feature 8 - DER_deltar_tau_lep has range: [0.2080, 5.6840]
Feature 9 - DER_pt_tot has range: [0.0000, 513.6590]
Feature 10 - DER_sum_pt has range: [46.1040, 1852.4620]
Feature 11 - DER_pt_ratio_lep_tau has range: [0.0470, 19.7730]
Feature 12 - DER_met_phi_centrality has range: [-1.4140, 1.4140]
Feature 13 - DER_lep_eta_centrality has range: [0.0000, 1.0000]
Feature 14 - PRI_tau_pt has range: [20.0000, 622.8620]
Feature 15 - PRI_tau_eta has range: [-2.4990, 2.4970]
Feature 16 - PRI_tau_phi has range: [-3.1420, 3.1420]
Feature 17 - PRI_lep_pt has range: [26.0000, 461.8960]
Fea

In [5]:
# Remove features with NaN
keep_id = np.nonzero(np.sum(np.isnan(x_train), axis=0) == 0)[0]
x_naive = x_train[:, keep_id]
# normalize features
x_naive = (x_naive - np.mean(x_naive, axis=0))/np.std(x_naive, axis=0)

keep_id_val = np.nonzero(np.sum(np.isnan(x_validation), axis=0) == 0)[0]
x_naive_val = x_validation[:, keep_id]
# normalize features
x_naive_val = (x_naive_val - np.mean(x_naive_val, axis=0))/np.std(x_naive_val, axis=0)

np.sum(np.isnan(x_naive_val))

0

In [None]:
from scripts.ml import cross_validation_ls

degrees = np.linspace(1, 6, 6).astype(int)
for i, degree in enumerate(degrees):
    acc, _, _ = cross_validation_ls(y_train, x_naive, degree=degree)
    print('{}/{} Least square deg {} with acc {:.4f}'.format(i+1, len(degrees), degree, acc))

In [7]:
def plot_train_test(train_errors, test_errors, lambdas, degree):
    """
    train_errors, test_errors and lambas should be list (of the same size) the respective train error and test error for a given lambda,
    * lambda[0] = 1
    * train_errors[0] = RMSE of a ridge regression on the train set
    * test_errors[0] = RMSE of the parameter found by ridge regression applied on the test set
    
    degree is just used for the title of the plot.
    """
    plt.semilogx(lambdas, train_errors, color='b', marker='*', label="Train error")
    plt.semilogx(lambdas, test_errors, color='r', marker='*', label="Test error")
    plt.xlabel("lambda")
    plt.ylabel("RMSE")
    plt.title("Ridge regression for polynomial degree " + str(degree))
    leg = plt.legend(loc=1, shadow=True)
    leg.draw_frame(False)
    plt.savefig("ridge_regression")
    
def test_ridge_regression(x, y, x_val, y_val, degrees, lambdas):
    
    best_acc = 0
    best_degree = 0
    best_lambda = 0
    best_rmse_tr = []
    best_rmse_te = []
    best_weights = []
    for degree in degrees:
        degree = int(degree)
        #lambdas = np.logspace(-7, 2, 20)

        # Split sets
        #x_train, x_test, y_train, y_test = split_data(x, y, ratio, seed)

        # Get ploynomial
        phi_train = lib.build_poly(x, degree)
        phi_test = lib.build_poly(x_val, degree)

        rmse_tr = []
        rmse_te = []
        update_rmse = False

        for ind, lambda_ in enumerate(lambdas):

            mse_tr, weights = lib.ridge_regression(y, phi_train, lambda_)
            mse_te = lib.compute_loss(y_val, phi_test.dot(weights))
            rmse_tr.append(np.sqrt(2*mse_tr))
            rmse_te.append(np.sqrt(2*mse_te))

            print("degree={d}, lambda={l:.3f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}".format(
                    d=degree, l=lambda_, tr=rmse_tr[ind], te=rmse_te[ind]))
            print('train acc : ', lib.accuracy(y, phi_train.dot(weights)))
            val_acc = lib.accuracy(y_val, phi_test.dot(weights))
            print('validation acc : ', val_acc)

            if(val_acc > best_acc):
                best_acc = val_acc
                best_degree = degree
                best_lambda = lambda_
                best_weights = weights
                update_rmse = True
        
        if(update_rmse):
            best_rmse_tr = rmse_tr
            best_rmse_te = rmse_te

        # Plot the best obtained results
    plot_train_test(best_rmse_tr, best_rmse_te, lambdas, best_degree)

    print('Best params for Ridge regression : degree = ',best_degree, ', lambda = ',best_lambda,', accuracy = ', best_acc)
    
    return best_weights, best_degree, best_lambda

In [None]:
weights_naive = test_ridge_regression(x_naive, y_train, x_naive_val, y_validation, degrees = np.linspace(1,5,5), lambdas=np.logspace(-7,2,20))

## Ridge with no_nan


In [11]:
def mad(x_feat):
    """ Median Absolute Deviation: a "Robust" version of standard deviation.
        Indices variabililty of the sample.
        https://en.wikipedia.org/wiki/Median_absolute_deviation 
    """
    mad_res = np.ones(x_feat.shape[1])
    for i, col in enumerate(x_feat.T):
        arr = np.ma.array(col).compressed() # should be faster to not use masked arrays.
        med = np.nanmedian(arr)
        mad_res[i] = np.nanmedian(np.abs(arr - med))

    return mad_res

# normalize features
#x_no_nan = x_train.copy()
#x_no_nan = (x_no_nan - np.nanmean(x_no_nan, axis=0))/np.nanstd(x_no_nan, axis=0)
#x_no_nan = np.nan_to_num(x_no_nan)
#print('\nStd:', np.std(x_no_nan, axis=0))

# normalize features
#x_no_nan_val = x_validation.copy()
#x_no_nan_val = (x_no_nan_val - np.nanmean(x_no_nan_val, axis=0))/np.nanstd(x_no_nan_val, axis=0)
#x_no_nan_val = np.nan_to_num(x_no_nan_val)
#print('\nStd:', np.std(x_no_nan_val, axis=0))


# normalize features
x_no_nan = x_train.copy()
x_no_nan = (x_no_nan - np.nanmedian(x_no_nan, axis=0))/mad(x_no_nan)
x_no_nan = np.nan_to_num(x_no_nan)
print('\nStd:', mad(x_no_nan))

# normalize features
x_no_nan_val = x_validation.copy()
x_no_nan_val = (x_no_nan_val - np.nanmedian(x_no_nan_val, axis=0))/mad(x_no_nan_val)
x_no_nan_val = np.nan_to_num(x_no_nan_val)
print('\nStd:', mad(x_no_nan_val))


Std: [ 0.8028  1.      1.      1.      0.      0.      0.      1.      1.      1.
  1.      1.      0.      1.      1.      1.      1.      1.      1.      1.
  1.      1.      1.      0.3496  0.3249  0.3397  0.      0.      0.      1.    ]

Std: [ 0.8057  1.      1.      1.      0.      0.      0.      1.      1.      1.
  1.      1.      0.      1.      1.      1.      1.      1.      1.      1.
  1.      1.      1.      0.3393  0.3182  0.3227  0.      0.      0.      1.    ]


In [21]:
weights_no_nan, degree_no_nan, lambda_no_nan = test_ridge_regression(
    x_no_nan, y_train, x_no_nan_val, y_validation, degrees = np.linspace(9,11,3), lambdas=np.logspace(-3,2,20))

degree=9, lambda=0.001, Training RMSE=0.766, Testing RMSE=80211885.892
train acc :  0.801085
validation acc :  0.8017
degree=9, lambda=0.002, Training RMSE=0.784, Testing RMSE=78531707.034
train acc :  0.786765
validation acc :  0.78888
degree=9, lambda=0.003, Training RMSE=0.807, Testing RMSE=117874341.983
train acc :  0.771325
validation acc :  0.7746
degree=9, lambda=0.006, Training RMSE=0.831, Testing RMSE=2323196.515
train acc :  0.757265
validation acc :  0.76094
degree=9, lambda=0.011, Training RMSE=0.853, Testing RMSE=145417003.670
train acc :  0.7459
validation acc :  0.75096
degree=9, lambda=0.021, Training RMSE=0.873, Testing RMSE=225176022.774
train acc :  0.736685
validation acc :  0.74058
degree=9, lambda=0.038, Training RMSE=0.887, Testing RMSE=237704816.384
train acc :  0.7311
validation acc :  0.73594
degree=9, lambda=0.070, Training RMSE=0.898, Testing RMSE=197508943.449
train acc :  0.729445
validation acc :  0.73438
degree=9, lambda=0.127, Training RMSE=0.910, Testi

KeyboardInterrupt: 

## Submission


In [13]:
y_test, x_test, ids_test, header = helper.load_csv_data(DATA_TEST)
x_test[x_test == -999] = np.nan

x_no_nan_test = x_test.copy()
x_no_nan_test = (x_no_nan_test - np.nanmedian(x_no_nan_test, axis=0))/mad(x_no_nan_test)
x_no_nan_test = np.nan_to_num(x_no_nan_test)
print('\nStd:', mad(x_no_nan_test))


Std: [ 0.806   1.      1.      1.      0.      0.      0.      1.      1.      1.
  1.      1.      0.      1.      1.      1.      1.      1.      1.      1.
  1.      1.      1.      0.3434  0.3219  0.3325  0.      0.      0.      1.    ]


In [16]:
degree_opt = degree_no_nan
weights_opt = weights_no_nan

_phi_test = lib.build_poly(x_no_nan_test, degree_opt)
y_pred = helper.predict_labels(weights_opt, _phi_test)

In [17]:
helper.create_csv_submission(ids_test, y_pred, 'ridge_no_nan1.csv')
print('Results saved ...')

Results saved ...


4.9366616989567822

In [15]:
weights_no_nan

array([ -2.5007e+00,   2.4089e-01,  -2.9851e-01,  -1.1887e-01,
         3.1534e-02,   8.3152e-02,  -1.9170e-01,  -9.6213e-02,
         1.3895e-01,  -8.3005e-03,  -1.5844e+01,  -6.4242e-02,
        -2.6528e-02,   1.8494e-01,   2.8304e+00,   1.0322e-02,
         4.8882e-03,   3.0208e+00,   2.3943e-05,  -7.7355e-03,
        -1.4934e-02,   1.7966e-03,   1.8156e-03,  -4.5326e+02,
         1.0046e-01,  -3.1350e-03,  -9.9632e-03,   7.8024e-02,
        -1.4509e-02,  -8.1302e-03,   1.2438e+01,  -7.7497e-02,
        -6.1699e-02,  -4.7711e-03,   3.5751e-02,   2.0785e-01,
         8.8383e-02,  -1.1613e-02,   2.5124e-02,  -2.4437e-02,
         1.2382e-02,   1.9290e-02,  -2.5409e-01,  -1.0890e-01,
        -1.1180e-02,   1.2220e-02,  -2.9676e-02,  -3.1725e-02,
        -1.7927e-02,   1.9874e-02,   9.2610e-03,   1.2987e-02,
        -3.1272e-02,   2.3862e+02,  -3.0227e-02,   5.7474e-02,
        -2.5388e-03,  -1.4036e-02,   1.0665e-01,  -5.2222e-02,
         4.1641e-02,  -8.8537e-03,   6.9729e-02,  -5.30