In [1]:
from time import perf_counter

import numpy as np

from utils_predictions_manipulation import*
from utils_nans_manipulation import*
from cross_validation import*
from utils_data_loading import*
from utils_features_manipulation import*

# Model testing on training data

## Loading and cleaning data

### Loading training data

In [11]:
# Read the training CSV. load_data returns two values; only the first is kept,
# the second is discarded via `_`.
traindata,_ = load_data('Data/train.csv')

### Format training data, add fake feature

In [12]:
# Turn the loaded rows into the two arrays used by every later cell.
# NOTE(review): presumably X_total is the feature matrix and Y_total the labels,
# and the "fake feature" from the heading is added inside structure_data — confirm there.
X_total, Y_total = structure_data(traindata)

In [13]:
# Replacing undefined data with NaNs

# -999 is the value handed to replace_bad_data_with_nans as the "bad data" marker.
X_nans = replace_bad_data_with_nans(X_total, -999)
# Plain alias, not a copy: Y_nans and Y_total refer to the same object.
Y_nans = Y_total
# Bare last expression so the notebook displays the shape.
X_nans.shape


(250000, 31)

In [5]:
#RUN CELL TO REPLACE NaNs with median
# X_nans is overwritten with the helper's output, so this cell is not idempotent:
# re-running it feeds the already-processed matrix back into replace_nans_with_median.
# NOTE(review): the meaning of threshold=0.5 is defined by the helper — confirm there.
X_nans = replace_nans_with_median(X_nans, threshold=0.5)


In [6]:
#RUN CELL TO BUILD POLY
# Column indices passed to build_poly_index: every index in 0..30 except 0 and 23.
index_list = [col for col in range(31) if col not in (0, 23)]

# Exponents passed to build_poly_index: four fractional powers, then 1 through 5.
degree = [1/5, 1/4, 1/3, 1/2, 1, 2, 3, 4, 5]

# X_nans is overwritten with the expanded matrix; the bare expression below
# makes the notebook display its new shape.
X_nans = build_poly_index(X_nans, index_list, degree)
X_nans.shape

  coltmp=tx[:,i]**d
  coltmp=tx[:,i]**d


(250000, 200)

In [None]:
#RUN CELL TO STANDARDIZE DATA
# X_nans is overwritten with the output of standardize_data, so re-running this
# cell passes the already-standardized matrix through the helper again.
X_nans = standardize_data(X_nans)
# Bare last expression: the notebook displays the resulting array.
X_nans

### Cleaned Data names in the cell below!

In [None]:
# Disabled diagnostic: the snippet below is wrapped in a string literal, so nothing
# in it executes. If enabled it would print each column index of X_nans and show a
# 100-bin histogram of that column.
'''import matplotlib.pyplot as plt

for i in range(X_nans.shape[1]):
    col = X_nans[:,i]
    print(i)
    plt.hist(col, bins=100)
    plt.show()'''

In [9]:
# Sanity check before modelling: show the shapes of X_nans and Y_nans side by side.
print(X_nans.shape, Y_nans.shape)


(250000, 200) (250000,)


# LPM

## Least Squares

In [20]:
# Least squares: one 4-fold cross-validation run (seed=1) on X_nans / Y_nans.
# cross_validation returns a pair; going by the variable naming, the first value
# is the training accuracy and the second the test accuracy.
meanacc_dtest_ls, meanacc_dtrain_ls = [], []

acc_tr, acc_te = cross_validation(
    Y_nans, X_nans,
    k_fold=4, seed=1,
    function_name='least_squares',
)
meanacc_dtrain_ls.append(acc_tr)
meanacc_dtest_ls.append(acc_te)


In [21]:
# Report the single least-squares result: index 0 of each list is the only entry
# appended by the cross-validation cell. "Data:" labels the training-side value.
print("No NaNs accuracy:")
print("Test:",meanacc_dtest_ls[0],"Data:", meanacc_dtrain_ls[0])

No NaNs accuracy:
Test: 0.657332 Data: 0.657332


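<b>Comment: </b>Same result as in the other notebook; the accuracy is acceptable. Note that the test and training accuracies printed above are identical, which is worth double-checking.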

## Gradient Descent

In [None]:
# Gradient descent: one 4-fold cross-validation run (seed=1) with a fixed step
# size and iteration budget, timed end to end.
meanacc_dtest_gd = []
meanacc_dtrain_gd = []

# perf_counter (imported in the first cell) is a monotonic, high-resolution clock,
# so the elapsed time cannot be distorted by system clock adjustments.
start = perf_counter()
# The first returned value is stored as the training result, the second as the test result.
dtmp_tr,dtmp_te=cross_validation(Y_nans,X_nans,k_fold=4,seed=1, function_name='gd', gamma=0.000002, max_iters=1000)
meanacc_dtest_gd.append(dtmp_te)
meanacc_dtrain_gd.append(dtmp_tr)

end = perf_counter()
# Elapsed seconds for the whole cross-validation call.
print(end - start)

In [None]:
# Report the single gradient-descent result: index 0 of each list is the only entry
# appended by the cross-validation cell. "Data:" labels the training-side value.
print("No NaNs accuracy:")
print("Test:",meanacc_dtest_gd[0],"Data:", meanacc_dtrain_gd[0])

<b>Comment:</b> Apparently, GD does not converge fast enough with this step size and iteration budget, so we get a higher error than with Least Squares.

## Stochastic Gradient Descent

In [None]:
# Stochastic gradient descent: one 4-fold cross-validation run (seed=1) with a
# fixed step size and iteration budget, timed end to end.
meanacc_dtest_sgd = []
meanacc_dtrain_sgd = []

# perf_counter (imported in the first cell) is a monotonic, high-resolution clock,
# so the elapsed time cannot be distorted by system clock adjustments.
now = perf_counter()
# The first returned value is stored as the training result, the second as the test result.
dtmp_tr,dtmp_te=cross_validation(Y_nans,X_nans,k_fold=4,seed=1, function_name='sgd', gamma=0.0000001, max_iters=1000)
meanacc_dtest_sgd.append(dtmp_te)
meanacc_dtrain_sgd.append(dtmp_tr)

final = perf_counter()
# Elapsed seconds for the whole cross-validation call.
print(final - now)

In [None]:
# Report the single SGD result: index 0 of each list is the only entry appended
# by the cross-validation cell. "Data:" labels the training-side value.
print("No NaNs accuracy:")
print("Test:",meanacc_dtest_sgd[0],"Data:", meanacc_dtrain_sgd[0])

<b>Comment: </b>Same result as with GD, and SGD does not even appear to be faster. It is worth comparing the elapsed times printed by the GD and SGD cells to confirm this.

## Ridge Regression

In [None]:
# Ridge regression: 4-fold cross-validation (seed=1) for each of 15 lambda values
# log-spaced between 1e-5 and 1. Exactly one result pair is stored per lambda.
meanacc_dtest_rr, meanacc_dtrain_rr = [], []

for lambda_ in np.logspace(-5, 0, 15):
    acc_tr, acc_te = cross_validation(
        Y_nans, X_nans,
        k_fold=4, seed=1,
        function_name='ridge_regression',
        lambda_=lambda_,
    )
    meanacc_dtrain_rr.append(acc_tr)
    meanacc_dtest_rr.append(acc_te)

In [None]:
"""
import matplotlib.pyplot as plt

plt.figure()
plt.plot(np.logspace(-5,0,15), meanacc_dtrain_rr[1::2])

plt.plot(np.logspace(-5,0,15), meanacc_dtrain_rr[::2])
plt.show()

print(meanacc_dtest_rr[::2])
"""

In [None]:
# Report the ridge results for every lambda. The ridge loop appends exactly one
# entry per lambda to each list, so the lists are printed in full; a [::2] stride
# here would silently drop every second lambda's result.
print("No NaNs accuracy:")
print("Test:\n",meanacc_dtest_rr,"\n\n Data:\n", meanacc_dtrain_rr)

# Logit

## Logistic Regression

In [None]:
# Logistic regression: one 3-fold cross-validation run (seed=1) with a very small
# step size and a 10000-iteration budget.
meanacc_dtest_lr, meanacc_dtrain_lr = [], []

# Step size; kept as a notebook-level name, as in the other cells.
gamma = 5e-11

# The first returned value is stored as the training result, the second as the test result.
acc_tr, acc_te = cross_validation(
    Y_nans, X_nans,
    k_fold=3, seed=1,
    function_name='logistic_regression',
    max_iters=10000, gamma=gamma,
)
meanacc_dtrain_lr.append(acc_tr)
meanacc_dtest_lr.append(acc_te)


In [None]:
# Report the single logistic-regression result: index 0 of each list is the only
# entry appended by the cross-validation cell. "Data:" labels the training-side value.
print("No NaNs accuracy:")
print("Test:",meanacc_dtest_lr[0],"Data:", meanacc_dtrain_lr[0])


## Reg Logistic Regression

In [None]:
meanacc_dtest_rlr = []
meanacc_dtrain_rlr = []

gamma = 0.0000000005

for lambda_ in np.logspace(-5,0,5):
    # Execute for data originally with no NaNs
    dtmp_tr,dtmp_te=cross_validation(Y_no_nans,X_no_nans,k_fold=3,seed=1, function_name='reg_logistic_regression', max_iters=10000,gamma=gamma,lambda_=lambda_)
    meanacc_dtest_rlr.append(dtmp_te)
    meanacc_dtrain_rlr.append(dtmp_tr)

    # Execute for data originally with NaNs
    dtmp_tr,dtmp_te=cross_validation(Y_cleaned,X_cleaned,k_fold=3,seed=1, function_name='reg_logistic_regression', max_iters=10000,gamma=gamma,lambda_=lambda_)
    meanacc_dtest_rlr.append(dtmp_te)
    meanacc_dtrain_rlr.append(dtmp_tr)

In [None]:
print("No NaNs accuracy:")
print("Test:",meanacc_dtest_rlr[::2],"Data:", meanacc_dtrain_rlr[::2])
print("\nCleaned NaNs accuracy:", )
print("Test:",meanacc_dtest_rlr[1::2],"Data:", meanacc_dtrain_rlr[1::2])

Too slow! Find a good lambda before running the full sweep.

TODO:

Try with polynomial expansion and standardization