In [10]:
import pandas as pd
import numpy as np
import random
from implementations import *
from helpers import load_csv_data

In [1]:
# Notebook-wide random seed; cross_validation reads this global when it
# shuffles the row indices into folds.
seed = 7

In [113]:
# load_csv_data returns three values, unpacked here as (labels, features, ids)
# following the variable names.
train_labels, train_data, train_ids = load_csv_data("data/train.csv")
# Fix: the test split was previously read from data/train.csv, which made
# test_data/test_ids an exact copy of the training set.
# NOTE(review): assumes the test file sits next to train.csv as data/test.csv.
_, test_data, test_ids = load_csv_data("data/test.csv")


In [119]:
# Standardise every feature column with statistics computed on the training
# data only; the same mean/std are reused for the test data.
train_data_mean = train_data.mean(axis=0)
train_data_std = train_data.std(axis=0)
# A constant column has std == 0 and would turn into NaN/inf after the
# division; dividing by 1 instead leaves such a column at exactly 0.
train_data_std = np.where(train_data_std == 0, 1.0, train_data_std)
x = (train_data - train_data_mean)/train_data_std
test_x = (test_data - train_data_mean)/train_data_std
y = train_labels

Helper functions for testing the different regression methods.

In [152]:
def compute_loss(y, tx, model):
    """Mean squared error of the linear prediction `tx @ model` against `y`.

    Args:
        y:     targets, one entry per sample along axis 0
        tx:    feature matrix that is multiplied with `model`
        model: weight vector

    Returns:
        The sum of squared residuals divided by y.shape[0] (no 1/2 factor).
    """
    residual = y - (tx @ model)
    n_samples = y.shape[0]
    squared_sum = residual.T @ residual
    return squared_sum / n_samples

def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of y and cut them into k_fold equal-sized folds.

    The global NumPy random state is re-seeded with `seed` on every call, so
    the same (N, k_fold, seed) always yields the same folds. Each fold holds
    int(N / k_fold) indices; when N is not a multiple of k_fold the trailing
    indices of the permutation are left out of every fold.

    Args:
        y:      shape=(N,), only its length is used
        k_fold: K in K-fold, i.e. the number of folds
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) whose row i lists the data
        indices belonging to fold i

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

def build_poly(x, degree):
    """Polynomial basis expansion of x, for powers j=0 up to j=degree.

    The output starts with a single column of ones (power 0), followed by one
    block of D columns per power: x, x**2, ..., x**degree (element-wise, no
    cross terms between features).

    Args:
        x: numpy array of shape (N,D)
        degree: non-negative integer, highest power to include.

    Returns:
        poly: numpy array of shape (N, D*degree + 1). For degree=0 this is
        only the column of ones, shape (N, 1).

    Raises:
        ValueError: if degree is negative.
    """
    if degree < 0:
        raise ValueError("degree must be a non-negative integer")

    n_samples = x.shape[0]
    blocks = [np.ones((n_samples, 1))]
    if degree >= 1:
        # Work in the dtype the stacked result will have (at least float64),
        # so the powers are computed with the same precision as the output.
        power = x.astype(np.result_type(np.float64, x.dtype))
        blocks.append(power)
        for _ in range(degree - 1):
            # Next power is the previous one times x.
            power = power * x
            blocks.append(power)

    # Stack once at the end instead of re-copying the growing matrix per power.
    return np.hstack(blocks)

def cross_validation(train_model, degree):
    """Train on three of four folds and score on the remaining fold.

    The notebook-level data `x`/`y` is expanded with build_poly, the rows are
    split into 4 folds with build_k_indices (using the notebook-level `seed`),
    the model is fitted on the first 3 folds and evaluated on the last one.
    Only this single split is evaluated; the folds are not rotated.

    Args:
        train_model: func(y, tx) -> (model, train_loss)
        degree: integer, degree of the polynomial expansion passed to build_poly

    Returns:
        model: the weights returned by train_model
        train_loss: the loss reported by train_model on the training folds
        test_loss: compute_loss (mean squared error) on the held-out fold
    """
    tx = build_poly(x, degree)
    k = 4
    # Fix: the folds are built from `y`; the name `train_y` used previously is
    # never defined in this notebook and raised NameError on a fresh kernel.
    k_indices = build_k_indices(y, k, seed)
    test_idx = k_indices[k - 1]
    train_idx = k_indices[:k - 1].flatten()
    model, train_loss = train_model(y[train_idx], tx[train_idx])
    test_loss = compute_loss(y[test_idx], tx[test_idx], model)
    return model, train_loss, test_loss

Plotting helper and search over the regularisation parameter lambda.

In [167]:
import numpy as np
import matplotlib.pyplot as plt

def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot train and test error against lambda and save the figure.

    Both curves are drawn on a logarithmic x axis (train in blue, test in
    red) on the current matplotlib figure, which is then written to the file
    "cross_validation".
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, name in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=name)

    plt.xlabel("lambda")
    plt.ylabel("r mse")
    plt.title("cross validation")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")
    
def cross_validation_explore_lambda(train_model, degree, lambdas):
    """Search over the regularisation parameter lambda and plot the losses.

    Every candidate lambda is trained and scored through cross_validation
    (which also decides the data split and the seed), the train/test losses
    are plotted with cross_validation_visualization, and the lambda with the
    lowest test loss is reported. Ties go to the first such lambda.

    Args:
        train_model: func(y, tx, lambda) -> (model, train_loss)
        degree: integer, degree of the polynomial expansion
        lambdas: shape = (p, ) where p is the number of values of lambda to test
    Returns:
        best_lambda : scalar, value of the best lambda
        best_mse : scalar, the associated test loss for the best lambda
    Raises:
        ValueError: if lambdas is empty.
    """
    lambda_values = np.asarray(lambdas).ravel()
    if lambda_values.size == 0:
        raise ValueError("lambdas must contain at least one value")

    # Losses on the training folds and on the held-out fold, one per lambda.
    mse_tr = []
    mse_te = []
    for lambda_ in lambda_values:
        # Fix: cross_validation returns (model, train_loss, test_loss) in that
        # order; the two losses used to be unpacked the wrong way round, so
        # the "best" lambda was picked on the training loss.
        # The default argument pins the current lambda inside the closure.
        _, tr, te = cross_validation(
            lambda y, tx, lam=lambda_: train_model(y, tx, lam), degree
        )
        mse_tr.append(tr)
        mse_te.append(te)

    best_idx = int(np.argmin(mse_te))
    best_lambda = lambda_values[best_idx]
    best_mse = mse_te[best_idx]

    cross_validation_visualization(lambda_values, mse_tr, mse_te)
    print("For polynomial expansion up to degree %.f, the choice of lambda which leads to the best test mse is %.5f with a test mse of %.3f" % (degree, best_lambda, best_mse))
    return best_lambda, best_mse

# Candidate regularisation strengths: 30 values spaced evenly on a log scale
# from 10**-4 to 10**0, passed to cross_validation_explore_lambda below.
lambdas = np.logspace(-4, 0, 30)

Now try the different regression methods and compare their training and test losses.

In [153]:
# least_squares on a degree-2 polynomial expansion; the cell displays the
# tuple (model, train_loss, test_loss) returned by cross_validation.
cross_validation(train_model=least_squares, degree=2)

(array([-4.45304550e+03,  8.64846646e-02, -2.43657221e-01, -2.41641346e-01,
         1.18422980e-01, -1.47201046e+04,  3.79406884e-02,  1.36063684e+03,
         2.68124417e-01, -1.35710250e-02, -2.57282077e+02, -1.87218726e-01,
         5.38064206e-02,  2.03773787e+04,  5.00541573e+01, -1.13227276e-03,
         1.92713269e-04,  4.93056675e+01,  4.96668781e-04,  9.14434997e-04,
         4.75368576e-02,  2.18275446e-03, -3.79968526e-02,  3.14001763e+04,
         1.94578616e+00, -1.63052605e+04, -2.37205201e+02,  3.06500002e-01,
        -2.16379340e+04,  9.58845737e+02,  2.17898803e+02,  3.78063163e-02,
         3.64114507e-02,  5.95306023e-03,  7.71002751e-03,  4.69456425e+03,
        -1.11906469e-02, -4.39827442e+02, -5.83697764e-02,  2.23967611e-03,
        -9.08764862e-03,  2.03170056e-02,  5.85812603e-02, -6.45995214e+03,
        -1.78836558e-02, -3.29401946e-02, -3.03005306e-03, -2.31037648e-02,
        -5.71582613e-02, -2.06890845e-03, -7.99635408e-03,  4.40089202e-04,
        -1.9

In [None]:
# Lambda search for ridge_regression on a degree-3 expansion. The function is
# passed directly: it is called as train_model(y, tx, lambda_).
cross_validation_explore_lambda(ridge_regression, 3, lambdas)

In [156]:
# Logistic regression on the degree-1 expansion, starting from all-zero weights.
# NOTE(review): 100 and 0.001 are presumably the iteration count and step
# size -- confirm against logistic_regression in implementations.
# NOTE(review): cross_validation scores the held-out fold with compute_loss
# (squared error of tx @ model); confirm that this is comparable with the
# training loss that logistic_regression returns.
cross_validation(
    lambda labels, features: logistic_regression(
        labels, features, np.zeros(features.shape[1]), 100, 0.001
    ),
    1,
)

(array([-8.05291911e-02,  2.17067256e-02, -3.22149173e-02, -1.26231262e-03,
         1.65392468e-02,  1.13124218e-02,  1.61151151e-02,  1.12084660e-02,
         2.09079318e-03, -2.08117141e-03,  1.24262748e-02, -1.82336961e-02,
         2.44234779e-02,  1.12837765e-02,  2.12926998e-02,  8.52438106e-06,
        -4.54075796e-04, -3.30284901e-03, -1.78764076e-06,  3.97694330e-04,
         1.50958797e-03,  6.91787073e-04,  1.08647655e-02,  1.05152289e-02,
         1.32004359e-02,  1.26264709e-02,  1.26253124e-02,  1.12036398e-02,
         1.12762249e-02,  1.12751909e-02,  1.05438203e-02]),
 0.5761583841141812,
 0.882165687969785)