In [4]:
# NOTE: keep the memory allocated for the machine limited to 2GB!

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def zero_one_error_labels(y, y_hat):
    """Count misclassified labels (unnormalised zero-one loss).

    Parameters
    ----------
    y : array-like
        True labels.
    y_hat : array-like
        Predicted labels, same shape as ``y``.

    Returns
    -------
    numpy integer
        Number of positions at which ``y_hat`` differs from ``y``.
    """
    # The previous implementation summed `y_hat == y`, i.e. it counted the
    # *correct* predictions, which contradicts the "error" in the name.
    # np.asarray makes plain Python lists compare element-wise as well.
    return np.sum(np.asarray(y_hat) != np.asarray(y))

def mse(y, y_hat):
    """Return the mean squared error between ``y`` and ``y_hat``.

    Parameters
    ----------
    y : numpy.ndarray
        Reference values.
    y_hat : numpy.ndarray
        Predicted values; must be subtractable from ``y``.

    Returns
    -------
    float
        Mean over all elements of the squared difference ``y - y_hat``.
    """
    residuals = y - y_hat
    squared = np.square(residuals)
    return np.mean(squared)


In [5]:
# --- Load the training set ---
# The last CSV column is used as the target, all preceding columns as features.
print("Loading training data ...")
data_train = np.genfromtxt(
    "../../data/neighbors/train.csv",
    comments="#",
    delimiter=",",
)
Xtrain = data_train[:, :-1]
ytrain = data_train[:, -1]
# Xtrain.shape is the (n, d) pair consumed by the two %i placeholders
print("Loaded training data: n=%i, d=%i" % Xtrain.shape)

# --- Load the (large) test set, split the same way as the training set ---
print("Loading testing data ...")
data_test = np.genfromtxt(
    "../../data/neighbors/test.csv",
    comments="#",
    delimiter=",",
)
Xtest = data_test[:, :-1]
ytest = data_test[:, -1]
print("Loaded testing data: n=%i, d=%i" % Xtest.shape)


Loading training data ...
Loaded training data: n=100000, d=15
Loading testing data ...
Loaded testing data: n=1000000, d=15


In [7]:
# --- Load the validation set ---
# The last CSV column is used as the target, all preceding columns as features.
print("Loading validation data ...")
data_validation = np.genfromtxt(
    "../../data/neighbors/validation.csv",
    comments="#",
    delimiter=",",
)
Xvd = data_validation[:, :-1]
yvd = data_validation[:, -1]
# Xvd.shape is the (n, d) pair consumed by the two %i placeholders
print("Loaded validation data: n=%i, d=%i" % Xvd.shape)


Loading validation data ...
Loaded validation data: n=100000, d=15


In [8]:
# training phase
print("Fitting model ...")
# nearest neighbor regression model (DO NOT CHANGE PARAMETERS!)
model = KNeighborsRegressor(n_neighbors=10, algorithm="kd_tree")
%time model.fit(Xtrain, ytrain)
print("Model fitted!")

print("Applying model ...")
% time preds = model.predict(Xtest)

# output (here, 'preds' must be a list containing all predictions)
print("Predictions computed for %i patterns ...!" % len(preds))
print("Mean of predictions: %f" % np.mean(np.array(preds)))

Fitting model ...
CPU times: user 260 ms, sys: 0 ns, total: 260 ms
Wall time: 512 ms
Model fitted!
Applying model ...
CPU times: user 3min 41s, sys: 500 ms, total: 3min 42s
Wall time: 3min 45s
Predictions computed for 1000000 patterns ...!
Mean of predictions: 0.351215


In [9]:
# Report the test-set error of the predictions in `preds`
print("Mean Squared Error on test: %f" % (mse(ytest, preds),))

Mean Squared Error on test: 0.225451


In [10]:
# select only first 5 columns from the train data
Xtrain_5feat = data_train[:,0:5]

print("Loaded training data: n=%i, d=%i" % (Xtrain_5feat.shape[0], Xtrain_5feat.shape[1]))

# training phase
print("Fitting model ...")
%time model.fit(Xtrain_5feat, ytrain)
print("Model fitted!")

# testing phase (apply model to a big test set!)
Xtest_5feat = data_test[:,0:5]

print("Applying model ...")
%time preds_5feat = model.predict(Xtest_5feat)

# output (here, 'preds' must be a list containing all predictions)
print("Predictions computed for %i patterns ...!" % len(preds))
print("Mean of predictions: %f" % np.mean(np.array(preds)))

Loaded training data: n=100000, d=5
Fitting model ...
CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 100 ms
Model fitted!
Applying model ...
CPU times: user 30.6 s, sys: 356 ms, total: 31 s
Wall time: 31.5 s
Predictions computed for 1000000 patterns ...!
Mean of predictions: 0.351215


In [88]:
# Report the test-set error of the 5-feature predictions in `preds_5feat`
print("Mean Squared Error on test using only 5 features: %f" % (mse(ytest, preds_5feat),))

Mean Squared Error on test using only 5 features: 0.246566


In [57]:
# Select the best features by iteratively selecting good features
X_tr = data_train[:,:-1]
X_vd = data_validation[:,:-1]
n_feat = 15
n = X_tr.shape[1]
results = np.zeros([n_feat,n])
fs = []
for fi in range(n_feat):
    n_features = fi+1
    for i in range(n):
        if i in fs :
            continue
        # select features , train and predict, keep training error
        f = fs + [i]
        Xtr = X_tr[:,f]
        Xvd = X_vd[:,f]
        print("Fitting for columns %s ..." % str(f))
        %time model.fit(Xtr, ytrain)
        pred = model.predict(Xvd)
        results[n_features-1,i] = mse(yvalidation, pred)
    best = np.argsort(results,1)[n_features-1][fi]
    fs.append(best);

Fitting for columns [0] ...
CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 162 ms
Fitting for columns [1] ...
CPU times: user 140 ms, sys: 0 ns, total: 140 ms
Wall time: 144 ms
Fitting for columns [2] ...
CPU times: user 140 ms, sys: 0 ns, total: 140 ms
Wall time: 144 ms
Fitting for columns [3] ...
CPU times: user 180 ms, sys: 0 ns, total: 180 ms
Wall time: 188 ms
Fitting for columns [4] ...
CPU times: user 156 ms, sys: 0 ns, total: 156 ms
Wall time: 159 ms
Fitting for columns [5] ...
CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 161 ms
Fitting for columns [6] ...
CPU times: user 204 ms, sys: 0 ns, total: 204 ms
Wall time: 214 ms
Fitting for columns [7] ...
CPU times: user 168 ms, sys: 0 ns, total: 168 ms
Wall time: 168 ms
Fitting for columns [8] ...
CPU times: user 164 ms, sys: 0 ns, total: 164 ms
Wall time: 161 ms
Fitting for columns [9] ...
CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 163 ms
Fitting for columns [10] ...
CPU times: user 176 m

In [55]:
# test with the 5 best features
# `fs` is filled one column per selection round, so its first 5 entries are
# the 5 features chosen first. Slicing keeps this cell meaningful even when
# the selection cell ran for more than 5 rounds.
n_best = 5
best_fs = fs[:n_best]
# Validation targets. Derived here from `data_validation` so the cell does not
# depend on a `yvalidation` left over from an earlier kernel session.
yvalidation = data_validation[:,-1]
# cell-local names so the full `Xvd` from the loading cell is not overwritten
Xtr_best = X_tr[:,best_fs]
Xvd_best = X_vd[:,best_fs]
print("Fitting for columns %s ..." % str(best_fs))
model.fit(Xtr_best, ytrain)
pred = model.predict(Xvd_best)
print("Best MSE is %f ..." % mse(yvalidation, pred))

Fitting for columns [9, 0, 2, 6, 8] ...
Best MSE is 0.187799 ...


In [58]:
# Per selection round (row), list the column indices ordered by ascending
# value stored in `results`
np.argsort(results, axis=1)

array([[ 9,  8, 13, 14, 12,  7,  4,  3, 11,  6,  2,  1,  5, 10,  0],
       [ 9,  0,  5,  1,  6, 10, 11,  3,  2,  7,  8, 12,  4, 13, 14],
       [ 0,  9,  2,  1,  7, 12,  6, 11,  3,  8, 13,  4,  5, 10, 14],
       [ 0,  2,  9,  6, 11,  1,  8,  3, 13, 12,  7,  4,  5, 14, 10],
       [ 0,  2,  6,  9,  8, 13,  3,  7, 12,  1, 11,  4,  5, 14, 10],
       [ 0,  2,  6,  8,  9, 12,  4,  7, 11,  1,  3, 13,  5, 14, 10],
       [ 0,  2,  6,  8,  9, 12, 11,  1,  4,  7,  3, 13, 14,  5, 10],
       [ 0,  2,  6,  8,  9, 11, 12,  1,  7,  4,  3, 13,  5, 14, 10],
       [ 0,  1,  2,  6,  8,  9, 11, 12,  7,  4, 13,  3,  5, 14, 10],
       [ 0,  1,  2,  6,  7,  8,  9, 11, 12,  4, 13,  3,  5, 14, 10],
       [ 0,  1,  2,  4,  6,  7,  8,  9, 11, 12, 13,  3,  5, 14, 10],
       [ 0,  1,  2,  4,  6,  7,  8,  9, 11, 12, 13,  3,  5, 14, 10],
       [ 0,  1,  2,  3,  4,  6,  7,  8,  9, 11, 12, 13,  5, 14, 10],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 10],
       [ 0,  1,  2,  3,  4,  5,  6