In [2]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def zero_one_error_labels(y, y_hat):
    """Return the zero-one error: the number of mismatching predictions.

    Parameters
    ----------
    y : array-like
        Reference labels.
    y_hat : array-like
        Predicted labels, broadcastable against ``y``.

    Returns
    -------
    integer
        Count of positions where ``y_hat`` differs from ``y``.
    """
    # Convert first so plain Python lists are compared element-wise instead
    # of collapsing into a single boolean.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    # An *error* count tallies disagreements, hence `!=` rather than `==`.
    return np.sum(y_hat != y)

def mse(y, y_hat):
    """Return the mean squared error between targets ``y`` and predictions ``y_hat``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of matching or broadcastable shape).
    """
    residuals = y - y_hat
    squared = np.power(residuals, 2)
    return np.mean(squared)

In [3]:
# Load the three CSV splits. In each file the last column is the regression
# target and all preceding columns are features.
def _load_split(label, path):
    """Read one CSV split, report its size, and return (raw, features, targets)."""
    print("Loading %s data ..." % label)
    raw = np.genfromtxt(path, comments="#", delimiter=",")
    features, targets = raw[:, :-1], raw[:, -1]
    print("Loaded %s data: n=%i, d=%i" % (label, features.shape[0], features.shape[1]))
    return raw, features, targets


data_train, Xtrain, ytrain = _load_split("training", "data/neighbors/train.csv")

data_test, Xtest, ytest = _load_split("testing", "data/neighbors/test.csv")

data_validation, Xvd, yvd = _load_split("validation", "data/neighbors/validation.csv")

Loading training data ...
Loaded training data: n=100000, d=15
Loading testing data ...
Loaded testing data: n=1000000, d=15
Loading validation data ...
Loaded validation data: n=100000, d=15


In [4]:
# training phase
print("Fitting model ...")
model = KNeighborsRegressor(n_neighbors=10, algorithm="kd_tree",n_jobs=-1)
%time model.fit(Xtrain, ytrain)
print("Model fitted!")

print("Applying model ...")
% time preds = model.predict(Xtest)

# output (here, 'preds' must be a list containing all predictions)
print ("Mean Squared Error on test: %f" % mse(ytest,preds))

Fitting model ...
CPU times: user 168 ms, sys: 12 ms, total: 180 ms
Wall time: 301 ms
Model fitted!
Applying model ...
CPU times: user 3min 19s, sys: 1.77 s, total: 3min 21s
Wall time: 51.4 s
Mean Squared Error on test: 0.225451


In [9]:
from copy import copy

# Select the best features by iteratively removing bad features
X_tr = data_train[:,:-1]
X_vd = data_validation[:,:-1]
n_feat = 5
n = X_tr.shape[1]
results = np.zeros([n-n_feat,n])
fs = range(n)
epoch=0
while len(fs) > n_feat:
    epoch +=1
    for i in range(n):
        if i not in fs :
            continue
        f = copy(fs);f.remove(i)
        Xtr = X_tr[:,f]
        Xvd = X_vd[:,f]
        print("Fitting for columns %s ..." % str(f))
        %time model.fit(Xtr, ytrain)
        pred = model.predict(Xvd)
        results[epoch-1,i] = mse(yvd, pred)
    worst = np.argsort(results[epoch-1,:])[epoch-1]
    print("Found lowest MSE after removing %i ..." % worst)
    fs.remove(worst)

Fitting for columns [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 168 ms, sys: 4 ms, total: 172 ms
Wall time: 171 ms
Fitting for columns [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 164 ms, sys: 0 ns, total: 164 ms
Wall time: 164 ms
Fitting for columns [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 132 ms, sys: 0 ns, total: 132 ms
Wall time: 132 ms
Fitting for columns [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 163 ms
Fitting for columns [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 164 ms, sys: 0 ns, total: 164 ms
Wall time: 162 ms
Fitting for columns [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 172 ms, sys: 0 ns, total: 172 ms
Wall time: 173 ms
Fitting for columns [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 168 ms, sys: 0 ns, total: 168 ms
Wall time: 168 ms
Fitting for columns [0, 1, 

In [10]:
# For each elimination round (row), list feature indices ordered by ascending
# recorded MSE. Features that were not evaluated in a round keep their initial
# value of 0 and therefore appear at the front of that row.
print(results.argsort(axis=-1))

[[10 14  5  0  9 13  3  7 12  8  1  4  6 11  2]
 [10 14  0  5  6 12  3  1 11  9  8  7 13  2  4]
 [10 14  5  0  8  6  1  9  3  4 13 12  7  2 11]
 [ 5 10 14  6 12  1  7  8 11  4  3  9 13  2  0]
 [ 5  6 10 14 13  7  9  8 12  3  4  2 11  1  0]
 [ 5  6 10 13 14  3 12  7  9  2  4  8 11  1  0]
 [ 3  5  6 10 13 14 12  9  4  7  2 11  1  8  0]
 [ 3  5  6 10 12 13 14  4  9  7 11  2  1  8  0]
 [ 3  4  5  6 10 12 13 14 11  7  1  2  8  9  0]
 [ 3  4  5  6 10 11 12 13 14  7  2  8  9  1  0]]


In [16]:
# test with the 5 best features
Xtr = X_tr[:,fs]
Xvd = X_vd[:,fs]
X_test = Xtest[:,fs]

print("Fitting for columns %s for validation..." % str(fs))
model.fit(Xtr, ytrain)
%time pred = model.predict(Xvd)
print("Best MSE for Validation is %f ..." % mse(yvd, pred))

print("Fitting for columns %s for test..." % str(fs))
%time pred = model.predict(X_test)
print("Best MSE for test is %f ..." % mse(ytest, pred))

Fitting for columns [0, 1, 2, 8, 9] for validation...
CPU times: user 2.44 s, sys: 4 ms, total: 2.45 s
Wall time: 721 ms
Best MSE for Validation is 0.188903 ...
Fitting for columns [0, 1, 2, 8, 9] for test...
CPU times: user 26 s, sys: 64 ms, total: 26.1 s
Wall time: 6.87 s
Best MSE for test is 0.223736 ...
