In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def zero_one_error_labels(y, y_hat):
    """Return the zero-one error: the number of misclassified labels.

    Parameters
    ----------
    y : array-like
        True labels.
    y_hat : array-like
        Predicted labels, same length as ``y``.

    Returns
    -------
    numpy integer
        Count of positions where ``y_hat`` differs from ``y``.
    """
    # Coerce to arrays so plain Python lists are compared element-wise
    # instead of collapsing to a single bool.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    # An "error" is a mismatch; counting matches would report the number of
    # correct predictions instead.
    return np.sum(y_hat != y)

def mse(y, y_hat):
    """Mean squared error between targets ``y`` and predictions ``y_hat``.

    Both arguments are expected to support element-wise subtraction
    (e.g. NumPy arrays of the same shape). Returns the mean of the squared
    differences.
    """
    residual = y - y_hat
    return np.mean(np.square(residual))

In [4]:
# load data
def _load_split(label, path):
    """Read one CSV split and separate features from the target column.

    Parameters
    ----------
    label : str
        Human-readable split name used in the progress messages.
    path : str
        Path of the comma-separated file; lines starting with ``#`` are
        treated as comments.

    Returns
    -------
    (data, X, y)
        ``data`` is the full 2-D array, ``X`` is every column but the last
        and ``y`` is the last column.
    """
    print("Loading %s data ..." % label)
    # atleast_2d keeps the column slicing below valid even when the file
    # holds a single row (genfromtxt would otherwise return a 1-D array).
    data = np.atleast_2d(np.genfromtxt(path, comments="#", delimiter=","))
    X, y = data[:, :-1], data[:, -1]
    print("Loaded %s data: n=%i, d=%i" % (label, X.shape[0], X.shape[1]))
    return data, X, y


# The last column of every file is the regression target.
data_train, Xtrain, ytrain = _load_split("training", "data/neighbors/train.csv")
data_test, Xtest, ytest = _load_split("testing", "data/neighbors/test.csv")
data_validation, Xvd, yvd = _load_split("validation", "data/neighbors/validation.csv")

Loading training data ...
Loaded training data: n=100000, d=15
Loading testing data ...
Loaded testing data: n=1000000, d=15
Loading validation data ...
Loaded validation data: n=100000, d=15


In [6]:
# training phase
print("Fitting model ...")
model = KNeighborsRegressor(n_neighbors=10, algorithm="kd_tree",n_jobs=-1)
%time model.fit(Xtrain, ytrain)
print("Model fitted!")

print("Applying model ...")
% time preds = model.predict(Xtest)

# output (here, 'preds' must be a list containing all predictions)
print ("Mean Squared Error on test: %f" % mse(ytest,preds))

Fitting model ...
CPU times: user 148 ms, sys: 0 ns, total: 148 ms
Wall time: 147 ms
Model fitted!
Applying model ...
CPU times: user 3min 19s, sys: 5.91 s, total: 3min 25s
Wall time: 52.3 s
Mean Squared Error on test: 0.225451


In [52]:
# Select the best features by iteratively removing bad features
X_tr = data_train[:,:-1]
X_vd = data_validation[:,:-1]
n_feat = 5
n = X_tr.shape[1]
results = np.zeros([n-n_feat,n])
fs = range(n)
epoch=0
while len(fs) > n_feat:
    epoch +=1
    for i in range(n):
        if i not in fs :
            continue
        f = copy(fs);f.remove(i)
        Xtr = X_tr[:,f]
        Xvd = X_vd[:,f]
        print("Fitting for columns %s ..." % str(f))
        %time model.fit(Xtr, ytrain)
        pred = model.predict(Xvd)
        results[epoch-1,i] = mse(yvd, pred)
    worst = np.argsort(results[epoch-1,:])[-1]
    print("Worst column was %i , removing it ..." % worst)
    fs.remove(worst)

Fitting for columns [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 128 ms, sys: 0 ns, total: 128 ms
Wall time: 126 ms
Fitting for columns [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 122 ms
Fitting for columns [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 116 ms, sys: 0 ns, total: 116 ms
Wall time: 119 ms
Fitting for columns [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 118 ms
Fitting for columns [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 121 ms
Fitting for columns [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 124 ms, sys: 0 ns, total: 124 ms
Wall time: 126 ms
Fitting for columns [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14] ...
CPU times: user 124 ms, sys: 0 ns, total: 124 ms
Wall time: 124 ms
Fitting for columns [0, 1, 

In [59]:
# One row per elimination round: column indices ordered from the lowest to
# the highest recorded MSE. Columns that were not evaluated in a round still
# hold the initial 0 and therefore appear first.
ranking_per_epoch = np.argsort(results)
print(ranking_per_epoch)

[[10 14  5  0  9 13  3  7 12  8  1  4  6 11  2]
 [ 2 10 14  5  0  9  8 13  3  4  7 12  6  1 11]
 [ 2 11 10 14  0  5 13  9  8  4  3  7 12  6  1]
 [ 1  2 11 10 14  0  5 13  9  8  4  3  7 12  6]
 [ 1  2  6 11 10 14  5  0 13  8  3  9  4  7 12]
 [ 1  2  6 11 12 10 14  5  0  8  3  9 13  4  7]
 [ 1  2  6  7 11 12 10 14  0  5  9  4  8 13  3]
 [ 1  2  3  6  7 11 12 10  0  5 14  9 13  8  4]
 [ 1  2  3  4  6  7 11 12 10 14  5  0 13  8  9]
 [ 1  2  3  4  6  7  9 11 12 10 13  5  8  0 14]]


In [56]:
# test with the 5 best features
Xtr = X_tr[:,fs]
Xvd = X_vd[:,fs]
print("Fitting for columns %s ..." % str(fs))
model.fit(Xtr, ytrain)
%time pred = model.predict(Xvd)
print("Best MSE is %f ..." % mse(yvd, pred))

Fitting for columns [0, 5, 8, 10, 13] ...
CPU times: user 2.88 s, sys: 4 ms, total: 2.88 s
Wall time: 822 ms
Best MSE is 0.244051 ...
