In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold

boston=load_boston
X_data,t=load_boston(return_X_y=True) 
# split the data into training and validation set   
X_train, X_valid, t_train, t_valid = train_test_split(X_data,t,test_size=1/5, random_state=10) 

In [12]:
N = len(X_train)
M = len(X_valid)


# function of getting the predication y; Input = X, t; Output = y, w
def getRMSE(X_train,X_valid,t_train,t_valid):
    new_col = np.ones(len(X_train))
    Xtr = np.column_stack((new_col,X_train))    # add a column of one
    A = np.dot(Xtr.T,Xtr)
    A1 = np.linalg.inv(A) #the inverse of A
    B = np.dot(Xtr.T,t_train)
    w =np.dot(A1,B)
    y = np.dot(Xtr,w)
    
    # prepare the validation set
    new_col_1 = np.ones(len(X_valid))
    Xva = np.column_stack((new_col_1, X_valid))
    y_valid = np.dot(Xva,w)
    diff_valid = np.subtract(t_valid,y_valid)
    err_valid = np.dot(diff_valid, diff_valid)/len(X_valid)
    RMSE_valid = np.sqrt(err_valid)

    return RMSE_valid


In [13]:
def kNN(K, X_train_one, X_valid_one, t_train, t_valid):

    N = len(X_train)
    M = len(X_valid)
    #compute distances from validation points to training points
    dist_one = np.zeros((M,N))

    for i in range(M):
        for j in range(N):
            z = (X_valid_one[i]-X_train_one[j])
            dist_one[i,j] = z*z
    #sort distances for each x_i
    ind_one = np.argsort(dist_one, axis=1) # dist(i,ind[k]) is the kth smallest distance to x_i
    #implement kNN with one feature

    y_one = np.zeros(M) # row k stores predictions made by (k+1)-NN
    err_one = np.zeros(K)

    for i in range(M):
        #compute prediction for x_i
        for s in range(K):
             y_one[i] += t_train[ind_one[i,s]]
        y_one[i] /= (K+1)
        # compute error for this k
    z = y_one- t_valid
    err_one= np.dot(z,z)/M
    RMSE_one = np.sqrt(err_one)

    return RMSE_one

In [14]:
def kFold(splits, X, t):
    res = 0
    kf=KFold(n_splits=splits)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        t_train, t_test = t[train_index], t[test_index]

        res = res + getRMSE(X_train,X_test,t_train,t_test)
  
    error = res/splits

    return error    

In [15]:
K=13
S = np.ones((N,1))  # dummy ones
S_index = []    # store the orginal column number
error = np.zeros((K,1))
existingCol = np.arange(K)

for i in range(K):
    #print(existingCol)
    if (i==0):
        for j in existingCol:
            X_train_temp=X_train[:,j]  # only select a column for the first run  
            error[j] = kFold(5, X_train_temp, t_train)
            S = X_train[:,np.argmin(error)]

        existingCol_new =np.delete(existingCol,np.where(existingCol == np.argmin(error) ))
        existingCol = existingCol_new
       

    elif (i==1):
        for j in existingCol:    
            X_train_temp = np.column_stack((S, X_train[:,j]))
         
            error[j] = kFold(5, X_train_temp, t_train)

        S = np.column_stack((S,X_train[:,np.argmin(error)]))
        existingCol_new =np.delete(existingCol,np.where(existingCol == np.argmin(error) ))
        existingCol = existingCol_new

    else:
        for j in existingCol:
          
            X_train_temp = np.hstack( (S,np.reshape(X_train[:,j], (len(X_train), 1))) ) #######


            error[j] = kFold(5, X_train_temp, t_train)

        print(error)
        S = np.hstack( (S,  np.reshape(X_train[:,np.argmin(error)], (len(X_train), 1))) )
        existingCol_new =np.delete(existingCol,np.where(existingCol == np.argmin(error) ))
        existingCol = existingCol_new

print(S.shape)
        



        





[[5.24242528]
 [5.29950209]
 [5.33419043]
 [5.28330726]
 [5.34084303]
 [5.32039862]
 [5.34382244]
 [5.28763   ]
 [5.32530521]
 [5.27336739]
 [5.02037302]
 [5.17401269]
 [5.79457118]]
[[4.9818628 ]
 [5.03972185]
 [5.04847953]
 [5.01030386]
 [5.0264961 ]
 [5.32039862]
 [5.03762804]
 [4.95851118]
 [5.06314443]
 [5.04532319]
 [5.02037302]
 [4.88454129]
 [5.79457118]]
[[4.89570675]
 [4.90307311]
 [4.90503734]
 [4.8820218 ]
 [4.90292127]
 [5.32039862]
 [4.89793316]
 [4.78847823]
 [4.90534308]
 [4.91773015]
 [5.02037302]
 [4.88454129]
 [5.79457118]]
[[4.77395286]
 [4.7646271 ]
 [4.77096031]
 [4.79473092]
 [4.71784658]
 [5.32039862]
 [4.81604554]
 [4.78847823]
 [4.81172297]
 [4.80553853]
 [5.02037302]
 [4.88454129]
 [5.79457118]]
[[4.71008077]
 [4.68841444]
 [4.72789895]
 [4.72106765]
 [4.71784658]
 [5.32039862]
 [4.75310589]
 [4.78847823]
 [4.71723786]
 [4.74565143]
 [5.02037302]
 [4.88454129]
 [5.79457118]]
[[4.65304816]
 [4.68841444]
 [4.69581932]
 [4.67802004]
 [4.71784658]
 [5.32039862]
 

LinAlgError: Singular matrix