In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold

# NOTE(review): `boston` is bound to the loader function itself (it is not
# called here), so it does not hold a dataset object -- confirm whether any
# other cell relies on this name.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston

# Feature matrix and target vector returned as a plain (X, y) pair.
X_data, t = load_boston(return_X_y=True)

# Hold out one fifth of the rows for validation; the fixed random_state keeps
# the split identical across runs.
X_train, X_valid, t_train, t_valid = train_test_split(
    X_data,
    t,
    test_size=0.2,
    random_state=10,
)

In [22]:

def getRMSE(X_train, X_valid, t_train, t_valid, arg):
    """Fit least squares on the training data and return the validation RMSE.

    Parameters
    ----------
    X_train, X_valid : ndarray
        Feature values for the training and validation rows, either 1-D
        (a single feature) or 2-D (rows x features).
    t_train, t_valid : ndarray
        1-D target values matching the rows of ``X_train`` / ``X_valid``.
    arg : int
        Basis selector forwarded unchanged to ``getTrainingSet``.

    Returns
    -------
    float
        Root-mean-squared error of the fitted model on the validation rows.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the normal-equation matrix ``Xtr.T @ Xtr`` is singular.
    """
    Xtr = getTrainingSet(X_train, arg)

    # Solve (Xtr^T Xtr) w = Xtr^T t directly instead of forming the explicit
    # inverse: same solution, fewer operations, better conditioned.
    gram = np.dot(Xtr.T, Xtr)
    moment = np.dot(Xtr.T, t_train)
    w = np.linalg.solve(gram, moment)

    # Build the validation design matrix with the same helper as the training
    # one so both always have matching columns, whatever `arg` selects.
    Xva = getTrainingSet(np.asarray(X_valid), arg)
    diff_valid = np.subtract(t_valid, np.dot(Xva, w))

    # Mean squared residual over the validation rows, then its square root.
    mse_valid = np.dot(diff_valid, diff_valid) / len(X_valid)
    return np.sqrt(mse_valid)

def getTrainingSet(X, arg):
    """Build a design matrix for ``X`` with a leading bias column of ones.

    Parameters
    ----------
    X : array-like
        Feature values, either 1-D (one feature) or 2-D (rows x features).
    arg : int
        Basis selector. ``0`` appends the raw feature columns after the bias
        column. Any other value returns the bias column alone (no other basis
        is implemented in this function).

    Returns
    -------
    ndarray
        2-D array with one row per row of ``X``; column 0 is all ones.
    """
    X = np.asarray(X)
    bias = np.ones((X.shape[0], 1))

    if arg != 0:
        return bias

    # column_stack turns a 1-D X into a single column and takes a 2-D X as-is,
    # so one call covers both cases without re-copying the matrix per column.
    return np.column_stack((bias, X))
 
    


In [23]:
def kFold(splits, X, t, arg):
    """Return the mean validation RMSE of ``getRMSE`` over a K-fold split.

    Parameters
    ----------
    splits : int
        Number of folds passed to ``KFold`` (its other options are left at
        the library defaults).
    X : ndarray
        Feature values, indexed by row with the fold index arrays.
    t : ndarray
        Target values, indexed the same way as ``X``.
    arg : int
        Basis selector forwarded unchanged to ``getRMSE``.

    Returns
    -------
    float
        Sum of the per-fold RMSE values divided by ``splits``.
    """
    folds = KFold(n_splits=splits)

    total = 0
    for fit_idx, held_idx in folds.split(X):
        # Fit on the rows in fit_idx, score on the held-out rows.
        total += getRMSE(X[fit_idx], X[held_idx], t[fit_idx], t[held_idx], arg)

    return total / splits

In [24]:

def main(X_train, X_valid, t_train, t_valid, K, arg, n_folds=5):
    """Greedy forward feature selection scored by cross-validated RMSE.

    Runs ``K`` selection steps. At each step every not-yet-selected column
    among the first ``K`` columns of ``X_train`` is tentatively added to the
    already selected columns and scored with ``kFold``; the column with the
    lowest cross-validation error is kept. The model built from the selected
    columns is then scored on the held-out validation set with ``getRMSE``.

    Parameters
    ----------
    X_train, X_valid : ndarray
        2-D feature matrices (rows x features) with at least ``K`` columns.
    t_train, t_valid : ndarray
        Target values for the corresponding rows.
    K : int
        Number of selection steps, and also the number of leading columns of
        ``X_train`` considered as candidates.
    arg : int
        Basis selector forwarded unchanged to ``kFold`` and ``getRMSE``.
    n_folds : int, optional
        Number of cross-validation folds used to score candidates (default 5).

    Returns
    -------
    kfError : ndarray, shape (K, 1)
        Cross-validation error of the winning candidate at each step.
    tError : ndarray, shape (K, 1)
        Validation-set RMSE of the selected columns after each step.
    """
    kfError = np.zeros((K, 1))
    tError = np.zeros((K, 1))

    remaining = np.arange(K)   # candidate column indices not yet selected
    selected = []              # chosen column indices, in selection order

    for step in range(K):
        # Start every slot at +inf so argmin can only land on a column that
        # was actually scored in this step.
        error = np.full(K, np.inf)
        for j in remaining:
            trial_cols = selected + [j]
            error[j] = kFold(n_folds, X_train[:, trial_cols], t_train, arg)

        best = int(np.argmin(error))
        selected.append(best)
        remaining = remaining[remaining != best]

        kfError[step] = error[best]
        tError[step] = getRMSE(
            X_train[:, selected], X_valid[:, selected], t_train, t_valid, arg
        )

    return kfError, tError

# One selection step per feature column, so every column of X_train is ranked.
# Derived from the data instead of hard-coding the column count.
K = X_train.shape[1]
main(X_train, X_valid, t_train, t_valid, K, 0)


(array([[5.79457118],
        [5.32039862],
        [5.02037302],
        [4.88454129],
        [4.78847823],
        [4.71784658],
        [4.68841444],
        [4.65304816],
        [4.62480233],
        [4.55233397],
        [4.55831096],
        [4.5665048 ],
        [4.59907204]]),
 array([[7.5875242 ],
        [6.43444041],
        [5.99092801],
        [6.11800996],
        [5.99629504],
        [5.81357015],
        [5.92447036],
        [5.9990489 ],
        [5.94718833],
        [5.91246905],
        [5.81772645],
        [5.81980501],
        [5.866342  ]]))