In [6]:
%%HTML
## Get Started

<p>Let's import a few packages that you will need. You will work with the <a href="https://archive.ics.uci.edu/ml/datasets/Ionosphere">ION</a> dataset for this project.</p> 

In [7]:
import numpy as np
from pylab import *
from numpy.matlib import repmat
import matplotlib.pyplot as plt
from scipy.io import loadmat
import time

%matplotlib notebook

from helper import *

print('You\'re running python %s' % sys.version.split(' ')[0])

You're running python 3.8.0


In [8]:
data = loadmat("ion.mat")
xTr  = data['xTr'].T
yTr  = data['yTr'].flatten()
xTe  = data['xTe'].T
yTe  = data['yTe'].flatten()

FileNotFoundError: [Errno 2] No such file or directory: 'ion.mat'

In [None]:
# Create a regression tree with no restriction on its depth. 
# This is equivalent to what you implemented in the previous project
# if you want to create a tree of max depth k
# then call h.RegressionTree(depth=k)
tree = RegressionTree(depth=np.inf)

# To fit/train the regression tree
tree.fit(xTr, yTr)

# To use the trained regression tree to make prediction
pred = tree.predict(xTr)

In [2]:
We have also created a square loss function that takes in the prediction <code>pred</code> and ground truth <code>truth</code> and returns the average square loss between prediction and ground truth. 

SyntaxError: invalid syntax (<ipython-input-2-a0eb7084ff46>, line 1)

In [None]:
def square_loss(pred, truth):
    return np.mean((pred - truth)**2)

In [None]:
Now, look at the performance of your tree on both the training set and test set using the code cell below.

In [None]:
print('Training Loss: {:.4f}'.format(square_loss(tree.predict(xTr), yTr)))
print('Test Loss: {:.4f}'.format(square_loss(tree.predict(xTe), yTe)))

In [3]:
## Implement Cross Validation
%%HTML
<p1>As you can see, your tree achives zero training loss on the training set but not zero test loss. Clearly, the model is overfitting! To reduce overfitting, you need to control the depth of the tree. One way to pick the optimal depth is to do kFold Cross Validation. To do so, you will first implement <code>grid_search</code>, which finds the best depths given a training set and validation set. Then you will implement <code>generate_kFold</code>, which generates the split for kFold cross validation. Finally, you will combine the two functions to implement <code>cross_validation</code></p1>.

SyntaxError: invalid syntax (<ipython-input-3-542c193e9d17>, line 3)

In [3]:
%%HTML
<h3>Part One: Implement <code>grid_search</code>[Graded]</h3>

Implement the function <code>grid_search</code>, which takes in a training set <code>xTr, yTr</code>, a validation set <code>xVal, yVal</code> and a list of tree depth candidates <code>depths</code>. Your job here is to fit a regression tree for each depth candidate on the training set <code>xTr, yTr</code>, evaluate the fitted tree on the validation set <code>xVal, yVal</code> and then pick the candidate that yields the lowest loss for the validation set. Note: in the event of a tie, return the smallest depth candidate.

In [None]:
def grid_search(xTr, yTr, xVal, yVal, depths):
    '''
    Input:
        xTr: nxd matrix
        yTr: n vector
        xVal: mxd matrix
        yVal: m vector
        depths: a list of len k
    Return:
        best_depth: the depth that yields that lowest loss on the validation set
        training losses: a list of len k. the i-th entry corresponds to the the training loss
                the tree of depths[i]
        validation_losses: a list of len k. the i-th entry corresponds to the the validation loss
                the tree of depths[i]
    '''
    training_losses = []
    validation_losses = []
    best_depth = None
    
    # YOUR CODE HERE
    for i in depths:
        treeGrid = RegressionTree(depth=i)
        treeGrid.fit(xTr,yTr)
        predTr = treeGrid.predict(xTr)
        predVal= treeGrid.predict(xVal)
        training_losses.append(square_loss(predTr,yTr)) #square_loss(predTr,yTr)
        validation_losses.append(square_loss(predVal,yVal))
    
    #print('yTr: ', yTr.shape)
    
    #print('traininglossmin: ', training_losses.index(min(training_losses)))
    #print('validationlossmin: ', validation_losses.index(min(validation_losses)))
    #print('fit: ', fit)
    
    best_depth = depths[validation_losses.index(min(validation_losses))]
    
    #print('best: ', best_depth)
    return best_depth, training_losses, validation_losses

#depths = [1,2,3,4,5]
#k = len(depths)
#best_depth, training_losses, validation_losses = grid_search(xTr, yTr, xTe, yTe, depths)

In [None]:
# The following tests check that your implementation of grid search returns the correct number of training and validation loss values and the correct best depth

depths = [1,2,3,4,5]
k = len(depths)
best_depth, training_losses, validation_losses = grid_search(xTr, yTr, xTe, yTe, depths)
best_depth_grader, training_losses_grader, validation_losses_grader = grid_search_grader(xTr, yTr, xTe, yTe, depths)

# Check the length of the training loss
def grid_search_test1():
    return (len(training_losses) == k) 

# Check the length of the validation loss
def grid_search_test2():
    return (len(validation_losses) == k)

# Check the argmin
def grid_search_test3():
    return (best_depth == depths[np.argmin(validation_losses)])

def grid_search_test4():
    return (best_depth == best_depth_grader)

def grid_search_test5():
    return np.linalg.norm(np.array(training_losses) - np.array(training_losses_grader)) < 1e-7

def grid_search_test6():
    return np.linalg.norm(np.array(validation_losses) - np.array(validation_losses_grader)) < 1e-7

runtest(grid_search_test1, 'grid_search_test1')
runtest(grid_search_test2, 'grid_search_test2')
runtest(grid_search_test3, 'grid_search_test3')
runtest(grid_search_test4, 'grid_search_test4')
runtest(grid_search_test5, 'grid_search_test5')
runtest(grid_search_test6, 'grid_search_test6')

In [5]:
%%HTML
<h3>Part Two: Implement <code>generate_kFold</code>[Graded]</h3>

Now, implement the <code>generate_kFold</code> function, which takes in the number of training examples <code>n</code> and the number of folds <code>k</code> and returns a list of <code>k</code> folds where each fold takes the form <code>(training indices, validation indices)</code>.

For instance, if n = 3 and k = 3, then we have three indices <code>[0,1,2]</code> and we are trying to split it k=3 times to obtain different training/validation splits. 
One possible output of the the function is <code>[([0, 1], [2]), ([1, 2], [0]), ([0, 2], [1])]</code> 

In [None]:
def generate_kFold(n, k):
    '''
    Input:
        n: number of training examples
        k: number of folds
    Returns:
        kfold_indices: a list of len k. Each entry takes the form
        (training indices, validation indices)
    '''
    assert k >= 2
    kfold_indices = []
    
    # YOUR CODE HERE
    for i in range(k):
        instances = np.arange(n)
        data= np.random.permutation(instances)
        kfold_indices.append((data[int((n/k)):],data[:int((n/k))]))
        
        
        
    t = [((len(train_indices) , len(test_indices))) 
        for (train_indices, test_indices) in kfold_indices]

    ratio_test = []
    for (train_indices, validation_indices) in kfold_indices:
        ratio = len(validation_indices) / len(train_indices)
        #print('ratio: ', ratio)
        ratio_test.append((ratio > 0.24 and ratio < 0.26))
    
    train_indices_set = set() # to keep track of training indices for each fold
    validation_indices_set = set() # to keep track of validation indices for each fold
    for (train_indices, validation_indices) in kfold_indices:
        train_indices_set = train_indices_set.union(set(train_indices))
        validation_indices_set = validation_indices_set.union(set(validation_indices))
        
        
    #print('train_indices_set: ', train_indices_set)
    #print('val_idnices_set:' , validation_indices_set)
    #print('ratio_test: ', ratio_test)
    #print("t: ", t)
    #print('lenTrain: ', len(train_indices))
    #print('kfolds: ', kfold_indices[0])
    
    #assert train_indices_set == set(np.arange(1004)) 
    #assert validation_indices_set == set(np.arange(1004))
    #return None
    return kfold_indices

#generate_kFold(1004,5)
#runtest(generate_kFold_test2, 'generate_kFold_test2')

In [None]:
# The following tests check that your generate_kFold function 
# returns the correct number of total indices, 
# train and test indices, and the correct ratio

kfold_indices = generate_kFold(1004, 5)

def generate_kFold_test1():
    return len(kfold_indices) == 5 # you should gener   ate 5 folds

def generate_kFold_test2():
    t = [((len(train_indices) + len(test_indices)) == 1004) 
         for (train_indices, test_indices) in kfold_indices]
    return np.all(t) # make sure that both for each fold, the number of examples sum up to 1004

def generate_kFold_test3():
    ratio_test = []
    for (train_indices, validation_indices) in kfold_indices:
        ratio = len(validation_indices) / len(train_indices)
        ratio_test.append((ratio > 0.24 and ratio < 0.26))
    # make sure that both for each fold, the training to validation 
    # examples ratio is in between 0.24 and 0.25
    return np.all(ratio_test) 

def generate_kFold_test4():
    train_indices_set = set() # to keep track of training indices for each fold
    validation_indices_set = set() # to keep track of validation indices for each fold
    for (train_indices, validation_indices) in kfold_indices:
        train_indices_set = train_indices_set.union(set(train_indices))
        validation_indices_set = validation_indices_set.union(set(validation_indices))
    
    # Make sure that you use all the examples in all the training fold and validation fold
    return train_indices_set == set(np.arange(1004)) and validation_indices_set == set(np.arange(1004))


runtest(generate_kFold_test1, 'generate_kFold_test1')
runtest(generate_kFold_test2, 'generate_kFold_test2')
runtest(generate_kFold_test3, 'generate_kFold_test3')
runtest(generate_kFold_test4, 'generate_kFold_test4')

In [4]:
%%HTML

<h3>Part Three: Implement <code>cross_validation</code>[Graded]</h3>

Use <code>grid_search</code> to implement the <code>cross_validation</code> function that takes in the training set <code>xTr, yTr</code>, a list of depth candidates <code>depths</code> and performs K-fold cross validation on the training set. We use <code>generate_kFold</code> to generate the K training/validation split. Using <code>indices</code>, the function will do a grid search  on each fold and return the parameter that yields the best average validation loss across the folds. Note that in event of tie, the function should return the smallest depth candidate.

In [None]:
def cross_validation(xTr, yTr, depths, indices):
    '''
    Input:
        xTr: nxd matrix (training data)
        yTr: n vector (training data)
        depths: a list (of length l) depths to be tried out
        indices: indices from generate_kFold
    Returns:
        best_depth: the best parameter 
        training losses: a list of lenth l. the i-th entry corresponds to the the average training loss
                the tree of depths[i]
        validation_losses: a list of length l. the i-th entry corresponds to the the average validation loss
                the tree of depths[i] 
    '''
    training_losses = []
    validation_losses = []
    best_depth = None
    
    # YOUR CODE HERE
    '''
    for i in range(k):
        
    
        xValList = [xTr[i] for i in indices[i-1][0]]
        yValList = [yTr[i] for i in indices[i-1][0]]
        xVal = np.asarray(xValList)
        yVal = np.asarray(yValList)
        #yVal = yTr[indices[1]]
    
    
        best_depth, training_losses, validation_losses = grid_search(xTr, yTr,xVal, yVal, depths)
    
    #print('len xVal: ', len(xVal))
    #print('len yVal: ', len(yVal))
    #use the values of the indices to create your validation and training data set splits. Then pass them to the grid_search function
    
    #print('indices: ', len(indices[0][1]))
    #print('xVal: ', xValList)
    #print('yVal: ', yVal)
    print('best_depth:', best_depth)
    print('best_grader:' , best_depth_grader)
    print('training_losses', training_losses)
    print('training_grader: ', training_losses_grader)
    
    
    #return None
    '''
    
    for i,j in indices:
        xtrain, ytrain = xTr[i], yTr[i]
        xVal, yVal = xTr[j], yTr[j]
    
        #xValList = [xTr[i] for i in indices[i-1][0]]
        #yValList = [yTr[i] for i in indices[i-1][0]]
        #xVal = np.asarray(xValList)
        #yVal = np.asarray(yValList)
        #yVal = yTr[indices[1]]
    
    
        best_depth, training_losses, validation_losses = grid_search(xtrain, ytrain,xVal, yVal, depths)
    
        #print('len xVal: ', len(xVal))
        #print('len yVal: ', len(yVal))
        #use the values of the indices to create your validation and training data set splits. Then pass them to the grid_search function
    
        #print('indices: ', len(indices[0][1]))
        #print('xVal: ', xValList)
        #print('yVal: ', yVal)
        print('best_depth:', best_depth)
        print('best_grader:' , best_depth_grader)
        print('training_losses', training_losses)
        print('training_grader: ', training_losses_grader)
    
    #training_losses = np.mean(training_losses)
    #validation_losses = np.mean(validation_losses)
    
    training_losses = np.mean(training_losses, axis=0)
    validation_losses = np.mean(validation_losses, axis=0)
    best_depth = depths[np.argmin(validation_losses)]
    
    #return None
    
    return best_depth, training_losses, validation_losses

#depths = [1,2,3,4]
#k = len(depths)

# generate indices
# the same indices will be used to cross check your solution and ours
#indices = generate_kFold(len(xTr), 5)
#best_depth, training_losses, validation_losses = cross_validation(xTr, yTr, depths, indices)

In [None]:
# The following tests check that your implementation of cross_validation returns the correct number of training and validation losses, the correct "best depth" and the correct values for training and validation loss

depths = [1,2,3,4]
k = len(depths)

# generate indices
# the same indices will be used to cross check your solution and ours
indices = generate_kFold(len(xTr), 5)
best_depth, training_losses, validation_losses = cross_validation(xTr, yTr, depths, indices)
best_depth_grader, training_losses_grader, validation_losses_grader = cross_validation_grader(xTr, yTr, depths, indices)

# Check the length of the training loss
def cross_validation_test1():
    return (len(training_losses) == k) 

# Check the length of the validation loss
def cross_validation_test2():
    return (len(validation_losses) == k)

# Check the argmin
def cross_validation_test3():
    return (best_depth == depths[np.argmin(validation_losses)])

def cross_validation_test4():
    return (best_depth == best_depth_grader)

def cross_validation_test5():
    return np.linalg.norm(np.array(training_losses) - np.array(training_losses_grader)) < 1e-7

def cross_validation_test6():
    return np.linalg.norm(np.array(validation_losses) - np.array(validation_losses_grader)) < 1e-7

runtest(cross_validation_test1, 'cross_validation_test1')
runtest(cross_validation_test2, 'cross_validation_test2')
runtest(cross_validation_test3, 'cross_validation_test3')
runtest(cross_validation_test4, 'cross_validation_test4')
runtest(cross_validation_test5, 'cross_validation_test5')
runtest(cross_validation_test6, 'cross_validation_test6')