In [None]:
import numpy as np

np.set_printoptions(formatter={'float_kind':'{:f}'.format})
NANVAL = -998
TRAIN = './data/train.csv'
TEST = './data/test.csv'

# Nouvelle section

In [None]:
def load_csv_data(data_path, sub_sample=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1
    
    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

In [None]:
y_train, tx_train, ids_train = load_csv_data(TRAIN)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ids = x[:, 0].astype(np.int)


In [None]:
tx_train.shape

(250000, 30)

In [None]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x, mean_x, std_x


In [None]:
def preprocessing(y, tx, strategy, std=True, outliers = False):
    #TODO : outliers
    res_x = tx
    res_y = y
    res_x = np.where(res_x < NANVAL, np.NaN, res_x)
    
    
    indices = np.where(np.isnan(res_x))
    if strategy==0:
      # Replace with mean
      means = np.nanmean(res_x, axis=0)
      res_x[indices] = np.take(means, indices[1]) 
    elif strategy==1:
      # Replace with median
      medians = np.nanmedian(res_x, axis=0)
      res_x[indices] = np.take(medians, indices[1])
    elif strategy==2:
      # Remove the NaN
      rows_with_nan = ~np.isnan(res_x).any(axis=1)
      res_y, res_x = res_y[rows_with_nan], res_x[rows_with_nan]
    elif strategy==3:
      # Remove columns with NaN
      columns_with_nan = ~np.isnan(res_x).any(axis=0)
      res_x = res_x[:,columns_with_nan]
    elif strategy==4:
      # Replace with 0
      res_x = np.nan_to_num(res_x)
    if outliers : 
      #TODO remove outliers
      pass
    if std: 
      res_x, _, _ = standardize(res_x)
    return res_y, res_x

In [None]:
y, x = preprocessing(y_train, tx_train, 4) # Replaces by 0
x

array([[1.224434, 0.193880, 0.741973, ..., -0.404581, -0.448681,
        0.927987],
       [1.491133, 0.397023, 0.806170, ..., -0.419301, -0.419301,
        0.129434],
       [-0.419301, 1.505793, 1.075848, ..., -0.419301, -0.419301,
        0.105989],
       ...,
       [0.832547, 0.299185, 0.480961, ..., -0.419301, -0.419301,
        0.079173],
       [0.707833, -0.189460, 0.397545, ..., -0.419301, -0.419301,
        -0.419301],
       [-0.419301, 0.444363, 0.421512, ..., -0.419301, -0.419301,
        -0.419301]])

In [None]:
y, x = preprocessing(y_train, tx_train,3) #Removes columns with NaN
x.shape

(250000, 19)

In [None]:
y, x = preprocessing(y_train, tx_train,2) #Removes NaN
x

array([[0.565378, -0.081980, 0.262313, ..., -0.457912, -0.485614,
        0.379160],
       [0.202040, -0.366120, -0.026099, ..., -0.465488, -0.443998,
        0.976916],
       [0.642063, -0.251942, 0.336545, ..., -0.466182, -0.487791,
        0.874140],
       ...,
       [0.473518, -0.250741, 0.243157, ..., -0.471625, -0.485987,
        0.972733],
       [0.502778, -0.437943, 0.030811, ..., -0.462849, -0.483675,
        3.604720],
       [1.151106, -0.115528, 0.001304, ..., -0.467591, -0.475637,
        0.831629]])

In [None]:
y, x = preprocessing(y_train, tx_train, 1) #Replaces with median
x


array([[1.073985, 0.094295, 0.615337, ..., -0.474629, -0.516552,
        0.792170],
       [1.327521, 0.287411, 0.676365, ..., -0.488735, -0.488645,
        0.033029],
       [0.779858, 1.341458, 0.932733, ..., -0.488735, -0.488645,
        0.010742],
       ...,
       [0.701440, 0.194402, 0.367206, ..., -0.488735, -0.488645,
        -0.014751],
       [0.582882, -0.270126, 0.287908, ..., -0.488735, -0.488645,
        -0.488622],
       [0.779858, 0.332415, 0.310692, ..., -0.488735, -0.488645,
        -0.488622]])

In [12]:
y, x = preprocessing(y_train, tx_train, 0) #Replaces with mean
x[-1]

array([0.759992, 0.259607, 0.239990, -0.405605, -0.457325, 3.306876,
       -0.490194, -0.461184, -0.405605, 0.366446, -0.472282, -0.496199,
       -0.477150, -0.043594, -0.464649, -0.454785, -0.071791, -0.466636,
       -0.475329, -0.066767, -0.498084, 0.531177, -0.481820, 0.382568,
       -0.481854, -0.481946, 0.105968, -0.481941, -0.481836, -0.481820])

In [13]:
#implementations.py
# /!\ Most of it is copied from the solutions
def least_squares(y, tx):
    """calculate the least squares."""
    a = tx.T.dot(tx)
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)
def ridge_regression(y, tx, lamb):
    """rige regression L2."""
    aI = lamb * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)

def sigmoid(x):
        return 1./ (1. + np.exp(-x))

def regression_loss(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx.dot(w))
    loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return np.squeeze(- loss)

def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    sigmoids = sigmoid(tx.dot(w)) # N*1
    return tx.T.dot((sigmoids - y))

def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.
    """
    #loss = self.regression_loss(y,tx)
    grad = calculate_gradient(y, tx, w)
    w = w - gamma * grad
    return w


#Functions we might not use
def split_data_equally(x, y, ratio, seed = 1):
    """
      Preserves the distribution
    """
    ind = np.arange(len(y))
    classes = set(y)
    indices_for_each_class = []
    train = np.empty(0, dtype=int)
    test = np.empty(0, dtype=int)
    for cl in classes :
        indices_for_each_class.append(ind[y==cl])
    for indices in indices_for_each_class:
        np.random.seed(seed)
        np.random.shuffle(indices)
        cut_ind = int(ratio * len(indices))
        train = np.hstack((train, indices[:cut_ind]))
        test = np.hstack((test, indices[cut_ind:]))
    print(train.shape)
    print(test.shape)
    return x[train], x[test], y[train], y[test]

# probably won't use either
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

In [14]:
# Classifier.py
class Classifier:
    """ 
    Abstract class to represent a classifier
    """
    def __init__(self):
        """ 
            Sets the parameters
        """
        raise NotImplementedError("Please Implement this method")
        
    def train(self, tx_train, y_train):
        """ 
            Learns a w.
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """        
        raise NotImplementedError("Please Implement this method")
    
    def predict(self, tx):
        """ 
            Returns a list of predictions. 
            For linear classifiers with classes {-1,1}, it just returns sign(self.score(x))
            Argument:
                - tx : N*D dimension
            Returns : 
                List[int] of size N
        """
        raise NotImplementedError("Please Implement this method")

    def accuracy(self, predictions, y):
        """
            Computes the accuracy for a list of predictions.
            It counts the number of good predictions and divides it by the number of samples.
            Arguments :
                - predictions : List of size N
                - y : List of size N
            Returns
                float : the accuracy

        """
        print(predictions)
        print(y)
        return np.sum(predictions==y) / len(y)

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : 
                {
                    'lambda_' : 0.01,
                    'n_iterations' : 10000,
                    'gamma' = 0.2
                }
            
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        raise NotImplementedError("Please Implement this method")


In [15]:

class ClassifierRidgeRegression(Classifier):

    def __init__(self, lambda_):
        """ 
            Sets the parameters
            Argument:
                - lambda_ : float parameter for the ridge regression
        """
        self.lambda_ = lambda_

    def train(self, y_train, tx_train):
        """ 
            Trains the model. It learns a w with ridge regression.
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = ridge_regression(y_train, tx_train, self.lambda_)         

    def predict(self, tx):
        """ 
            Returns a list of predictions 
            Argument:
                - tx : N*D dimension
            Returns : 
                List[int] of size N
        """
        return np.sign(tx.dot(self.w))
        
    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : 
                {
                    'lambda_' : 0.01,
                }
            
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['lambda'] = self.lambda_
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierRidgeRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res



In [16]:
class ClassifierLeastSquares(Classifier):

    def __init__(self):
        """ 
            Does not have any parameters to set.
        """
        pass

    def train(self, y_train, tx_train):
        """ Trains the model. Learns a w with Least Squares. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = least_squares(y_train, tx_train)         
        
    def predict(self, x):
        """ 
            Returns a list of predictions.
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        return np.sign(x.dot(self.w))

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {}
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierLeastSquares' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res
    

In [17]:
class ClassifierLogisticRegression(Classifier):
    """ 
    Abstract class to represent a classifier
    """
    def __init__(self, gamma=0.01, n_iterations = 100):
        """ 
            Sets parameters for logistic regression
            Argument:
                - gamma (float)
                - n_iterations (int)
        """
        self.gamma = gamma
        self.n_iterations = n_iterations

    def train(self, y_train, tx_train):
        """ 
            Trains the model. It learns a new w with logistic regression. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = np.empty(tx_train.shape[1])
        for _ in range(self.n_iterations):
            self.w = learning_by_gradient_descent(y_train, tx_train, self.w, self.gamma)
    
    
    def predict(self, x):
        """ 
            returns a list of predictions
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        pred = sigmoid(x.dot(self.w)) 
        pred = np.asarray([0 if k < 0.5 else 1 for k in pred])
        return pred

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {}
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['n_iterations'] = self.n_iterations
        params['gamma'] = self.gamma
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierLogisticRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res

In [18]:
class ClassifierRandomRidgeRegression(Classifier):

    def __init__(self, n_classifier, lambda_, features_per_classifier, degree, use_centroids=True, initial_number_of_features = 30):
        self.lambda_= lambda_
        self.n_classifier = n_classifier
        self.initial_number_of_features = initial_number_of_features
        self.features_per_classifier = features_per_classifier
        self.degree = degree
        self.clf = []
        self.features = [] # Each classifier will have random features. We choose them in the train function. Then we need them for our predictions.
        self.use_centroids = use_centroids

        for i in range(n_classifier):
            self.clf.append(ClassifierRidgeRegression(lambda_))

    def train(self, y_train, tx_train):
        """ Trains the model. Learns a w with Least Squares. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        # CAN ONLY BE USED WITH THE CURRENT VERSION OF BUILD_POLY
        
        
        
        #np.random.seed(seed)
        
        for cl in self.clf:
            perm = np.random.permutation(self.initial_number_of_features) # shuffle [0..32]
            perm = perm[:self.features_per_classifier] # Takes sqrt first elements
            features = [self.degree*k+1+j for k in perm for j in range(self.degree)]
            if self.use_centroids:
                features.append(tx_train.shape[1]-1)
                features.append(tx_train.shape[1]-2)
            self.features.append(features)
            tx = tx_train[:,features]
            cl.train(y_train, tx)        
        
    def predict(self, x):
        """ 
            Returns a list of predictions.
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        preds = np.empty(x.shape[0])

        for index, cl in enumerate(self.clf) :
            features = self.features[index]
            tx = x[:,features]
            preds = np.vstack((preds,cl.predict(tx)))
        preds = preds.mean(axis = 0)
        preds = np.sign(preds)
        return preds

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {
                    'lambda_' : 0.01,
                    'n_classifier' : 100,
                    'features_per_classifier' : 6,
                    'degree' : 7,
                    'use_centroids' : True
                }
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['lambda_'] = self.lambda_
        params['n_classifier'] = self.n_classifier
        params['features_per_classifier'] = self.features_per_classifier
        params['degree'] = self.degree
        params['use_centroids'] = self.use_centroids
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierRandomRidgeRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        print(res)
        return res

In [19]:
# Don't uncomment these lines, we don't have a tx_test.
# HOW TO USE : 
# lambda_ = 0.01
# clf = ClassifierRidgeRegression(lambda_)
# clf.train(y_train, tx_train)
# clf.get_params_and_results(tx_train, tx_test, y_train, y_test)


In [20]:
def build_centroids(y, x):
    res = []
    for cl in set(y):
      res.append(np.mean(x[y==cl], axis = 0))
    return res
# [centroid class -1, centroid class 1]

In [21]:
def kernel(x, centroid):
    return np.exp(-np.linalg.norm(x-centroid, axis = 1)**2)

In [22]:
def build_poly(x, degree, functions, centroids):
    """polynomial basis functions for input data x, for j=0 up to j=degree.
        also applies functions
    """
    poly = np.ones((len(x), 1))

    for i in range(x.shape[1]):
        for deg in range(1, degree+1):
            poly = np.c_[poly, np.power(x[:, i], deg)]
        for f in functions:
            poly = np.c_[poly, f(x[:, i])]
    for c in centroids:
        poly = np.c_[poly, kernel(x,c)]

    return poly

In [23]:
#centroides = build_centroids(y,x)
#centroides

In [24]:
#tmp = build_poly(x,7,[], centroides) # This function takes 40 seconds. We can't put it inside a classifier. It would take forever.
#tmp.shape

In [25]:
#kernel(x, centroides[0])

In [26]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    # TODO : the same function but preserving the distribution
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

In [27]:
def cross_validation(y, x, k_indices, k, classifier):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    degree = 4
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly(x_tr, degree, [], centroids) 
    tx_te = build_poly(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)

In [28]:
import copy
def cross_validation_demo(y, x, clf, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        res.append(cross_validation(y, x, k_indices, k, copy.deepcopy(clf)))
    return res

In [29]:
y_random, x_random = preprocessing(y_train, tx_train, 1) 


n_classifier = 100
lambda_ = 0.1
initial_number_of_features = 30
#features_per_classifier = int(np.sqrt(initial_number_of_features)) + 1
features_per_classifier = 15
degree = 4 # needs to be the same as we use in cross validation
use_centroids = True
clf = ClassifierRandomRidgeRegression(n_classifier, lambda_, features_per_classifier, degree, use_centroids, initial_number_of_features)
res = cross_validation_demo(y_random, x_random, clf)
res

KeyboardInterrupt: 

In [None]:
# Ridge Regression
lambda_ = 0.01
clf = ClassifierRidgeRegression(lambda_) # shouldn't take the input dimension as an argument
res = cross_validation_demo(y, x, clf)
res
# It takes almost 3 minutes to run because we perform build_poly 4 times on the whole dataset.
# We could fix it by calling build_poly before splitting the data however there would be a problem with the centroids
# as they would be the centroids of the whole dataset instead of the sub_dataset.

# Another way to fix this is to split build_poly into 2 functions. poly(tx, degree) which would extend the features (x1^2, x1^3, x2^2, x^3) and centroids(tx) that defines 
# the centroids for each label(-1 or 1) and calculates the distance between each centroid and each x_n (exp(-||x_n-centroid_i||²)).
# We would call poly(tx, degree) inside cross_validation_demo and centroids


train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 1.000000

[{'name': 'ClassifierRidgeRegression',
  'accuracy_train': 0.7794133333333333,
  'accuracy_test': 0.779904,
  'params': {'lambda': 0.01}},
 {'name': 'ClassifierRidgeRegression',
  'accuracy_train': 0.7808853333333333,
  'accuracy_test': 0.77808,
  'params': {'lambda': 0.01}},
 {'name': 'ClassifierRidgeRegression',
  'accuracy_train': 0.7797333333333333,
  'accuracy_test': 0.781072,
  'params': {'lambda': 0.01}},
 {'name': 'ClassifierRidgeRegression',
  'accuracy_train': 0.779504,
  'accuracy_test': 0.779664,
  'params': {'lambda': 0.01}}]

In [None]:
lambda_ = 0.01
clf = ClassifierLeastSquares() # shouldn't take the input dimension as an argument
res = cross_validation_demo(y, x, clf)
res

train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 1.000000]
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 1.000000

[{'name': 'ClassifierLeastSquares',
  'accuracy_train': 0.7857973333333333,
  'accuracy_test': 0.786736,
  'params': {}},
 {'name': 'ClassifierLeastSquares',
  'accuracy_train': 0.7869866666666666,
  'accuracy_test': 0.784464,
  'params': {}},
 {'name': 'ClassifierLeastSquares',
  'accuracy_train': 0.7866506666666667,
  'accuracy_test': 0.785952,
  'params': {}},
 {'name': 'ClassifierLeastSquares',
  'accuracy_train': 0.7855466666666666,
  'accuracy_test': 0.787616,
  'params': {}}]

In [None]:
gamma = 0.1
n_iterations = 10 
clf = ClassifierLogisticRegression(gamma, n_iterations)
formated_y = np.where(y<1, 0, y) # Logistic Regression doesn't work with labels {-1, 1} but only {0, 1}
res = cross_validation_demo(formated_y, x, clf)
res # We get better results when we don't use feature expansion.


train start


  return 1./ (1. + np.exp(-x))


train over
[0 0 0 ... 0 0 0]
[0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000]
[0 0 0 ... 0 0 0]
[1.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000]


KeyboardInterrupt: 

As you can see, we get bad results. I think it is because of the build_poly function.
Have a look at the results with a build_poly of degree 1.

In [36]:
def cross_validation_logistic(y, x, k_indices, k, classifier, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly(x_tr, degree, [], centroids) 
    tx_te = build_poly(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)
def cross_validation_demo_logistic(y, x, clf, degree, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        res.append(cross_validation_logistic(y, x, k_indices, k, copy.deepcopy(clf), degree))
    return res


In [38]:
y2, x2 = preprocessing(y_train, tx_train, 2, std=False) # Fill the NaN with 0, maybe we should try without standardization
y2 = np.where(y2<1, 0, y2) # Logistic Regression doesn't work with labels {-1, 1} but only {0, 1}

gamma = 0.1
n_iterations = 100
degree = 1
clf = ClassifierLogisticRegression(gamma, n_iterations)
res = cross_validation_demo_logistic(y2, x2, clf, degree)
res # We get better results when we don't use feature expansion.


train start


  return 1./ (1. + np.exp(-x))


train over
[0 0 0 ... 0 0 0]
[0.000000 1.000000 0.000000 ... 1.000000 0.000000 0.000000]
[0 0 0 ... 0 0 1]
[0.000000 1.000000 0.000000 ... 1.000000 0.000000 1.000000]
train start
train over
[0 0 0 ... 0 0 0]
[0.000000 1.000000 0.000000 ... 1.000000 0.000000 0.000000]
[0 0 0 ... 0 0 1]
[0.000000 1.000000 0.000000 ... 0.000000 0.000000 1.000000]
train start
train over
[0 0 0 ... 0 0 0]
[0.000000 1.000000 0.000000 ... 1.000000 0.000000 0.000000]
[0 0 0 ... 0 0 0]
[0.000000 0.000000 1.000000 ... 0.000000 0.000000 1.000000]
train start
train over
[0 0 0 ... 0 0 0]
[0.000000 1.000000 0.000000 ... 0.000000 0.000000 1.000000]
[1 1 0 ... 0 0 0]
[1.000000 1.000000 1.000000 ... 1.000000 0.000000 0.000000]


[{'name': 'ClassifierLogisticRegression',
  'accuracy_train': 0.606882781301386,
  'accuracy_test': 0.6102302090674183,
  'params': {'n_iterations': 100, 'gamma': 0.1}},
 {'name': 'ClassifierLogisticRegression',
  'accuracy_train': 0.6149087776994754,
  'accuracy_test': 0.6129903688043223,
  'params': {'n_iterations': 100, 'gamma': 0.1}},
 {'name': 'ClassifierLogisticRegression',
  'accuracy_train': 0.6175710594315246,
  'accuracy_test': 0.6191566831101715,
  'params': {'n_iterations': 100, 'gamma': 0.1}},
 {'name': 'ClassifierLogisticRegression',
  'accuracy_train': 0.6206444287839636,
  'accuracy_test': 0.61892177589852,
  'params': {'n_iterations': 100, 'gamma': 0.1}}]

In [41]:
def build_poly_interaction(x, degree, functions, centroids):
    """polynomial basis functions for input data x, for j=0 up to j=degree.
        also applies functions
    """
    poly = np.ones((len(x), 1))
    for i in range(x.shape[1]):
        for deg in range(1, degree+1):
            poly = np.c_[poly, np.power(x[:, i], deg)]
        for f in functions:
            poly = np.c_[poly, f(x[:, i])]
        for j in range(i+1, x.shape[1]):
            poly = np.c_[poly, x[:,i] * x[:,j]]
            
    for c in centroids:
        poly = np.c_[poly, kernel(x,c)]

    return poly

def cross_validation_interaction(y, x, k_indices, k, classifier):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    degree = 7
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly_interaction(x_tr, degree, [], centroids) 
    tx_te = build_poly_interaction(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)

def cross_validation_demo_interaction(y, x, clf, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        tmp = cross_validation_interaction(y, x, k_indices, k, copy.deepcopy(clf))
        print(tmp)
        res.append(tmp)
    return res

In [42]:
lambda_ = 0.01
clf = ClassifierRidgeRegression(lambda_) # shouldn't take the input dimension as an argument
res = cross_validation_demo_interaction(y, x, clf)
res

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
train start
train over
[-1.000000 -1.000000 -1.000000 ... -1.000000 1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000]
[-1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
[1.000000 -1.000000 -1.000000 ... 1.000000 -1.000000 -1.000000]
{'name': 'ClassifierRidgeRegression', 'accuracy_train': 0.8142933333333333, 'accuracy_test': 0.81176, 'params': {'lambda': 0.01}}
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370


KeyboardInterrupt: 