# 📥 Making predictions / classifying

In [24]:
import numpy as np

import sys
sys.path.insert(0, '../scripts') 

from proj1_helpers import *         # Not necessary to copy function for loading CSV data anymore
from preprocessing_helpers import * # All function related to preprocessing are now in this helper script 
                                    # (in the scripts-directory)

## (1) Read in the pre-processed data written by Preprocessing.ipynb 

In [25]:
TRAIN = '../data/train.csv' # due to directory structure, the data directory is now one directory above this one
TEST = '../data/test.csv'

In [26]:
y_train, tx_train, ids_train = load_csv_data(TRAIN)

## (2) Functions required by the different types of models

### Model a) Least Squares 

In [28]:
def least_squares(y, tx):
    """calculate the least squares."""
    a = tx.T.dot(tx)
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)

### Model b) Gradient Descent

In [29]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    sigmoids = sigmoid(tx.dot(w)) # N*1
    return tx.T.dot((sigmoids - y))

In [30]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.
    """
    #loss = self.regression_loss(y,tx)
    grad = calculate_gradient(y, tx, w)
    w = w - gamma * grad
    return w

### Model c) Ridge Regression

In [31]:
def ridge_regression(y, tx, lamb):
    """rige regression L2."""
    aI = lamb * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)

### Model d) Log Reg

In [32]:
def sigmoid(x):
        return 1./ (1. + np.exp(-x))

In [33]:
def regression_loss(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx.dot(w))
    loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return np.squeeze(- loss)

## (3) Classifier classes

### Classifier a) Abstract base class 

In [34]:
# Classifier.py
class Classifier:
    """ 
    Abstract class to represent a classifier
    """
    def __init__(self):
        """ 
            Sets the parameters
        """
        raise NotImplementedError("Please Implement this method")
        
    def train(self, tx_train, y_train):
        """ 
            Learns a w.
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """        
        raise NotImplementedError("Please Implement this method")
    
    def predict(self, tx):
        """ 
            Returns a list of predictions. 
            For linear classifiers with classes {-1,1}, it just returns sign(self.score(x))
            Argument:
                - tx : N*D dimension
            Returns : 
                List[int] of size N
        """
        raise NotImplementedError("Please Implement this method")

    def accuracy(self, predictions, y):
        """
            Computes the accuracy for a list of predictions.
            It counts the number of good predictions and divides it by the number of samples.
            Arguments :
                - predictions : List of size N
                - y : List of size N
            Returns
                float : the accuracy

        """
        print(predictions)
        print(y)
        return np.sum(predictions==y) / len(y)

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : 
                {
                    'lambda_' : 0.01,
                    'n_iterations' : 10000,
                    'gamma' = 0.2
                }
            
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        raise NotImplementedError("Please Implement this method")


### Classifier b) Ridge Regression Classifier

In [35]:
# Don't uncomment these lines, we don't have a tx_test.
# HOW TO USE : 
# lambda_ = 0.01
# clf = ClassifierRidgeRegression(lambda_)
# clf.train(y_train, tx_train)
# clf.get_params_and_results(tx_train, tx_test, y_train, y_test)

In [36]:
class ClassifierRidgeRegression(Classifier):

    def __init__(self, lambda_):
        """ 
            Sets the parameters
            Argument:
                - lambda_ : float parameter for the ridge regression
        """
        self.lambda_ = lambda_

    def train(self, y_train, tx_train):
        """ 
            Trains the model. It learns a w with ridge regression.
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = ridge_regression(y_train, tx_train, self.lambda_)         

    def predict(self, tx):
        """ 
            Returns a list of predictions 
            Argument:
                - tx : N*D dimension
            Returns : 
                List[int] of size N
        """
        return np.sign(tx.dot(self.w))
        
    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : 
                {
                    'lambda_' : 0.01,
                }
            
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['lambda'] = self.lambda_
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierRidgeRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res

In [37]:
class ClassifierRandomRidgeRegression(Classifier):

    def __init__(self, n_classifier, lambda_, features_per_classifier, degree, use_centroids=True, initial_number_of_features = 30):
        self.lambda_= lambda_
        self.n_classifier = n_classifier
        self.initial_number_of_features = initial_number_of_features
        self.features_per_classifier = features_per_classifier
        self.degree = degree
        self.clf = []
        self.features = [] # Each classifier will have random features. We choose them in the train function. Then we need them for our predictions.
        self.use_centroids = use_centroids

        for i in range(n_classifier):
            self.clf.append(ClassifierRidgeRegression(lambda_))

    def train(self, y_train, tx_train):
        """ Trains the model. Learns a w with Least Squares. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        # CAN ONLY BE USED WITH THE CURRENT VERSION OF BUILD_POLY
        
        #np.random.seed(seed)
        
        for cl in self.clf:
            perm = np.random.permutation(self.initial_number_of_features) # shuffle [0..32]
            perm = perm[:self.features_per_classifier] # Takes sqrt first elements
            features = [self.degree*k+1+j for k in perm for j in range(self.degree)]
            if self.use_centroids:
                features.append(tx_train.shape[1]-1)
                features.append(tx_train.shape[1]-2)
            self.features.append(features)
            tx = tx_train[:,features]
            cl.train(y_train, tx)        
        
    def predict(self, x):
        """ 
            Returns a list of predictions.
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        preds = np.empty(x.shape[0])

        for index, cl in enumerate(self.clf) :
            features = self.features[index]
            tx = x[:,features]
            preds = np.vstack((preds,cl.predict(tx)))
        preds = preds.mean(axis = 0)
        preds = np.sign(preds)
        return preds

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {
                    'lambda_' : 0.01,
                    'n_classifier' : 100,
                    'features_per_classifier' : 6,
                    'degree' : 7,
                    'use_centroids' : True
                }
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['lambda_'] = self.lambda_
        params['n_classifier'] = self.n_classifier
        params['features_per_classifier'] = self.features_per_classifier
        params['degree'] = self.degree
        params['use_centroids'] = self.use_centroids
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierRandomRidgeRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        print(res)
        return res

### Classifier c) Least Squares Classifier

In [38]:
class ClassifierLeastSquares(Classifier):

    def __init__(self):
        """ 
            Does not have any parameters to set.
        """
        pass

    def train(self, y_train, tx_train):
        """ Trains the model. Learns a w with Least Squares. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = least_squares(y_train, tx_train)         
        
    def predict(self, x):
        """ 
            Returns a list of predictions.
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        return np.sign(x.dot(self.w))

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {}
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierLeastSquares' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res

### Classifier d) Log Reg Classifier 

In [39]:
class ClassifierLogisticRegression(Classifier):
    """ 
    Abstract class to represent a classifier
    """
    def __init__(self, gamma=0.01, n_iterations = 100):
        """ 
            Sets parameters for logistic regression
            Argument:
                - gamma (float)
                - n_iterations (int)
        """
        self.gamma = gamma
        self.n_iterations = n_iterations

    def train(self, y_train, tx_train):
        """ 
            Trains the model. It learns a new w with logistic regression. 
            Arguments:
                - tx_train: ndarray matrix of size N*D
                - y_train: ndarray matrix of size D*1
            Hypothesis: tx_train ndd y_train have the same length
        """
        self.w = np.empty(tx_train.shape[1])
        for _ in range(self.n_iterations):
            self.w = learning_by_gradient_descent(y_train, tx_train, self.w, self.gamma)
    
    
    def predict(self, x):
        """ 
            returns a list of predictions
            Argument:
                - x: a sample vector 1*D 
            Returns : 
                Array[int] 
        """
        pred = sigmoid(x.dot(self.w)) 
        pred = np.asarray([0 if k < 0.5 else 1 for k in pred])
        return pred

    def get_params_and_results(self, tx_train, tx_test, y_train, y_test):
        """
            Returns a dictionnary with the parameters and the accuracy
            example of output : 
            {
                'name' : 'Classifier',
                'accuracy_train' : 0.8, 
                'accuracy_test' : 0.78,
                'params' : {}
            }
            Arguments : 
                tx_train : N * D train set
                tx_test : N' * D' test set
                y_train : N * 1 train labels 
                y_test : N' * 1 test labels
            Returns :
                dictionnary of parameters and accuracy
        """
        # Compute my predictions
        predictions_train = self.predict(tx_train)
        predictions_test = self.predict(tx_test)
        #Construct a dictionnary of parameters
        params = dict()
        params['n_iterations'] = self.n_iterations
        params['gamma'] = self.gamma
        #construct the final dictionnary
        res = dict()
        res['name'] = 'ClassifierLogisticRegression' 
        res['accuracy_train'] = self.accuracy(predictions_train, y_train)
        res['accuracy_test'] = self.accuracy(predictions_test, y_test)
        res['params'] = params
        return res

## (4) Split the data 

### Strategy a) Splitting according to a ratio 

In [40]:
#Functions we might not use
def split_data_equally(x, y, ratio, seed = 1):
    """
      Preserves the distribution
    """
    ind = np.arange(len(y))
    classes = set(y)
    indices_for_each_class = []
    train = np.empty(0, dtype=int)
    test = np.empty(0, dtype=int)
    for cl in classes :
        indices_for_each_class.append(ind[y==cl])
    for indices in indices_for_each_class:
        np.random.seed(seed)
        np.random.shuffle(indices)
        cut_ind = int(ratio * len(indices))
        train = np.hstack((train, indices[:cut_ind]))
        test = np.hstack((test, indices[cut_ind:]))
    print(train.shape)
    print(test.shape)
    return x[train], x[test], y[train], y[test]

# probably won't use either
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

### Strategy b) Splitting using k-indices 

In [41]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    # TODO : the same function but preserving the distribution
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

## (5) Cross-Validation 

In [42]:
def cross_validation(y, x, k_indices, k, classifier):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    degree = 4
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly(x_tr, degree, [], centroids) 
    tx_te = build_poly(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)

In [43]:
import copy
def cross_validation_demo(y, x, clf, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        res.append(cross_validation(y, x, k_indices, k, copy.deepcopy(clf)))
    return res

### Below cross-validation demos by Ali 🐸 

In [44]:
y_random, x_random = preprocessing(y_train, tx_train, 1) 


n_classifier = 100
lambda_ = 0.1
initial_number_of_features = 30
#features_per_classifier = int(np.sqrt(initial_number_of_features)) + 1
features_per_classifier = 15
degree = 4 # needs to be the same as we use in cross validation
use_centroids = True
clf = ClassifierRandomRidgeRegression(n_classifier, lambda_, features_per_classifier, degree, use_centroids, initial_number_of_features)
res = cross_validation_demo(y_random, x_random, clf)
res

train start
train over
[-1. -1. -1. ... -1.  1. -1.]
[-1. -1. -1. ... -1. -1. -1.]
[-1. -1. -1. ...  1. -1. -1.]
[ 1. -1. -1. ...  1. -1. -1.]
{'name': 'ClassifierRandomRidgeRegression', 'accuracy_train': 0.7744906666666667, 'accuracy_test': 0.774752, 'params': {'lambda_': 0.1, 'n_classifier': 100, 'features_per_classifier': 15, 'degree': 4, 'use_centroids': True}}
train start


KeyboardInterrupt: 

In [None]:
# Ridge Regression
lambda_ = 0.01
clf = ClassifierRidgeRegression(lambda_) # shouldn't take the input dimension as an argument
res = cross_validation_demo(y, x, clf)
res
# It takes almost 3 minutes to run because we perform build_poly 4 times on the whole dataset.
# We could fix it by calling build_poly before splitting the data however there would be a problem with the centroids
# as they would be the centroids of the whole dataset instead of the sub_dataset.

# Another way to fix this is to split build_poly into 2 functions. poly(tx, degree) which would extend the features (x1^2, x1^3, x2^2, x^3) and centroids(tx) that defines 
# the centroids for each label(-1 or 1) and calculates the distance between each centroid and each x_n (exp(-||x_n-centroid_i||²)).
# We would call poly(tx, degree) inside cross_validation_demo and centroids

In [None]:
lambda_ = 0.01
clf = ClassifierLeastSquares() # shouldn't take the input dimension as an argument
res = cross_validation_demo(y, x, clf)
res

In [None]:
gamma = 0.1
n_iterations = 10 
clf = ClassifierLogisticRegression(gamma, n_iterations)
formated_y = np.where(y<1, 0, y) # Logistic Regression doesn't work with labels {-1, 1} but only {0, 1}
res = cross_validation_demo(formated_y, x, clf)
res # We get better results when we don't use feature expansion.

As you can see, we get bad results. I think it is because of the build_poly function.
Have a look at the results with a build_poly of degree 1.

In [None]:
def cross_validation_logistic(y, x, k_indices, k, classifier, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly(x_tr, degree, [], centroids) 
    tx_te = build_poly(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)
def cross_validation_demo_logistic(y, x, clf, degree, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        res.append(cross_validation_logistic(y, x, k_indices, k, copy.deepcopy(clf), degree))
    return res

In [None]:
y2, x2 = preprocessing(y_train, tx_train, 2, std=False) # Fill the NaN with 0, maybe we should try without standardization
y2 = np.where(y2<1, 0, y2) # Logistic Regression doesn't work with labels {-1, 1} but only {0, 1}

gamma = 0.1
n_iterations = 100
degree = 1
clf = ClassifierLogisticRegression(gamma, n_iterations)
res = cross_validation_demo_logistic(y2, x2, clf, degree)
res # We get better results when we don't use feature expansion.

In [None]:
def cross_validation_demo_interaction(y, x, clf, k_fold = 4, seed=1):
    #TODO : add a list of classifiers as an argument
    #TODO : add a list of functions for build_poly

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    # define lists to store the loss of training data and test data
    res = []
    # cross validation
    for k in range(k_fold):
        tmp = cross_validation_interaction(y, x, k_indices, k, copy.deepcopy(clf))
        print(tmp)
        res.append(tmp)
    return res

In [None]:
lambda_ = 0.01
clf = ClassifierRidgeRegression(lambda_) # shouldn't take the input dimension as an argument
res = cross_validation_demo_interaction(y, x, clf)
res

In [None]:
def cross_validation_interaction(y, x, k_indices, k, classifier):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
    # form data with polynomial degree
    degree = 7
    centroids = build_centroids(y_tr, x_tr)
    
    tx_tr = build_poly_interaction(x_tr, degree, [], centroids) 
    tx_te = build_poly_interaction(x_te, degree, [], centroids) # Important to note : we use the same centroids for the training and the testing
    # train
    print('train start')
    classifier.train(y_tr, tx_tr) 
    print("train over")
    # Return our JSON
    return classifier.get_params_and_results(tx_tr, tx_te, y_tr, y_te)