In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the wildcard import presumably supplies load_csv_data,
# create_csv_submission and predict_labels, which this notebook calls but
# does not define itself - confirm and consider importing them by name.
from proj1_helpers import *
DATA_TRAIN_PATH = '../../data/train.csv' # Training CSV, relative to this notebook; download the data there first
# y: class labels, tX: feature matrix, ids: event ids
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [24]:
##Implementations of the required functions

#Linear regression using gradient descent
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression (mean squared error) fitted by batch gradient descent.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        initial_w: starting weights. The array is NOT modified.
        max_iters: number of gradient steps to take.
        gamma: step size.

    Returns:
        (w, loss): the final weights and the loss e^T e / (2n) at those weights.
    """
    #Work on a float copy: the in-place update below must neither write
    #through to the caller's initial_w nor fail on an integer array
    w = np.array(initial_w, dtype=float)
    n = len(y) #Number of observations
    nIters = 0 #Keep count of iterations
    while nIters < max_iters:
        e = y - np.dot(tx, w) #Residual vector
        gradient = -np.dot(np.transpose(tx), e) / n #Gradient of the loss
        w -= gamma * gradient #A step towards the negative gradient
        nIters += 1
    e = y - np.dot(tx, w) #Residuals at the final weights
    return w, np.dot(np.transpose(e), e) / (2 * n) #Weights and loss (2n as a scaler)

#Linear regression using stochastic gradient descent
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Linear regression (mean squared error) fitted by stochastic gradient descent.

    Each iteration draws one observation uniformly at random (from the global
    numpy RNG) and steps along the gradient of that observation's squared error.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        initial_w: starting weights. The array is NOT modified.
        max_iters: number of stochastic steps to take.
        gamma: step size.

    Returns:
        (w, loss): the final weights and the full-data loss e^T e / (2n).
    """
    w = np.array(initial_w, dtype=float) #Float copy, so the caller's array is left alone
    n = len(y) #Number of datapoints
    nIters = 0 #Keep track of iterations
    while nIters < max_iters:
        index = np.random.randint(0, n) #Pick one observation uniformly
        row = tx[index] #Features of that SAME observation (must match y[index])
        e = y[index] - np.dot(row, w) #Residual of the sampled observation
        gradient = -e * row #Stochastic estimate of the gradient
        w -= gamma * gradient #Update w
        nIters += 1
    e = y - np.dot(tx, w) #Residuals for the final loss
    return w, np.dot(np.transpose(e), e) / (2 * n) #Weights and loss (2n as a scaler)

#Least squares regression using normal equations
def least_squares(y, tx):
    """Least squares regression via the normal equations.

    Solves (tx^T tx) w = tx^T y with a linear solve instead of forming the
    inverse of the Gram matrix explicitly, which is cheaper and numerically
    more accurate.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.

    Returns:
        (w, loss): the weights and the loss e^T e / (2n).

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix is singular.
    """
    xt = np.transpose(tx)
    gram = np.dot(xt, tx) #Gram matrix
    w = np.linalg.solve(gram, np.dot(xt, y)) #Weights from the normal equations
    e = y - np.dot(tx, w) #Residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y)) #Loss (2n as a scaler)
    return w, loss

#Ridge regression using normal equations
def ridge_regression(y, tx, lambda_):
    """Ridge regression via the normal equations.

    Solves (tx^T tx + lambda_ * I) w = tx^T y with a linear solve instead of
    an explicit matrix inverse.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        lambda_: penalty added to the diagonal of the Gram matrix.

    Returns:
        (w, loss): the weights and the UNPENALISED loss e^T e / (2n).

    Raises:
        numpy.linalg.LinAlgError: if the penalised Gram matrix is singular.
    """
    xt = np.transpose(tx)
    penalised_gram = np.dot(xt, tx) + lambda_ * np.identity(np.shape(tx)[1]) #Modified Gram matrix
    w = np.linalg.solve(penalised_gram, np.dot(xt, y)) #Weights
    e = y - np.dot(tx, w) #Residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y)) #Loss (2n as a scaler)
    return w, loss

#A helper for the logistic regression
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The direct form exp(x) / (1 + exp(x)) becomes inf / inf = nan for large
    positive x. Here only exp(-|x|) is ever computed, which lies in (0, 1].

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of x, with values in [0, 1].
    """
    z = np.exp(-np.abs(x))
    #x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x)
    return np.where(np.greater_equal(x, 0), 1.0, z) / (1.0 + z)

#Logistic regression using gradient descent
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Logistic regression fitted by batch gradient descent.

    Args:
        y: label vector; the loss formula below is the negative
            log-likelihood for labels coded 0/1.
        tx: feature matrix with one row per observation.
        initial_w: starting weights. The array is NOT modified.
        max_iters: number of gradient steps to take.
        gamma: step size.

    Returns:
        (w, loss): the final weights and the summed negative log-likelihood
        sum(log(1 + exp(tx w)) - y * (tx w)).
    """
    w = np.array(initial_w, dtype=float) #Float copy, so the caller's array is left alone
    nIters = 0
    while nIters < max_iters:
        gradient = np.dot(np.transpose(tx), sigmoid(np.dot(tx, w)) - y)
        w -= gamma * gradient #Calculate new w
        nIters += 1 #Update number of iterations
    z = np.dot(tx, w)
    #logaddexp(0, z) = log(1 + exp(z)) but does not overflow for large z
    loss = np.sum(np.logaddexp(0, z) - y * z)
    return w, loss #Return weights and loss

#Regularized logistic regression using Newton's method (Hessian-scaled steps damped by gamma)
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """L2-regularised logistic regression fitted with damped Newton steps.

    Minimises
        sum(log(1 + exp(tx w)) - y * (tx w)) + lambda_ / 2 * ||w||^2
    by repeating w <- w - gamma * H^{-1} g, where g and H are the gradient
    and Hessian of that penalised objective. The penalty is included in
    both g and H, so lambda_ actually influences the fitted weights.

    Args:
        y: label vector; the loss formula is the negative log-likelihood
            for labels coded 0/1.
        tx: 2-D feature matrix with one row per observation.
        lambda_: L2 penalty strength.
        initial_w: starting weights (1-D). The array is NOT modified.
        max_iters: number of Newton steps to take.
        gamma: damping factor applied to every Newton step.

    Returns:
        (w, loss): the final weights and the penalised objective at w.

    Raises:
        numpy.linalg.LinAlgError: if the Hessian is singular.
    """
    w = np.array(initial_w, dtype=float)
    ridge = lambda_ * np.identity(np.shape(tx)[1]) #Hessian of the penalty term
    nIters = 0
    while nIters < max_iters:
        sig = sigmoid(np.dot(tx, w))

        #Hessian: X^T S X + lambda I with S = diag(sig * (1 - sig)).
        #Scaling the rows of X by the diagonal avoids building S explicitly
        weighted = tx * (sig * (1 - sig))[:, np.newaxis]
        hessian = np.dot(np.transpose(tx), weighted) + ridge

        #Gradient of the penalised objective
        gradient = np.dot(np.transpose(tx), sig - y) + lambda_ * w

        #Damped Newton update; solve H d = g rather than inverting H
        w = w - gamma * np.linalg.solve(hessian, gradient)

        nIters += 1 #Update number of iterations

    #Penalised loss; logaddexp(0, z) = log(1 + exp(z)) without overflow
    z = np.dot(tx, w)
    loss = np.sum(np.logaddexp(0, z) - y * z) + lambda_ / 2 * np.sum(np.square(w))

    return w, loss #Return weights and loss

## Tests:
#w1, loss1 = least_squares(y, tX)
#w2, loss2 = ridge_regression(y, tX, 0.02)
#w3, loss3 = least_squares_GD(y, tX, 10 * w2, 20, 0.01)
#w4, loss4 = least_squares_SGD(y, tX, np.ones(30), 10, 0.01)
#y2 = np.copy(y)
#y2[y2 == -1] = 0
#w4, loss4 = logistic_regression(y2, tX, np.ones(30), 10, 0.01)


In [48]:
## Functions for cleaning and manipulating the data
def replace_with_mean(data, replacedValue):
    """Impute a placeholder value with the mean of each column's other entries.

    Args:
        data: 2-D array-like (observations x features). Not modified.
        replacedValue: the value that marks a missing entry.

    Returns:
        (result, means): a float copy of data with every entry equal to
        replacedValue replaced by its column mean, and the list of the
        per-column means (computed over the non-placeholder entries only).
        A column made up entirely of placeholders gets mean nan.
    """
    #Float copy: keeps the input intact and stops an integer array from
    #truncating the imputed means
    result = np.array(data, dtype=float)
    means = []
    for j in range(result.shape[1]):
        column = result[:, j] #View, so the assignment below writes into result
        missing = column == replacedValue
        m = np.nan if missing.all() else np.mean(column[~missing])
        column[missing] = m
        means.append(m)
    return result, means

def replace_with_values(data, values, replacedValue):
    """Replace a placeholder value with a given per-column fill value.

    Vectorised: one broadcasted selection instead of a Python loop over
    every element.

    Args:
        data: 2-D array-like (observations x features). Not modified.
        values: sequence holding one fill value per column of data.
        replacedValue: the value that marks an entry to be replaced.

    Returns:
        A new array in which every entry equal to replacedValue has been
        replaced by the fill value of its column.
    """
    source = np.asarray(data)
    fill = np.asarray(values)[:source.shape[1]] #One fill value per column
    return np.where(source == replacedValue, fill, source)
            
def centralize_data(data):
    """Return a copy of data in which each column has its own mean subtracted."""
    snapshot = np.copy(data)
    column_means = np.apply_along_axis(np.mean, 0, snapshot) #Mean of every column
    return snapshot - column_means

def subtract_values(data, values):
    """Subtract a per-column value from every row of data.

    Vectorised with broadcasting instead of a Python loop over every element.

    Args:
        data: 2-D array-like (observations x features). Not modified.
        values: sequence holding one value per column of data.

    Returns:
        A new array equal to data with values[i] subtracted from column i.
    """
    source = np.asarray(data)
    offsets = np.asarray(values)[:source.shape[1]] #One offset per column
    return source - offsets


def scale_data(data):
    """Scale every column of data to unit standard deviation.

    Args:
        data: 2-D array-like (observations x features). Not modified.

    Returns:
        (scaled, deviations): the scaled copy of data and the per-column
        divisors that were used. A column whose standard deviation is
        exactly zero (a constant column) is divided by 1 instead, so it is
        passed through unchanged rather than turned into nan/inf; the
        returned divisors contain that 1, which makes them safe to reuse
        on other data.
    """
    source = np.asarray(data, dtype=float)
    deviations = np.std(source, axis=0) #Computed once and reused for the division
    deviations[deviations == 0] = 1.0 #Guard against dividing by zero
    return source / deviations, deviations

def divide_by_values(data, values):
    """Divide every row of data by a per-column value.

    Vectorised with broadcasting instead of a Python loop over every element.

    Args:
        data: 2-D array-like (observations x features). Not modified.
        values: sequence holding one divisor per column of data.

    Returns:
        A new array equal to data with column i divided by values[i].
    """
    source = np.asarray(data)
    divisors = np.asarray(values)[:source.shape[1]] #One divisor per column
    return source / divisors


In [42]:
## Functions for model validation
def split_data(x, y, ratio, seed = 1):
    """Randomly split (x, y) into a training part and a test part.

    The global numpy RNG is seeded with `seed`, then round(len(y) * ratio)
    distinct row indices are sampled for training; all remaining rows, in
    ascending index order, make up the test part.

    Returns:
        x_train, y_train, x_test, y_test
    """
    np.random.seed(seed) #Seed the global RNG for reproducibility
    total = len(y)
    n_train = int(np.round(total * ratio)) #Desired size of the training set
    train_idx = np.random.choice(total, n_train, replace=False) #Sample without replacement
    held_out = np.ones(total, dtype=bool) #True for every row that was not sampled
    held_out[train_idx] = False
    test_idx = np.flatnonzero(held_out)
    return x[train_idx, :], y[train_idx], x[test_idx, :], y[test_idx]


In [46]:
#Load test data
DATA_TEST_PATH = '../../data/test.csv' # Test CSV, relative to this notebook
# The first value returned by load_csv_data (y when loading the training set) is discarded here;
# only the feature matrix and the event ids are kept
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [58]:
### Fit models

## Model 1: Simple regularized logistic regression model

#Manipulate data: impute the missing-value code -999 with column means, then standardize
tX2, means = replace_with_mean(tX, -999)
tX3, deviations = scale_data(centralize_data(tX2))
y2 = np.copy(y)
y2[y2 == -1] = 0 #The logistic loss expects labels coded 0/1
#Fit model
weights, _ = reg_logistic_regression(y2, tX3, 0.01, np.zeros(tX3.shape[1]), 20, 0.1)
#Calculate accuracy for the train data.
#The fitted `weights` are used here; the name `w` is not defined by this cell
trainPredictions = (sigmoid(np.dot(tX3, weights)) >= 0.5).astype(float)
#Printed explicitly: a bare expression in the middle of a cell displays nothing
print('Train accuracy: {:.3f}'.format(np.mean(trainPredictions == y2))) ## 0.72 was noted in an earlier run
print('Share predicted as class 0: {:.3f}'.format(np.mean(trainPredictions == 0))) ## 0.51 was noted in an earlier run

## Do predictions for the test data
#Apply the TRAINING means and deviations to the test features
tX_test2 = divide_by_values(subtract_values(replace_with_values(tX_test, means, -999), means), deviations)
#The labels in y are coded -1/1 (see the recoding into y2 above), so the
#submission uses the same coding instead of 0/1.
#NOTE(review): confirm against what create_csv_submission expects
testPredictions = np.where(sigmoid(np.dot(tX_test2, weights)) >= 0.5, 1, -1)

#Save predictions
OUTPUT_PATH = '../../predictions/naive_regularized_logistic_regression.csv' 
create_csv_submission(ids_test, testPredictions, OUTPUT_PATH)


In [31]:
#Alternative submission produced with the predict_labels helper.
#The weights were fitted on imputed and standardized features, so the
#identically transformed test matrix tX_test2 is used, not the raw tX_test.
#NOTE(review): assumes predict_labels thresholds the linear predictor - confirm in proj1_helpers
OUTPUT_PATH = '../../predictions/predict_labels_submission.csv' #An empty path cannot be opened for writing; rename as desired
y_pred = predict_labels(weights, tX_test2)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)