In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Location of the training CSV, given relative to this notebook.
DATA_TRAIN_PATH = '../../data/train.csv' # TODO: download train data and supply path here 
# load_csv_data is presumably provided by the star import above; its three results are
# bound here as class labels (y), feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [20]:
## Implementations of the required functions

#Linear regression using gradient descent
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Fit linear-regression weights with full-batch gradient descent.

    Minimises the mean-squared-error loss ``||y - tx @ w||^2 / (2 * n)``,
    where ``n = len(y)``.

    Args:
        y: target values, one per row of ``tx``.
        tx: feature matrix with ``n`` rows.
        initial_w: starting weights. The vector is copied, so the caller's
            array is never modified.
        max_iters: number of gradient steps to take.
        gamma: step size.

    Returns:
        Tuple ``(w, loss)``: the final weights and the loss evaluated at them.
    """
    # Work on a private float copy: the in-place step below must not write
    # through to the caller's array, and an integer-typed start vector could
    # not store fractional updates.
    w = np.array(initial_w, dtype=float)
    n = len(y)
    for _ in range(max_iters):
        e = y - np.dot(tx, w)  # residual vector
        gradient = -np.dot(np.transpose(tx), e) / n
        w -= gamma * gradient  # step against the gradient
    e = y - np.dot(tx, w)  # residuals at the final weights
    return w, np.dot(np.transpose(e), e) / (2 * n)

#Linear regression using stochastic gradient descent
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Fit linear-regression weights with stochastic gradient descent.

    Each iteration draws one observation uniformly at random (from NumPy's
    global random state) and steps along the negative gradient of that single
    observation's squared error.

    Args:
        y: target values, one per row of ``tx``.
        tx: feature matrix with ``n = len(y)`` rows.
        initial_w: starting weights. The vector is copied, so the caller's
            array is never modified.
        max_iters: number of stochastic steps to take.
        gamma: step size.

    Returns:
        Tuple ``(w, loss)``: the final weights and the full-data loss
        ``||y - tx @ w||^2 / (2 * n)`` evaluated at them.
    """
    # Private float copy so the in-place update cannot alter the caller's array.
    w = np.array(initial_w, dtype=float)
    n = len(y)
    for _ in range(max_iters):
        index = np.random.randint(0, n)  # pick one observation uniformly
        # Features and target must come from the SAME observation; drawing a
        # second random index here would pair unrelated rows.
        row = tx[index]
        e = y[index] - np.dot(row, w)  # residual of the sampled observation
        gradient = -e * row  # single-sample gradient estimate
        w -= gamma * gradient
    e = y - np.dot(tx, w)  # residuals on the full data for the final loss
    return w, np.dot(np.transpose(e), e) / (2 * n)

#Least squares regression using normal equations
def least_squares(y, tx):
    """Solve ordinary least squares through the normal equations.

    Finds ``w`` satisfying ``(tx^T tx) w = tx^T y``.

    Args:
        y: target values, one per row of ``tx``.
        tx: feature matrix.

    Returns:
        Tuple ``(w, loss)`` where ``loss = ||y - tx @ w||^2 / (2 * len(y))``.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix ``tx^T tx`` is singular.
    """
    tx_t = np.transpose(tx)
    gram = np.dot(tx_t, tx)  # Gram matrix
    # Solve the linear system directly rather than forming an explicit inverse:
    # it is cheaper and loses less precision on ill-conditioned problems.
    w = np.linalg.solve(gram, np.dot(tx_t, y))
    e = y - np.dot(tx, w)  # residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y))
    return w, loss

#Ridge regression using normal equations
def ridge_regression(y, tx, lambda_):
    """Solve ridge regression through the regularised normal equations.

    Finds ``w`` satisfying ``(tx^T tx + lambda_ * I) w = tx^T y``. Note that
    ``lambda_`` is added to the Gram matrix as given, without any rescaling by
    the number of observations.

    Args:
        y: target values, one per row of ``tx``.
        tx: feature matrix.
        lambda_: regularisation strength added to the diagonal of the Gram matrix.

    Returns:
        Tuple ``(w, loss)`` where ``loss = ||y - tx @ w||^2 / (2 * len(y))``;
        the penalty term is not included in the reported loss.

    Raises:
        numpy.linalg.LinAlgError: if the regularised Gram matrix is singular.
    """
    tx_t = np.transpose(tx)
    n_features = np.shape(tx)[1]
    regularised_gram = np.dot(tx_t, tx) + lambda_ * np.identity(n_features)
    # Solve the linear system directly rather than forming an explicit inverse:
    # it is cheaper and loses less precision on ill-conditioned problems.
    w = np.linalg.solve(regularised_gram, np.dot(tx_t, y))
    e = y - np.dot(tx, w)  # residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y))
    return w, loss

#Logistic regression using gradient descent
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit logistic-regression weights with full-batch gradient descent.

    The gradient used is ``tx^T (sigmoid(tx @ w) - y)`` and the reported loss
    is ``sum(log(1 + exp(tx @ w)) - y * (tx @ w))``; neither is averaged over
    the observations.

    NOTE(review): this loss/gradient pair is the negative log-likelihood for
    labels coded as 0/1 -- confirm the coding of ``y`` before use.

    Args:
        y: labels, one per row of ``tx``.
        tx: feature matrix.
        initial_w: starting weights. The vector is copied, so the caller's
            array is never modified.
        max_iters: number of gradient steps to take.
        gamma: step size.

    Returns:
        Tuple ``(w, loss)``: the final weights and the loss evaluated at them.
    """
    # Private float copy so the in-place update cannot alter the caller's array.
    w = np.array(initial_w, dtype=float)
    for _ in range(max_iters):
        z = np.dot(tx, w)
        # Logistic sigmoid computed inline (no `sigmoid` helper is defined in
        # this notebook); the tanh form is exact and cannot overflow.
        prob = 0.5 * (1.0 + np.tanh(0.5 * z))
        gradient = np.dot(np.transpose(tx), prob - y)
        w -= gamma * gradient
    z = np.dot(tx, w)
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
    loss = np.sum(np.logaddexp(0, z) - y * z)
    return w, loss

#Regularized logistic regression using gradient descent or SGD
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Placeholder for regularized logistic regression.

    Not implemented: every argument is ignored and the constant 0 is returned,
    unlike the other fitting functions in this cell, which return a
    ``(weights, loss)`` pair.
    """
    # TODO: implement the regularized fit; callers unpacking two values will fail on this 0.
    return 0

## Tests:
# Smoke-run the closed-form and iterative solvers on the loaded training data.
w1, loss1 = least_squares(y, tX)
w2, loss2 = ridge_regression(y, tX, 0.02)
# Gradient descent is started from ten times the ridge solution.
w3, loss3 = least_squares_GD(y, tX, 10 * w2, 20, 0.01)
# Size the all-ones start vector from the data instead of hard-coding the column count.
w4, loss4 = least_squares_SGD(y, tX, np.ones(tX.shape[1]), 10, 0.01)
print(w1)


[ 4.10811719e-05 -7.58093451e-03 -5.60715130e-03 -5.22544075e-04
 -4.47382865e-02  5.16542825e-04 -1.53565134e-02  2.97133279e-01
 -1.39315245e-03 -2.79211972e+00 -2.48659690e-01  9.82514531e-02
  3.45996114e-01  2.79904928e+00 -4.47852617e-04 -7.39548246e-04
  2.80533578e+00 -1.09832905e-03  1.23511990e-03  3.11742132e-03
  5.50036683e-04 -5.12787102e-04 -2.84800464e-01 -5.30172680e-03
  4.68129559e-04  3.69288507e-04 -9.76263056e-03  1.50504847e-03
 -2.16592538e-03  2.79711320e+00]


In [21]:
## Functions for cleaning the data
def replace_with_mean(data, replacedValue):
    """Replace a sentinel value by the mean of the remaining entries, per column.

    For every column of the 2-D array ``data``, each entry equal to
    ``replacedValue`` is overwritten with the mean of the entries of that
    column that differ from ``replacedValue``.

    The array is modified in place and also returned. A column consisting
    only of ``replacedValue`` has no mean to substitute and is left unchanged.

    Args:
        data: 2-D NumPy array to clean (modified in place).
        replacedValue: sentinel marking the entries to replace.

    Returns:
        The same array object ``data`` after replacement.
    """
    is_sentinel = data == replacedValue
    for j in range(data.shape[1]):
        column_mask = is_sentinel[:, j]
        n_sentinel = np.count_nonzero(column_mask)
        # Nothing to replace, or no valid entry to average: leave the column as is.
        if n_sentinel == 0 or n_sentinel == column_mask.size:
            continue
        data[column_mask, j] = data[~column_mask, j].mean()
    return data
            
def centeralize_data(data):
    """Center the data by subtracting the mean taken along the first axis.

    For a 2-D array this removes each column's mean from that column. The
    input is not modified; a new array is returned.

    Args:
        data: array-like of numbers.

    Returns:
        New array of the same shape as ``data`` with zero mean along axis 0.
    """
    data = np.asanyarray(data)
    # One vectorised broadcast instead of a Python-level call per column.
    return data - np.mean(data, axis=0)

def scale_data(data):
    """Scale the data by the standard deviation taken along the first axis.

    For a 2-D array this divides each column by that column's (population)
    standard deviation. A column with zero standard deviation is returned
    unscaled instead of being divided by zero. The input is not modified.

    Args:
        data: array-like of numbers.

    Returns:
        New array of the same shape as ``data``.
    """
    data = np.asanyarray(data)
    std = np.std(data, axis=0)
    # Constant columns have std == 0; dividing by 1 keeps them finite.
    safe_std = np.where(std == 0, 1.0, std)
    return data / safe_std



In [18]:
## Functions for separating data sets into train and test data
def split_data(x, y, ratio, seed=1):
    """Randomly partition observations into a training part and a test part.

    NumPy's global random state is seeded with ``seed`` so the same split is
    produced on every call. ``round(len(y) * ratio)`` observations are drawn
    without replacement for training; all remaining observations, in their
    original order, form the test part.

    Args:
        x: feature array indexed by observation along its first axis.
        y: label array with one entry per observation.
        ratio: fraction of the observations assigned to the training part.
        seed: seed passed to ``np.random.seed``.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    np.random.seed(seed)
    total = len(y)
    n_train = int(np.round(total * ratio))
    train_idx = np.random.choice(total, n_train, replace=False)
    # Mark every observation as held out, then clear the sampled ones; the
    # surviving positions are the test indices in ascending order.
    held_out = np.ones(total, dtype=bool)
    held_out[train_idx] = False
    test_idx = np.flatnonzero(held_out)
    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]


(250000,)

In [10]:
# Location of the test CSV; left empty here, so it presumably has to be filled in before running.
DATA_TEST_PATH = '' # TODO: download test data and supply path here 
# Same loader as for the training data; its first result is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
# Destination of the submission file; left empty here.
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned anywhere in the visible notebook (the fits above
# are stored as w1..w4) -- bind it to the chosen model's weights before running this cell.
y_pred = predict_labels(weights, tX_test)
# Write the predictions together with the test ids to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)