In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the wildcard import hides where names come from; the visible cells
# rely on it for load_csv_data, predict_labels and create_csv_submission.
from proj1_helpers import *
DATA_TRAIN_PATH = '../../data/train.csv' # Location of the training CSV, relative to this notebook
# Per the section header these are presumably the class labels (y), the feature
# matrix (tX) and the event ids (ids) -- confirm against proj1_helpers.load_csv_data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [3]:
## Implementations of the required functions

#Linear regression using gradient descent
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by full-batch gradient descent on the mean squared error.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        initial_w: starting weights; this array is never modified.
        max_iters: number of gradient steps to take (0 returns the starting weights).
        gamma: step size.

    Returns:
        (w, loss): the final weights and the loss e^T e / (2n) evaluated at them,
        where e = y - tx w and n = len(y).
    """
    # Work on a private float copy: the in-place update below must not leak into the
    # caller's array, and it would fail outright on an integer-typed array.
    w = np.array(initial_w, dtype=float)
    n = len(y) #Number of observations
    for _ in range(max_iters):
        e = y - np.dot(tx, w) #Residual vector
        gradient = -np.dot(np.transpose(tx), e) / n #Gradient of the loss
        w -= gamma * gradient #A step towards negative gradient
    e = y - np.dot(tx, w) #Compute final residuals
    return w, np.dot(np.transpose(e), e) / (2 * n) #Return weights and loss (2n as a scaler)

#Linear regression using stochastic gradient descent
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Linear regression by stochastic gradient descent with one sample per step.

    Args:
        y: 1-D target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        initial_w: starting weights; this array is never modified.
        max_iters: number of stochastic steps to take.
        gamma: step size.

    Returns:
        (w, loss): the final weights and the full-data loss e^T e / (2n) evaluated
        at them, where e = y - tx w and n = len(y).

    Sampling uses NumPy's global random state, so call np.random.seed beforehand
    for reproducible results.
    """
    # Private float copy so the in-place update never touches the caller's array.
    w = np.array(initial_w, dtype=float)
    n = len(y)
    for _ in range(max_iters):
        index = np.random.randint(0, n) #Pick a row uniformly
        # The SAME index must select both the features and the target; drawing a
        # second random index would pair a row with an unrelated label.
        row = tx[index]
        e = y[index] - np.dot(row, w) #Error on the sampled observation
        gradient = -e * row #Single-sample estimate of the gradient
        w -= gamma * gradient #Calculate new w
    e = y - np.dot(tx, w) #Calculate residuals for final loss
    return w, np.dot(np.transpose(e), e) / (2 * n) #Return weights and loss (2n as a scaler)

#Least squares regression using normal equations
def least_squares(y, tx):
    """Least squares regression via the normal equations.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.

    Returns:
        (w, loss): the weights solving (tx^T tx) w = tx^T y and the loss
        e^T e / (2n), where e = y - tx w and n = len(y).

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix tx^T tx is singular.
    """
    txT = np.transpose(tx)
    xtx = np.dot(txT, tx) #Calculate Gram matrix
    # Solve the linear system directly instead of forming an explicit inverse:
    # it is cheaper and loses less precision on ill-conditioned Gram matrices.
    w = np.linalg.solve(xtx, np.dot(txT, y)) #Calculate weights
    e = y - np.dot(tx, w) #Calculate residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y)) #Calculate loss (2n as scaler)
    return w, loss

#Ridge regression using normal equations
def ridge_regression(y, tx, lambda_):
    """Ridge regression via the regularized normal equations.

    Args:
        y: target vector with one entry per observation.
        tx: feature matrix with one row per observation.
        lambda_: penalty strength added to the diagonal of the Gram matrix.

    Returns:
        (w, loss): the weights solving (tx^T tx + lambda_ I) w = tx^T y and the
        loss e^T e / (2n), where e = y - tx w and n = len(y). The returned loss is
        the plain squared-error term; the penalty on w is not included in it.

    Raises:
        numpy.linalg.LinAlgError: if the regularized Gram matrix is singular.
    """
    txT = np.transpose(tx)
    xtx = np.dot(txT, tx) + lambda_ * np.identity(np.shape(tx)[1]) #Calculate modified Gram matrix
    # Solve the linear system directly instead of forming an explicit inverse:
    # it is cheaper and loses less precision on ill-conditioned matrices.
    w = np.linalg.solve(xtx, np.dot(txT, y)) #Calculate weights
    e = y - np.dot(tx, w) #Calculate residuals
    loss = np.dot(np.transpose(e), e) / (2 * len(y)) #Calculate loss (2n as scaler)
    return w, loss

#Logistic regression using gradient descent or SGD
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Placeholder for logistic regression; not implemented yet.

    All arguments are currently ignored and the function returns the constant 0,
    not a (weights, loss) pair like the least-squares functions in this cell.
    TODO: implement the optimizer before using this function.
    """
    return 0
#Regularized logistic regression using gradient descent or SGD
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Placeholder for regularized logistic regression; not implemented yet.

    All arguments are currently ignored and the function returns the constant 0,
    not a (weights, loss) pair like the least-squares functions in this cell.
    TODO: implement the optimizer before using this function.
    """
    return 0

## Tests:
# Smoke-run each implemented method on the training data loaded above.
w1, loss1 = least_squares(y, tX)
w2, loss2 = ridge_regression(y, tX, 0.02)
# Gradient descent is started from ten times the ridge solution.
w3, loss3 = least_squares_GD(y, tX, 10 * w2, 20, 0.01)
# SGD is started from a vector of 30 ones; no random seed is set here, so w4 and
# loss4 change from run to run.
w4, loss4 = least_squares_SGD(y, tX, np.ones(30), 10, 0.01)
print(w1)


[ 8.03320457e-05 -7.20236456e-03 -6.05394729e-03 -5.47591803e-04
 -1.93839880e-02  4.73429232e-04 -2.60392491e-02  3.25105641e-01
 -3.80831024e-05 -2.72785891e+00 -2.21220397e-01  9.50793593e-02
  6.40330465e-02  2.73611857e+00 -3.31802268e-04 -9.54326312e-04
  2.74087533e+00 -5.34165279e-04  9.73498884e-04  3.69225050e-03
  3.54487165e-04 -5.43344617e-04 -3.30448034e-01 -1.40800496e-03
  8.31432874e-04  1.02117271e-03 -1.68047418e-03 -5.83664770e-03
 -1.11088005e-02  2.72831885e+00]


In [14]:
## Functions for cleaning the data
def replaceWithMean(data, replacedValue):
    """Replace every occurrence of a sentinel value with the mean of its column.

    For each column of `data`, entries equal to `replacedValue` are overwritten
    with the mean of the remaining entries of that column.

    Args:
        data: 2-D NumPy array; it is modified IN PLACE.
        replacedValue: the value to be replaced.

    Returns:
        The same array object that was passed in, after modification.

    A column in which every entry equals `replacedValue` (or that has no rows) has
    no mean to fall back on and is left unchanged rather than dividing by zero.
    """
    nCols = np.shape(data)[1]
    for j in range(nCols):
        column = data[:, j] #View into data, so writes below land in data itself
        toBeReplaced = column == replacedValue
        if toBeReplaced.all():
            continue #No valid entries to average (also covers zero rows)
        if toBeReplaced.any():
            column[toBeReplaced] = column[~toBeReplaced].mean()
    return data
            
def centeralizeData(data):
    """Center the data by subtracting the mean along axis 0.

    Args:
        data: NumPy array; for a 2-D array each column is centered on its own mean.

    Returns:
        A new floating-point array of the same shape; the input is not modified.
    """
    # Broadcasting subtracts each column's mean in one vectorized step, and unlike
    # np.apply_along_axis it also accepts an array with zero columns.
    return data - np.mean(data, axis=0)

def scaleData(data):
    """Scale the data by dividing by the standard deviation along axis 0.

    Args:
        data: NumPy array; for a 2-D array each column is divided by its own
            (population) standard deviation.

    Returns:
        A new floating-point array of the same shape; the input is not modified.

    A column whose standard deviation is exactly zero (a constant column) is
    returned unscaled instead of being divided by zero, which would fill it with
    inf/NaN values.
    """
    std = np.std(data, axis=0)
    safeStd = np.where(std == 0, 1.0, std) #Divide constant columns by 1 instead of 0
    return data / safeStd

# Cleaning pipeline: replace entries equal to -999 (presumably the missing-value
# sentinel -- confirm against the dataset description) with their column mean,
# then center and scale every column.
# NOTE: replaceWithMean writes into the array it is given, so tX itself is
# modified by this line; re-load the data to get the raw values back.
scaledData = scaleData(centeralizeData(replaceWithMean(tX, -999)))


[[ 3.14910656e-01  6.83319669e-02  4.07680272e-01 -4.69966242e-01
  -1.59163777e+00 -1.15330600e+00  1.80634634e+00  8.82477616e-01
   1.03309853e+00  3.39894409e-01]
 [ 7.40827026e-01  5.52504823e-01  5.40136414e-01 -1.53167493e-01
   8.51752464e-15 -4.77609639e-15 -1.20751602e-15  1.40488790e+00
  -7.56027155e-01 -2.87584486e-01]
 [-2.96341266e-14  3.19515553e+00  1.09655998e+00 -3.49709651e-01
   8.51752464e-15 -4.77609639e-15 -1.20751602e-15  9.89769704e-01
  -4.30168330e-01  3.40361109e-01]
 [ 4.17944237e-01  9.10379098e-01 -5.85328814e-03 -9.03015649e-01
   8.51752464e-15 -4.77609639e-15 -1.20751602e-15  1.19669016e+00
  -8.30734938e-01 -7.12705477e-01]
 [ 1.02380444e+00 -9.14556190e-01  1.31336873e+00 -6.51804263e-01
   8.51752464e-15 -4.77609639e-15 -1.20751602e-15  1.93879376e+00
  -1.12794943e-01 -8.68142689e-01]
 [-6.08808624e-01 -1.00976110e+00 -5.39645661e-01  9.18192277e-01
   2.47488142e-01 -4.07036811e-01  1.45892171e-01 -1.29146444e+00
   1.91715559e+00  1.04094778e+00

[[ 1.38470000e+02  5.16550000e+01  9.78270000e+01  2.79800000e+01
   9.10000000e-01  1.24711000e+02  2.66600000e+00  3.06400000e+00
   4.19280000e+01  1.97760000e+02]
 [ 1.60937000e+02  6.87680000e+01  1.03235000e+02  4.81460000e+01
   2.40373503e+00  3.71783360e+02 -8.21688171e-01  3.47300000e+00
   2.07800000e+00  1.25157000e+02]
 [ 1.21858528e+02  1.62172000e+02  1.25953000e+02  3.56350000e+01
   2.40373503e+00  3.71783360e+02 -8.21688171e-01  3.14800000e+00
   9.33600000e+00  1.97814000e+02]
 [ 1.43905000e+02  8.14170000e+01  8.09430000e+01  4.14000000e-01
   2.40373503e+00  3.71783360e+02 -8.21688171e-01  3.31000000e+00
   4.14000000e-01  7.59680000e+01]
 [ 1.75864000e+02  1.69150000e+01  1.34805000e+02  1.64050000e+01
   2.40373503e+00  3.71783360e+02 -8.21688171e-01  3.89100000e+00
   1.64050000e+01  5.79830000e+01]
 [ 8.97440000e+01  1.35500000e+01  5.91490000e+01  1.16344000e+02
   2.63600000e+00  2.84584000e+02 -5.40000000e-01  1.36200000e+00
   6.16190000e+01  2.78876000e+02

In [10]:
DATA_TEST_PATH = '' # TODO: download test data and supply path here
# The first returned value is discarded; only the test features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned in any visible cell (the cells above
# produce w1..w4), so this fails on a fresh kernel unless it is defined first.
# NOTE(review): tX_test is passed as loaded, without the cleaning/scaling applied
# to the training features above -- confirm this matches how `weights` was fit.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)