In [1]:
# Useful starting lines
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable the autoreload extension in mode 2 so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Relative paths of the training and test CSV files read by the cells below.
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download train data and supply path here
DATA_TEST_PATH = 'data/test.csv' # TODO: download test data and supply path here

We load the training data into `y` (labels), `tX` (input matrix) and `ids` (event ids).

In [3]:
# Read the training CSV into the labels (y), the input matrix (tX) and the
# event ids (ids). load_csv_data is presumably provided by the wildcard import
# from proj1_helpers — confirm.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Pre-processing functions by Victor

In [4]:
from helpers import standardize

def na(x):
    '''Tell whether `x` holds at least one entry equal to -999 (the NA marker).'''
    sentinel_hits = (x == -999)
    return np.any(sentinel_hits)

def pos(x):
    '''Tell whether every entry of `x` is strictly greater than zero.'''
    strictly_positive = (x > 0)
    return np.all(strictly_positive)

def process_data(data, impute_strategy='most_frequent', inv_log_cols=None):
    '''Impute missing values and append inverse-log features.

    Every entry equal to -999 is treated as missing and is replaced, column by
    column, with a statistic computed from that column's remaining entries.
    Afterwards ``log(1 / (1 + x))`` is evaluated on the columns listed in
    ``inv_log_cols`` and the result is appended to the right of the imputed
    matrix.

    The work is done on a floating-point copy, so the array passed in by the
    caller is never modified and integer input does not truncate the fill
    values.

    Parameters
    ----------
    data : array_like, shape (n_samples, n_features)
        Input matrix in which -999 marks a missing value.
    impute_strategy : {'most_frequent', 'median', 'mean'}
        Statistic used to fill the missing entries of a column.
    inv_log_cols : sequence of int, optional
        Indices of the columns to transform. When omitted, the list
        [0, 2, 5, 7, 9, 10, 13, 16, 19, 21, 23, 26] is used, which requires
        ``data`` to have at least 27 columns.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_features + len(inv_log_cols))
        The imputed matrix followed by the inverse-log columns.

    Raises
    ------
    ValueError
        If ``impute_strategy`` is not one of the supported values (checked up
        front, even when no column has missing values), or if a column
        consists only of missing values and therefore cannot be imputed.
    '''
    missing_value = -999.0
    default_inv_log_cols = [0, 2, 5, 7, 9, 10, 13, 16, 19, 21, 23, 26]

    # Validate before touching the data so a typo in the strategy is reported
    # regardless of whether any column happens to contain missing values.
    if impute_strategy not in ('most_frequent', 'median', 'mean'):
        raise ValueError("The given impute strategy is invalid")

    if inv_log_cols is None:
        inv_log_cols = default_inv_log_cols
    # An explicit integer dtype keeps an empty selection usable as an index.
    log_col_idx = np.asarray(inv_log_cols, dtype=np.intp)

    # np.array copies by default: imputation below must not leak to the caller.
    imputed = np.array(data, dtype=float)

    # Impute missing data
    for i in range(imputed.shape[1]):
        column = imputed[:, i]  # view into `imputed`, so writes land in the copy
        observed_mask = (column != missing_value)

        # Nothing to impute in this column
        if observed_mask.all():
            continue

        observed = column[observed_mask]
        if observed.size == 0:
            raise ValueError(
                "Column {} contains only missing values and cannot be imputed".format(i)
            )

        if impute_strategy == 'most_frequent':
            values, counts = np.unique(observed, return_counts=True)
            fill_value = values[np.argmax(counts)]
        elif impute_strategy == 'median':
            fill_value = np.median(observed)
        else:  # 'mean' — the only remaining valid strategy
            fill_value = np.mean(observed)

        column[~observed_mask] = fill_value

    # Create inverse log values of the selected features. The transform is only
    # finite for entries greater than -1; the selected columns are expected to
    # be positive in value.
    data_inv_log_cols = np.log(1 / (1 + imputed[:, log_col_idx]))
    processed_data = np.hstack((imputed, data_inv_log_cols))

    return processed_data

## Pre-processing of input data

Here we decide whether or not to pre-process our data: run the following cell to apply the pre-processing.

In [5]:
# NOTE: this cell rebinds tX, so it is not idempotent — running it a second
# time would feed the already-processed matrix back into process_data.
# Re-run the loading cell first if this cell has to be executed again.
tX = process_data(tX)
# Keep the two extra values returned by standardize (presumably the
# per-feature mean and standard deviation — confirm in helpers): they are
# passed back to standardize when the test data is processed.
tX, mean_tX, std_tX = standardize(tX)

## Least squares

In [6]:
from least_squares import least_squares

# Fit on the training labels and the current tX; the result is used later to
# predict labels for the test data.
weights = least_squares(y, tX)

# Destination of the submission file written by the last cell of the notebook.
OUTPUT_PATH = 'data/output.csv' # TODO: fill in desired name of output file for submission

## Import the test data

In [12]:
# Same loader as for the training file; the first returned value (the labels
# in the training call) is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

Process it (or not): the test data must go through the same pre-processing as the training data.

In [13]:
# Same pre-processing as applied to the training matrix.
# NOTE(review): process_data computes its fill values from the array it is
# given, so missing test entries are imputed from test-set statistics, not
# from the training set — confirm this is intended.
tX_test = process_data(tX_test)
# mean_tX and std_tX come from the training-data standardize call; passing
# them presumably makes standardize reuse them instead of recomputing —
# confirm in helpers. The returned statistics are discarded.
tX_test, _, _ = standardize(tX_test, mean_tX, std_tX)

## Generate predictions and save output in csv format for submission:

In [15]:
# Predict labels for the test matrix from the fitted weights, then write them
# together with the test event ids to OUTPUT_PATH.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)