# **TODO**
* Cross-validation
* Feature cross-products (pairwise interaction terms)
* Square and cube roots
* Check whether the phi features cause overfitting

# Import libraries

In [1]:
%matplotlib inline 
import numpy as np   # generic stuff
import matplotlib.pyplot as plt

from lib.proj1_helpers import * #the helper provided for the project

from implementations import * #our implementations of the functions done by us
from helpers import *
import datetime
# Useful starting lines

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Loading the training and the testing sets

In [2]:
# Relative folder that holds train.csv and test.csv.
DATA_FOLDER = 'data/'

# Load the training file without sub-sampling; the helper's result is unpacked
# into three values (named here as labels, feature matrix and ids).
y_train, tx_train, ids_train = load_csv_data(DATA_FOLDER+'train.csv',sub_sample=False)

# Same call for the test file; ids_test is reused when writing the submission.
y_test, tx_test, ids_test = load_csv_data(DATA_FOLDER+'test.csv',sub_sample=False)

In [3]:
# Quick look at the training labels as loaded.
y_train

array([ 1., -1., -1., ...,  1., -1., -1.])

In [4]:
# Quick look at the raw training features; -999 entries are handled in the next section.
tx_train

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ..., 
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

# Handling missing values (-999)

In [5]:
def clean_missing_values(tx, missing_value=-999):
    """Replace sentinel-coded missing entries with the per-column median.

    The median of each column is computed over the entries that are NOT equal
    to `missing_value`, and every missing entry of that column is set to it.

    Args:
        tx: 2-D feature array (rows are samples, columns are features).
        missing_value: sentinel that marks a missing entry (default -999).

    Returns:
        A tuple `(cleaned, nan_values)` where `cleaned` is a new array with the
        missing entries filled in, and `nan_values` is an integer 0/1 array of
        the same shape with 1 wherever `tx` held the sentinel.

    Notes:
        * The input array is left untouched; the imputation is done on a copy.
        * A column made only of missing entries has no median; it is filled
          with 0.0 instead of propagating NaN to the rest of the pipeline.
    """
    cleaned = np.array(tx, copy=True)
    is_missing = cleaned == missing_value
    nan_values = is_missing * 1

    for col in range(cleaned.shape[1]):
        col_missing = is_missing[:, col]
        if not col_missing.any():
            # Nothing to impute in this column.
            continue
        observed = cleaned[~col_missing, col]
        # np.median of an empty selection would be NaN (with a warning).
        fill_value = np.median(observed) if observed.size else 0.0
        cleaned[col_missing, col] = fill_value

    return cleaned, nan_values

In [6]:
# Impute both sets; each call returns the imputed matrix together with a 0/1
# mask marking where -999 was found.
# NOTE(review): the test set is imputed with its own column medians rather than
# the training medians — confirm this is intended.
tx_train, nan_values_train = clean_missing_values(tx_train)
tx_test, nan_values_test = clean_missing_values(tx_test)

In [7]:
def add_nan_feature(tx, nan_values, reference_nan_values=None):
    """Append missing-value indicator columns to a feature matrix.

    Only the indicator columns that contain at least one 1 in the reference
    mask are kept, so every dataset processed against the same reference ends
    up with the same set of extra columns.

    Args:
        tx: 2-D feature array.
        nan_values: 0/1 mask with the same number of rows as `tx`, 1 marking an
            entry that was missing.
        reference_nan_values: mask used to decide which indicator columns to
            keep. When omitted, the notebook-level `nan_values_train` is used,
            which is what this function always did.

    Returns:
        `tx` with the retained indicator columns concatenated on the right.
    """
    if reference_nan_values is None:
        # Backward-compatible fallback on the training mask defined at notebook
        # level; pass `reference_nan_values` explicitly to avoid this lookup.
        reference_nan_values = nan_values_train
    reference_nan_values = np.asarray(reference_nan_values)

    # Columns whose reference mask is all zeros carry no information.
    cols_wo_nan_ids = np.flatnonzero(~reference_nan_values.any(axis=0))
    indicators = np.delete(nan_values, cols_wo_nan_ids, axis=1)
    return np.concatenate((tx, indicators), axis=1)

In [8]:
# Append the missing-value indicator columns to the training matrix and print
# its shape before and after to check how many columns were added.
print(tx_train.shape)
tx_train = add_nan_feature(tx_train, nan_values_train)
print(tx_train.shape)

(250000, 30)
(250000, 41)


In [9]:
# add_nan_feature picks the indicator columns to keep from the global
# nan_values_train, so the test matrix gets the same columns as the training one.
tx_test = add_nan_feature(tx_test, nan_values_test)

In [10]:
# Sanity check of the training matrix shape after adding the indicator columns.
print(tx_train.shape)

(250000, 41)


# Manage categorical data (feature 22)

In [11]:
# Column 22 is the one treated as categorical (one-hot encoded below).
tx_train[:,22]

array([ 2.,  1.,  1., ...,  1.,  0.,  0.])

In [12]:
def manage_categorical_data(tx, idx_cat=22, categories=(0, 1, 2, 3)):
    """One-hot encode a categorical column of a feature matrix.

    The column at `idx_cat` is removed and replaced by one 0/1 column per value
    in `categories`, appended on the right in the order given.

    Args:
        tx: 2-D feature array.
        idx_cat: index of the categorical column (default 22).
        categories: values to encode, one output column each
            (default 0, 1, 2 and 3).

    Returns:
        A new array with `tx.shape[1] - 1 + len(categories)` columns. A row
        whose value is not listed in `categories` gets zeros in every one-hot
        column.
    """
    category_column = tx[:, idx_cat]
    one_hot = np.zeros([tx.shape[0], len(categories)])
    for position, category in enumerate(categories):
        one_hot[:, position] = category_column == category

    remaining = np.delete(tx, idx_cat, axis=1)
    return np.concatenate((remaining, one_hot), axis=1)

In [13]:
# One-hot encode the categorical column of the training matrix; the net change
# is +3 columns (one column removed, four added).
print(tx_train.shape)
tx_train = manage_categorical_data(tx_train)
print(tx_train.shape)

(250000, 41)
(250000, 44)


In [14]:
# Apply the same one-hot encoding to the test matrix.
tx_test = manage_categorical_data(tx_test)

# Standardization of data

In [15]:
# standardize() returns an indexable result; only its first element is kept as
# the standardized matrix.
# NOTE(review): the other returned values (presumably mean/std — confirm in
# helpers.py) are discarded, and the test set is standardized independently of
# the training set — confirm this is intended.
tx_train = standardize(tx_train)
tx_train = tx_train[0]
tx_test = standardize(tx_test)
tx_test = tx_test[0]

# Add a column of all ones

In [16]:
def add_ones(tx):
    """Return `tx` with one extra right-most column filled with ones."""
    n_rows = tx.shape[0]
    ones_column = np.ones([n_rows, 1])
    return np.append(tx, ones_column, axis=1)

In [17]:
# Append the constant column to both matrices (done after standardization, so
# the column stays equal to one).
tx_train = add_ones(tx_train)
tx_test = add_ones(tx_test)

# Add powers of the columns

In [18]:
def build_poly(x, degree):
    """Polynomial expansion of `x`: the powers x**1, x**2, ..., x**degree.

    For a 1-D `x` the result has one row per element of `x` and one column per
    exponent, in increasing order.
    """
    powers = []
    for exponent in range(1, degree + 1):
        powers.append(x ** exponent)
    return np.transpose(np.array(powers))

In [19]:
def add_powers(tx, degree):
    """Append the polynomial expansion of every column of `tx`.

    For each column, in order, the block returned by
    `build_poly(column, degree)` is appended to the right of the matrix.

    Args:
        tx: 2-D feature array.
        degree: highest exponent passed to `build_poly`.

    Returns:
        A new array made of the original columns followed by one block of
        `degree` columns per original column.

    Notes:
        `build_poly` starts at exponent 1, so the first column of each block
        repeats a column already present in `tx`.
        NOTE(review): these exact duplicates (including repeated copies of any
        constant column) may be unintended — confirm before changing, since
        removing them alters the number of returned columns.
    """
    # Collect every block first and concatenate once: growing the matrix inside
    # the loop would copy the whole array again for each column.
    blocks = [tx]
    for col in range(tx.shape[1]):
        blocks.append(build_poly(tx[:, col], degree))
    return np.concatenate(blocks, axis=1)

In [20]:
# Highest polynomial degree, applied identically to the training and test sets.
deg = 11
tx_train = add_powers(tx_train, deg)
tx_test = add_powers(tx_test, deg)

# Submission trials

## Least squares

## Gradient descent

## Stochastic gradient descent

## Ridge regression

In [21]:
# NOTE(review): max_iters is not used anywhere in this cell.
max_iters = 100
# Regularization strength passed to ridge_regression.
lambda_ = 1
weights = ridge_regression(y_train, tx_train, lambda_);
# NOTE(review): the 'losses:' label is printed but no loss value is passed to
# print(), so that section of the output is always empty.
print('losses: \n','\n\n','weights: \n',weights)

losses: 
 

 weights: 
 [  1.39069172e-02  -5.20547866e-02   8.79203751e-03   9.02012251e-03
   5.54651156e-03  -2.07207833e-02   5.52977045e-03   5.91592391e-03
   1.84893597e-04   2.56195774e-03   4.67977531e-03   6.90215333e-03
   5.78494471e-03   1.89410495e-02   5.75738141e-03   5.71183808e-03
  -8.32963127e-03   5.77115749e-03   5.79659569e-03  -1.04500544e-02
   5.82350941e-03  -4.84224176e-03  -3.00635895e-03   5.76233238e-03
   5.77744287e-03  -2.36503999e-03   5.78067360e-03   5.74494364e-03
   3.48055613e-03   5.35626772e-03   5.63251834e-03   5.63296048e-03
   5.63296048e-03   5.63296048e-03   5.58818558e-03   5.58900226e-03
   5.58900226e-03   5.63296174e-03   5.63296174e-03   5.63296174e-03
   5.58975420e-03   5.81151960e-03   5.79777582e-03   5.67810854e-03
  -1.47450757e-02   1.39069320e-02   2.53399493e-02   2.02345213e-02
  -1.71776808e-03  -1.80300917e-02   9.01117402e-03  -1.86973918e-03
   2.01789602e-04  -1.17530382e-05   3.44164181e-07  -3.84892782e-09
  -5.20547

In [22]:
# Predict test labels from the fitted ridge weights.
y_pred = predict_labels(weights, tx_test)

In [23]:
# Output path handed to the submission helper, together with the test ids and
# the predictions.
name = 'output/ridge_regression_ondine'
create_csv_submission(ids_test, y_pred, name)

## Logistic regression

## Regularized logistic regression

## Newton regularized logistic regression