 # Useful imports


In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocessing

## Load the data

In [None]:
def load_data(path_file):
    """
    load the features and the labels of a CSV data file.

    The file is read with one header row skipped; column 1 holds the label
    ('b' or 's') and columns 2 to 31 hold the 30 features.

    Args:
        path_file: str. Path of the CSV file to read.
    Returns:
        x: numpy array of shape=(N,30). Features.
        y: numpy array of shape=(N,). Float labels: -1 for 'b', 1 for 's'
           and NaN for any other label.
    """
    labels = np.atleast_1d(np.genfromtxt(
        path_file, delimiter=",", skip_header=1, usecols=[1], dtype=str))
    # The labels are written into a separate float array: assigning -1/1 into
    # the string array itself keeps the string dtype (and truncates '-1').
    y = np.full(labels.shape, np.nan)
    y[labels == 'b'] = -1
    y[labels == 's'] = 1
    # atleast_2d keeps the (N,30) layout even when the file has a single row.
    x = np.atleast_2d(np.genfromtxt(
        path_file, delimiter=",", skip_header=1, usecols=range(2, 32)))
    return x, y

In [None]:
# Folder and file names of the CSV data sets.
DATA_FOLDER = 'Data/'
DATA_TRAIN = 'train.csv'
DATA_TEST = 'test.csv'

# Training set: features and labels.
train_path = DATA_FOLDER + DATA_TRAIN
tx, y = load_data(train_path)

# Submission set: only the 30 feature columns (2 to 31) are read.
test_path = DATA_FOLDER + DATA_TEST
feature_columns = range(2, 32)
tx_sub = np.genfromtxt(
    test_path, delimiter=",", skip_header=1, usecols=feature_columns)

## Normalize data and manage outliers
We replace the placeholder value -999.0 (treated here as an outlier) with the mean of its feature, computed without these values, so that the replaced entries become 0 once the data is standardized.

In [None]:
def standardize_manage_outliers(x, missing_value=-999.0):
    """
    set outliers to the mean value of the feature, then standardize the data

    The statistics of each feature are computed without the outliers, so the
    outliers end up at 0 after standardization. The input array is left
    untouched: the work is done on a float copy.

    Args:
        x: numpy array of shape=(N,D)
        missing_value: float. Value that marks an outlier (default -999.0).
    Returns:
        A new numpy array of shape (N,D), standardized along axis 0, with the
        outliers set to the mean value of the features (i.e. 0). A feature
        that is constant, or made only of outliers, is returned as all zeros.
    """
    # Float copy: protects the caller's data and avoids truncation when x
    # holds integers.
    x = np.array(x, dtype=float)
    for j in range(x.shape[1]):
        column = x[:, j]  # view: edits below are written into x
        is_outlier = column == missing_value
        valid = column[~is_outlier]
        if valid.size == 0:
            # No usable value at all: nothing to estimate the statistics from.
            column[:] = 0.0
            continue
        mean = np.mean(valid)
        std = np.std(valid)
        column[is_outlier] = mean
        column -= mean
        # A constant feature has std == 0; dividing would only produce NaN.
        if std > 0:
            column /= std
    return x

In [None]:
# Replace the outliers and standardize the training and the submission features.
# NOTE(review): each set is standardized with its own mean/std, so the two
# sets are not scaled with the same statistics - confirm this is intended.
tx = standardize_manage_outliers(tx)
tx_sub = standardize_manage_outliers(tx_sub)

## Polynomial augmentation

In [None]:
# Degree passed to build_poly for every data set.
deg = 5
# NOTE(review): tx_tr and tx_te are not defined in the cells shown here
# (presumably a train/test split of tx that lives in another cell) - confirm
# before running the notebook top to bottom.
# NOTE(review): build_poly is not defined in this notebook; presumably it
# comes from `implementations` - confirm its exact output layout there.
tx_tr_aug = build_poly(tx_tr,degree=deg)
tx_te_aug = build_poly(tx_te,degree=deg)
tx_sub_aug = build_poly(tx_sub,degree=deg)

## Other augmentations

In [None]:
# Indices of the features handed to add_fct (the first six).
features_to_fct = list(range(6))

# Candidate element-wise functions. The small 1e-4 terms keep the logarithm
# and the divisions away from an exact zero.
fct1 = lambda x: np.log(np.abs(x + 1e-4)) * x
fct2 = lambda x: np.cos(x) / (np.abs(x) + 1e-4)
fct2_1 = lambda x: np.cos(x * 2) / (np.abs(2 * x) + 1e-4)
fct3 = lambda x: np.sin(x)
fct3_1 = lambda x: np.sin(x * 2)
fct4 = lambda x: np.sin(np.exp(x))
fct5 = lambda x: np.sinc(x)
fct6 = lambda x: np.cos(x) / (1 + np.exp(x))
fct7 = lambda x: np.sin(x) / (1 + np.exp(x))

# Functions actually applied; fct1, fct4 and fct7 are defined but left out.
selected_fcts = (fct2, fct2_1, fct3, fct3_1, fct5, fct6)

# Every selected function is applied, in order, to the three data sets.
for fct in selected_fcts:
    options = {'features': features_to_fct, 'fct': fct}
    tx_tr_aug = add_fct(tx_tr, tx_tr_aug, **options)
    tx_te_aug = add_fct(tx_te, tx_te_aug, **options)
    tx_sub_aug = add_fct(tx_sub, tx_sub_aug, **options)

# Processing

In [None]:
def classify(y,seuil=0):
    """
    projects y on {-1,1}

    The input is not modified; a new array is returned.

    Args:
        y: numpy array of shape=(N,). Predictions of the model.
        seuil: float. Threshold for the projection.
    Returns:
        numpy array of shape=(N,): -1 where y < seuil and 1 where y >= seuil.
        NaN entries compare false both ways and are left unchanged.
    """
    y = np.asarray(y)
    # Both masks are taken from the raw predictions before anything is
    # overwritten; otherwise the freshly written -1 would be compared with
    # the threshold again and flipped to 1 whenever seuil <= -1.
    below = y < seuil
    above = y >= seuil
    labels = y.copy()
    labels[below] = -1
    labels[above] = 1
    return labels

## Ridge Regression

In [None]:
# Fit the model on the augmented training features with lambda_ = 1e-5.
# NOTE(review): ridge_regression is not defined in this notebook (presumably
# imported from `implementations`); the (w, loss) return order and the
# definition of y_tr should be confirmed there / in the split cell.
w,loss = ridge_regression(y_tr,tx_tr_aug,lambda_=1e-5)

# Post-processing

In [None]:
def check_model(w,x_test,y_test,seuil=0):
    '''
    Measure how well the linear model w predicts the labels of a test set.

    Args:
        w: numpy array of shape=(D,). Weights of the model.
        x_test: numpy array of shape=(N,D). Test data.
        y_test: numpy array of shape=(N,). Test labels.
        seuil: float. Threshold given to the classify function.
    Returns:
        y: numpy array of shape=(N,). Predictions projected on {-1,1} (see the classify function)
        accuracy: float. Fraction of the N predictions that match y_test.
    '''
    scores = np.dot(x_test, w)
    predicted = classify(scores, seuil)
    # The predictions are already in {-1,1}; compare them with the labels directly.
    matches = (y_test == predicted)
    return predicted, np.mean(matches)

In [None]:
# Evaluate the fitted weights on the held-out set; the cell displays the
# (predictions, accuracy) tuple returned by check_model.
check_model(w,tx_te_aug,y_te,seuil=0)

(array([-1.,  1., -1., ..., -1.,  1., -1.]), 0.82142)

# Submission

In [None]:
def submit_pred(y):
    '''
    Saves the predicted labels y in the appropriate format.

    The ids are read from the first column of the test file
    (DATA_FOLDER+DATA_TEST) and written next to the predictions in
    'submission.csv', under the header 'Id,Prediction'.

    Args:
        y: numpy array of shape=(N,). Predicted labels, one per row of the test file.
    Returns:
        None
    Raises:
        ValueError: if y does not hold exactly one prediction per id.
    '''
    ids = np.atleast_1d(np.genfromtxt(
        DATA_FOLDER+DATA_TEST, delimiter=",", skip_header=1, usecols=[0]))
    # Size the output from the argument itself, not from a notebook global.
    y = np.ravel(y)
    if y.shape[0] != ids.shape[0]:
        raise ValueError(
            'Expected %d predictions, got %d' % (ids.shape[0], y.shape[0]))
    to_submit = np.column_stack((ids, y)).astype(int)
    # Ids and labels are integers: '%d' avoids savetxt's default '%.18e',
    # which would write them in scientific notation.
    np.savetxt('submission.csv', to_submit, fmt='%d', delimiter=',',
               header='Id,Prediction', comments='')
    print('Successfully saved')

In [None]:
# Predict the labels of the submission set: linear scores projected on
# {-1,1} with the default threshold of classify.
y_sub = classify(np.dot(tx_sub_aug,w))
# Display the predictions as the cell output.
y_sub

array([-1., -1.,  1., ...,  1., -1., -1.])

In [None]:
# Write the predictions of the submission set to 'submission.csv'.
submit_pred(y_sub)

Successfully saved
