In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocessing

## Load the data

In [229]:
def load_data(path_file):
    """Load the features and the numeric labels from a training CSV file.

    The first line of the file is treated as a header and skipped. Column 1
    is read as the class label and columns 2 to 31 as the features.

    Args:
        path_file: path of the CSV file to read.

    Returns:
        x: numpy float array holding columns 2..31 of every data row.
        y: numpy float array with one entry per data row: -1.0 where the
            label is 'b', 1.0 where it is 's', NaN for any other label.
    """
    # atleast_1d: genfromtxt squeezes a single-row file down to a 0-d array.
    labels = np.atleast_1d(np.genfromtxt(
        path_file, delimiter=",", skip_header=1, usecols=[1], dtype=str))
    # Build a separate numeric vector instead of writing -1/1 back into the
    # string array: numbers assigned into a string array are stored as text
    # (and truncated to the width of the labels), which is unusable as a
    # regression target.
    y = np.full(labels.shape, np.nan)
    y[labels == 'b'] = -1.0
    y[labels == 's'] = 1.0
    x = np.genfromtxt(
        path_file, delimiter=",", skip_header=1, usecols=range(2, 32))
    return x, y

In [230]:
# Location of the CSV files, given relative to the current working directory.
DATA_FOLDER = 'Data/'
DATA_TEST = 'test.csv'
DATA_TRAIN = 'train.csv'

# Training set: features and labels as returned by load_data.
tx,y = load_data(DATA_FOLDER+DATA_TRAIN)
# Submission set: only columns 2..31 are read, the same column range load_data uses for the features.
tx_sub = np.genfromtxt(DATA_FOLDER+DATA_TEST, delimiter=",", skip_header=1, usecols=range(2,32))

## Remove noisy features

In [231]:
def remove_noisy_features(x,features):
    '''
    Drop the listed columns from x.

    Args:
        x: numpy array of shape=(N,D).
        features: iterable of column indices to drop (negative indices count from the end).
    Returns:
        numpy array with the rows of x and only the columns that were not listed.
    '''
    positions = range(x.shape[1])
    # Indexing the range normalizes negative indices and raises IndexError
    # for positions that do not exist.
    dropped = {positions[i] for i in features}
    keep = [j not in dropped for j in positions]
    return x[:, keep]

In [232]:
def outliers_ratio(df):
    '''
    Build a table giving, for each feature, its percentage of outliers.

    An outlier is an entry equal to -999.0. The first two columns of df are skipped.

    Args:
        df: pandas DataFrame.
    Returns:
        pandas DataFrame with one row per inspected column and the columns
        'var_name' and 'percentage_outliers'.
    '''
    n_rows = len(df)
    names = []
    percentages = []
    for name in df.columns[2:]:
        is_outlier = df[name] == -999.0
        n_outliers = df.loc[is_outlier, name].count()
        names.append(name)
        percentages.append(n_outliers/n_rows*100)
    return pd.DataFrame(data={'var_name': names, 'percentage_outliers': percentages})

Remove from the data all the features whose percentage of outliers is greater than or equal to `seuil_outliers`.

In [233]:
# Threshold, in percent of -999.0 entries, at or above which a feature is dropped
# ("seuil" is French for "threshold").
seuil_outliers = 100
# Rank the features from the "noisiest" to the "cleanest".
data = pd.read_csv(DATA_FOLDER+DATA_TRAIN)
train_out = outliers_ratio(data).sort_values(ascending=False, by='percentage_outliers')
# Row labels of the features to remove. outliers_ratio skips the first two CSV
# columns, just like usecols=range(2,32) when tx and tx_sub are read, so these
# labels line up with column positions in tx and tx_sub.
features = train_out[train_out.percentage_outliers >= seuil_outliers].index
tx,tx_sub = remove_noisy_features(tx,features),remove_noisy_features(tx_sub,features)

## Split train and test data

In [234]:
def split_data(x, y, ratio=0.8, seed=1):
    """
    Shuffle the samples and split them into a train part and a test part.

    Args:
        x: numpy array whose first axis indexes the N samples, e.g. shape (N,) or (N,D).
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]. Fraction of the samples put in the train part.
        seed: integer. Seed of the shuffle.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.

    >>> split_data(np.arange(13), np.arange(13), 0.8, 1)
    (array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]), array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]))
    """
    # A private generator yields the same permutation as np.random.seed(seed)
    # followed by np.random.permutation, without reseeding the global NumPy
    # random state that other code may rely on.
    rng = np.random.RandomState(seed)

    n_samples = x.shape[0]
    cut = int(n_samples * ratio)
    permutation = rng.permutation(n_samples)
    x, y = x[permutation], y[permutation]
    return x[:cut], x[cut:], y[:cut], y[cut:]

In [235]:
# Fraction of the samples kept for training; the remainder is held out for evaluation.
split_ratio = 0.8
tx_tr,tx_te,y_tr,y_te = split_data(tx,y,ratio=split_ratio)

In [236]:
# Sanity check: the held-out features and labels should have the same number of rows.
tx_te.shape,y_te.shape

((50000, 30), (50000,))

## Normalize data and manage outliers
We replace the outliers (entries equal to -999) with the mean value of their feature (computed without the outliers), so that they become 0 once the data is standardized.

In [237]:
def standardize_manage_outliers(x):
    """
    Set outliers to the mean value of their feature, then standardize the data.

    An outlier is an entry equal to -999.0. For every column, the mean and the
    standard deviation are computed from the non-outlier entries only; the
    outliers are replaced by that mean and therefore end up at 0.

    Args:
        x: numpy array of shape=(N,D). It is not modified.
    Returns:
        A new float numpy array of shape (N,D), standardized column by column.
        A column without any non-outlier entry is returned as all zeros, and a
        constant column is centered without being rescaled.
    """
    # Work on a float copy so the caller's array is left untouched and integer
    # input is not truncated by the in-place writes below.
    x = np.array(x, dtype=float)
    for j in range(x.shape[1]):
        column = x[:, j]  # view into the copy: writes below update x
        missing = column == -999.0
        observed = column[~missing]
        if observed.size == 0:
            # No value to estimate a mean from; the mean of an empty
            # selection would turn the whole column into NaN.
            column[:] = 0.0
            continue
        mean = observed.mean()
        std = observed.std()
        if std == 0:
            # Constant column: dividing by zero would produce NaN.
            std = 1.0
        column[missing] = mean
        column[:] = (column - mean) / std
    return x

In [238]:
# Each set is standardized with the mean/std computed from that set alone.
# NOTE(review): tx_te and tx_sub are therefore not scaled with the training
# statistics; confirm this is intended.
tx_tr = standardize_manage_outliers(tx_tr)
tx_te = standardize_manage_outliers(tx_te)
tx_sub = standardize_manage_outliers(tx_sub)

## Polynomial augmentation

In [239]:
# Degree passed to build_poly (imported from implementations) for all three sets.
deg = 5
tx_tr_aug = build_poly(tx_tr,degree=deg)
tx_te_aug = build_poly(tx_te,degree=deg)
tx_sub_aug = build_poly(tx_sub,degree=deg)

## Other augmentations

In [240]:
# Column indices handed to add_fct: the first six features.
features_to_fct = [i for i in range(6)]
# Candidate element-wise transforms. In fct2 and fct2_1 the 1e-4 term keeps the
# denominator strictly positive.
fct1 = lambda x:np.log(np.abs(x+1e-4))*x
fct2 = lambda x:np.cos(x)/(np.abs(x)+1e-4)
fct2_1 = lambda x:np.cos(x*2)/(np.abs(2*x)+1e-4)
fct3 = lambda x:np.sin(x)
fct3_1 = lambda x:np.sin(x*2)
fct4 = lambda x: np.sin(np.exp(x))
fct5 = lambda x:np.sinc(x)
fct6 = lambda x:np.cos(x)/(1+np.exp(x))
fct7 = lambda x:np.sin(x)/(1+np.exp(x))
# Only the transforms listed here are applied; fct1, fct4 and fct7 are defined
# above but not used in this cell.
for fct in [fct2, fct2_1, fct3, fct3_1, fct5, fct6]:
    # NOTE(review): add_fct comes from implementations; presumably it appends
    # fct applied to the selected raw columns to the augmented matrix (confirm).
    tx_tr_aug = add_fct(tx_tr, tx_tr_aug, features=features_to_fct, fct=fct)
    tx_te_aug = add_fct(tx_te, tx_te_aug, features=features_to_fct, fct=fct)
    tx_sub_aug = add_fct(tx_sub, tx_sub_aug, features=features_to_fct, fct=fct)

# Processing

In [241]:
def classify(y,seuil=0):
    """
    Projects y on {-1,1}.

    Args:
        y: numpy array of shape=(N,). Predictions of the model. It is not modified.
        seuil: float. Threshold for the projection ("seuil" is French for "threshold").
    Returns:
        A new array of the same shape as y: -1 where y < seuil, 1 where y >= seuil.
        NaN entries satisfy neither comparison and are left as NaN.
    """
    # Copy so the caller's prediction array is not overwritten.
    y = np.array(y)
    # Evaluate both conditions on the raw values before writing anything:
    # otherwise, when seuil <= -1, the entries just set to -1 would satisfy
    # `y >= seuil` and be flipped to 1.
    below = y < seuil
    at_or_above = y >= seuil
    y[below] = -1
    y[at_or_above] = 1
    return y

## Ridge Regression

In [242]:
# Fit ridge regression on the augmented training set; the result is unpacked as (weights, loss).
w,loss = ridge_regression(y_tr,tx_tr_aug,lambda_=1e-5)

# Post-processing

In [248]:
def check_model(w,x_test,y_test,seuil=0):
    '''
    Measure the accuracy of a linear model on labelled data.

    Args:
        w: numpy array of shape=(D,). Weights of the model.
        x_test: numpy array of shape=(N,D). Test data.
        y_test: numpy array of shape=(N,). Test labels.
        seuil: float. Threshold handed to classify when projecting the raw predictions.
    Returns:
        y: numpy array of shape=(N,). Projection of x_test.w on {-1,1} (see the classify function)
        accuracy: float. Number of correctly predicted labels divided by the total number of predictions (N).
    '''
    predicted = classify(np.dot(x_test, w), seuil)
    # The labels are already projected at this point, so projecting them a
    # second time would leave them unchanged; compare them directly.
    hits = (y_test == predicted)
    return predicted, hits.sum()/len(hits)

In [249]:
# Evaluate the fitted weights on the held-out split, projecting at threshold 0.
check_model(w,tx_te_aug,y_te,seuil=0)

(array([-1.,  1., -1., ..., -1.,  1., -1.]), 0.82142)

In [221]:
# Grid search over the polynomial degree and the ridge penalty.
# accuracies[i, j] is the held-out accuracy for degrees[i] and lambdas[j];
# -1 marks a combination whose fit raised an exception.
degrees = [1,2,3,5,8,12]
lambdas = np.logspace(-3,1,num=6)
accuracies = np.zeros((len(degrees),len(lambdas)))

# Search-local names (grid_*) are used on purpose: reusing w, deg, tx_tr_aug
# and tx_te_aug here would overwrite the model and the matrices that other
# cells read.
for i, grid_deg in enumerate(degrees):
    # The expansion depends on the degree only, so build it once per degree
    # rather than once per (degree, lambda) pair.
    grid_tr = build_poly(tx_tr,degree=grid_deg)
    grid_te = build_poly(tx_te,degree=grid_deg)
    for j, grid_lambda in enumerate(lambdas):
        try:
            grid_w,_ = ridge_regression(y_tr,grid_tr,lambda_=grid_lambda)
            accuracies[i,j] = check_model(grid_w,grid_te,y_te)[1]
        except Exception as e:
            # Best effort: keep scanning the grid, but report the failure
            # instead of hiding it behind the -1 marker alone.
            accuracies[i,j] = -1
            print('degree = {deg}, lambda = {lambda_}: failed ({err!r})'.format(
                deg=grid_deg,lambda_=grid_lambda,err=e))
    # Number of degrees completed so far (1-based).
    print('{i}/{n}'.format(i=i+1,n=len(degrees)))

KeyboardInterrupt: 

Best parameters found by the grid search: lambda = 0.00630957344480193, degree = 5

In [830]:
# (row, column) position of the highest accuracy in the grid:
# the row indexes degrees, the column indexes lambdas.
ind = np.unravel_index(np.argmax(accuracies, axis=None), accuracies.shape)
accuracies[ind]

0.7957733333333333

In [833]:
# Report the grid point selected by ind.
print('lambda = {lambda_}, degree = {deg}'.format(deg = degrees[ind[0]],lambda_=lambdas[ind[1]]))

lambda = 0.00630957344480193, degree = 5


# Submission

In [22]:
def submit_pred(y):
    '''
    Saves the predicted labels y to 'submission.csv' in the submission format.

    The ids are read from column 0 of the test file (DATA_FOLDER+DATA_TEST).
    The output has a header line 'Id,Prediction' followed by one
    'id,label' line per prediction, both written as integers.

    Args:
        y: numpy array of shape=(N,). Predicted labels, in the row order of the test file.
    Returns:
        None
    '''
    ids = np.genfromtxt(DATA_FOLDER+DATA_TEST, delimiter=",", skip_header=1,usecols=[0])
    y = np.asarray(y)
    # Size the table from the argument, not from the notebook-level y_sub,
    # so the function also works for predictions stored under another name.
    to_submit = np.zeros((y.shape[0],2))
    to_submit[:,0] = ids
    to_submit[:,1] = y
    # fmt='%d': savetxt's default format would write ids and labels in
    # scientific notation (e.g. 3.5e+05) instead of plain integers.
    np.savetxt('submission.csv', to_submit,delimiter=',',header='Id,Prediction',comments='',fmt='%d')
    print('Successfully saved')

In [23]:
# Predict the submission labels, projecting at classify's default threshold (0).
# NOTE(review): w must have been fitted on features built the same way as
# tx_sub_aug (same degree and extra transforms); verify before submitting.
y_sub = classify(np.dot(tx_sub_aug,w))
y_sub

array([-1., -1.,  1., ...,  1., -1., -1.])

In [24]:
# Fraction of the submission samples predicted as class 1.
len(y_sub[y_sub==1])/len(y_sub)

0.29279280864708096

In [25]:
# Write the predictions to submission.csv.
submit_pred(y_sub)

Successfully saved
