In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
from implementations import *
%load_ext autoreload
%autoreload 2

## data preperation

### file opening

In [2]:
File_test = open('test.csv')
File_train = open('train.csv')

data_test = np.array(list(csv.reader(File_test)))
data_train = np.array(list(csv.reader(File_train)))

File_test.close()
File_train.close()

### features 

In [3]:
def make_features(X):
    # converting -999. to nan to use np.nanmean and np.nanstd
    X = np.where(X == -999., np.nan, X)
    # standardizing the data Xd = (X_d - E[X_d])/(std(X_d))
    X, means, stds = standardize(X)
    # since data is standirdized, the mean is more or less 0 for each feature so replacing by zero is reasonable and helps computations
    X = np.where(np.isnan(X), 0, X)
    # adding the 1 padding
    return np.column_stack((np.ones(X.shape[0]), X))

# converting the strings into floats and removing features names, labels and indexes
X = np.array(data_train[1:,2:]).astype(np.float)

num_train = 150000
num_val = 50000
num_test = 50000
N = X.shape[0]
assert num_train + num_val + num_test == N

training_set = make_features(X[:num_train])
validation_set = make_features(X[num_train:num_train+num_val])
test_set = make_features(X[-num_test:])

In [4]:
print(training_set.shape)
print(validation_set.shape)
print(test_set.shape)

(150000, 31)
(50000, 31)
(50000, 31)


### labels

In [5]:
def make_labels(data):
    labels = np.array(data[1:,1])
    return np.where(labels == 'b', 1, 0)
    
training_labels = make_labels(data_train[:num_train+1])
validation_labels = make_labels(data_train[num_train:num_train+num_val+1])
test_labels = make_labels(data_train[-num_test-1:])

In [6]:
print(training_labels.shape)
print(validation_labels.shape)
print(test_labels.shape)

(150000,)
(50000,)
(50000,)


## learning

In [12]:
losses = []
ws = []

for gamma in np.linspace(0.1,1,10):
    loss, w = least_squares_GD(validation_labels, validation_set, np.zeros(validation_set.shape[1]), 100, gamma)
    losses.append(loss)
    ws.append(w)

In [18]:
w = ws[np.argmin(losses)]

## test data

In [19]:
pred_tr = training_set @ w
pred_tr = np.where(pred_tr > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(training_labels - pred_tr)) / training_labels.shape[0]
print("accuracy on training set is of {} %".format(accuracy))

accuracy on training set is of 74.44533333333334 %


In [20]:
pred = test_set @ w
pred = np.where(pred > 1/2, 1, 0)
accuracy = 100 - 100 * np.sum(np.abs(test_labels - pred)) / test_labels.shape[0]
print("accuracy on test set is of {} %".format(accuracy))

accuracy on test set is of 74.386 %
