In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import time
%load_ext autoreload
%autoreload 2

In [2]:
start = time.time()
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)


In [3]:
def split_data(x, y, ratio,seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # ***************************************************
    # INSERT YOUR CODE HERE
    # split the data based on the given ratio: TODO
    # ***************************************************
    indices = np.random.permutation(x.shape[0])
    training_ratio = int(np.floor(ratio * x.shape[0]))

    x_training = x[indices[0:training_ratio]]
    y_training = y[indices[0:training_ratio]]
    x_testing = x[indices[training_ratio:]]
    y_testing = y[indices[training_ratio:]]
    
    return x_training, x_testing, y_training, y_testing

In [4]:
# Removing bothering data and centering
tX[tX==-999] = 0
m = np.mean(tX, axis=0)
centered_tX = tX - m

centered_tX[centered_tX==0] = float('nan')
stdevtrain = np.nanstd(centered_tX, axis=0)
centered_tX[centered_tX==float('nan')] = 0
standardized_tX = centered_tX / stdevtrain

d = len(standardized_tX[0])
n = len(standardized_tX)

indices_s_deg = []
indices_t_deg = []

print("Creating indices...")
# Creating indices for subsets of degree 2
for i in range (d):
    for t in range (i,d):
        indices_s_deg.append([t, i])
indices_s_deg = np.array(indices_s_deg).T

# Creating indices for subsets of degree 3
max_t_degree = 2
for i in range (max_t_degree):
    for t in range (i,max_t_degree):
        for j in range(t,max_t_degree):
            if not (i == t and i == j):
                indices_t_deg.append([j, t, i])
indices_t_deg = np.array(indices_t_deg).T

degrees = range(3,11)
degrees_number = len(degrees) + 1
stdX_Ncols = standardized_tX.shape[1]
indices_s_Ncols = indices_s_deg.shape[1]
indices_t_Ncols = indices_t_deg.shape[1]

number_of_rows = indices_s_Ncols + degrees_number * stdX_Ncols + indices_t_Ncols

mat = np.zeros((n, number_of_rows))

print("Computing first degree...")
# First degree
mat[:, :stdX_Ncols] = standardized_tX

print("Computing second degree with combinations...")
# Second degree gotten from indices
mat[:,stdX_Ncols:stdX_Ncols + indices_s_Ncols] = standardized_tX[:, indices_s_deg[0]] * standardized_tX[:, indices_s_deg[1]]

print("Computing from degree 3 to 10 without combinations...")
# Improve 3 to 10 degree
for i in degrees:
    start_index = indices_s_Ncols + (i - 2) * stdX_Ncols
    end_index = start_index + stdX_Ncols
    mat[:,start_index:end_index] = standardized_tX**i
    
print("Computing third degree with some combinations...")
# Third degree gotten from indices
mat[:, number_of_rows - indices_t_Ncols: number_of_rows] = standardized_tX[:, indices_t_deg[0]] * standardized_tX[:, indices_t_deg[1]] * standardized_tX[:, indices_t_deg[2]]

Creating indices...
Computing first degree...
Computing second degree with combinations...
Computing from degree 3 to 10 without combinations...
Computing third degree with some combinations...


In [5]:
# Centering again
m2 = np.mean(mat, axis=0)
centered_mat = mat - m2
centered_mat[mat==0] = 0

centered_mat[centered_mat==0] = float('nan')
stdev = np.nanstd(centered_mat, axis=0)
centered_mat[centered_mat==float('nan')] = 0
standardized_mat = centered_mat / stdev

In [6]:
num_samples = len(standardized_mat)
tx = np.c_[np.ones(num_samples), standardized_mat]
#y=list(y)
#y[y==-1]=0

In [7]:
def compute_loss(y, tx, w):
    """Calculate the loss.
    
    You can calculate the loss using mse or mae.
    """
    e = y - np.dot(tx, w)
    mse = np.dot(e.transpose(), e) / (2 * len(tx))
    return mse

In [8]:
tX, testx, y, testy = split_data(tx, y, 0.8, seed=1)

In [9]:
from __future__ import division
import numpy as np 
def logistic(a):
    return 1.0 / (1 + np.exp(-a))
def irls(X, y):
    theta = np.zeros(X.shape[1])
    theta_ = np.inf
    eps = 60000
    lamda = 5
    for aqua in range (11):
        grad = np.zeros(X.shape[1])
        a = np.dot(X, theta)
        #a=math.log10(abs(a))*sign(a)
        #print(min(a),max(a))
        pi = logistic(a)        
        SX = X * (pi - pi*pi).reshape(-1, 1)
        XSX = np.dot(X.T, SX)
        del(SX)
        #+lamda*np.eye((len(X[0])))
        for aw in range (len(X)):
            grad = grad + (-1 / len(X)) * (y[aw] * X[aw, :] * logistic(-y[aw] * np.dot(X[aw,:], theta)))
        
        
     
        theta = theta - eps * np.linalg.solve(XSX, grad)
        print(sum(y == np.sign(np.dot(X, theta))) / len(y))
       
        if aqua % 5 == 0 and aqua != 0:
            eps = eps * 0.8
    return theta

In [10]:
print("Freeing memory")
del(centered_mat, centered_tX, standardized_mat, standardized_tX, stdX_Ncols)
del(indices_s_deg, indices_s_Ncols, indices_t_deg, indices_t_Ncols)
del(mat, DATA_TRAIN_PATH, ids, stdev, testx, testy)

lens = [(x,len(x)) for x in set(dir()) - set(dir(__builtins__))]
testout = sorted(lens, key=lambda l: l[1])
print(testout[::-1])

Freeing memory
[('create_csv_submission', 21), ('degrees_number', 14), ('predict_labels', 14), ('number_of_rows', 14), ('load_csv_data', 13), ('__builtins__', 12), ('max_t_degree', 12), ('compute_loss', 12), ('num_samples', 11), ('__builtin__', 11), ('start_index', 11), ('split_data', 10), ('stdevtrain', 10), ('end_index', 9), ('logistic', 8), ('division', 8), ('degrees', 7), ('start', 5), ('irls', 4), ('_iii', 4), ('time', 4), ('quit', 4), ('_i10', 4), ('exit', 4), ('_i3', 3), ('plt', 3), ('_i8', 3), ('_i7', 3), ('_dh', 3), ('_i2', 3), ('___', 3), ('_ih', 3), ('_i1', 3), ('_oh', 3), ('_sh', 3), ('_i6', 3), ('csv', 3), ('_ii', 3), ('Out', 3), ('_i5', 3), ('_i9', 3), ('_i4', 3), ('In', 2), ('__', 2), ('tx', 2), ('np', 2), ('tX', 2), ('m2', 2), ('_i', 2), ('i', 1), ('n', 1), ('m', 1), ('j', 1), ('_', 1), ('t', 1), ('y', 1), ('d', 1)]


In [11]:
weights=irls(tX,y)

0.826105
0.82726
0.828775
0.82999
0.83154
0.8327
0.83368
0.8346




0.835045
0.83548
0.83587


In [12]:
print("Freeing memory")
del(y, tX)

Freeing memory


In [13]:
DATA_TEST_PATH = 'test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [14]:
testx = tX_test
print(weights.shape)
print(testx.shape)
testx[testx==-999] = 0
centered_testx = testx - m
centered_testx[testx==-999] = 0
standardized_testx = centered_testx / stdevtrain

d = len(standardized_testx[0])
n = len(standardized_testx)

indices_s_deg = []
indices_t_deg = []

print("Creating indices...")
# Creating indices for subsets of degree 2
for i in range (d):
    for t in range (i,d):
        indices_s_deg.append([t, i])
indices_s_deg = np.array(indices_s_deg).T

# Creating indices for subsets of degree 3
max_t_degree = 11
for i in range (max_t_degree):
    for t in range (i,max_t_degree):
        for j in range(t,max_t_degree):
            if not (i == t and i == j):
                indices_t_deg.append([j, t, i])
indices_t_deg = np.array(indices_t_deg).T

degrees = range(3,11)
degrees_number = len(degrees) + 1
stdX_Ncols = standardized_testx.shape[1]
indices_s_Ncols = indices_s_deg.shape[1]
indices_t_Ncols = indices_t_deg.shape[1]

number_of_rows = indices_s_Ncols + degrees_number * stdX_Ncols + indices_t_Ncols

mat = np.zeros((n, number_of_rows))

print("Computing first degree...")
# First degree
mat[:, :stdX_Ncols] = standardized_testx

print("Computing second degree with combinations...")
# Second degree gotten from indices
mat[:,stdX_Ncols:stdX_Ncols + indices_s_Ncols] = standardized_testx[:, indices_s_deg[0]] * standardized_testx[:, indices_s_deg[1]]

print("Computing from degree 3 to 10 without combinations...")
# Improve 3 to 10 degree
for i in degrees:
    start_index = indices_s_Ncols + (i - 2) * stdX_Ncols
    end_index = start_index + stdX_Ncols
    mat[:,start_index:end_index] = standardized_testx**i
    
print("Computing third degree with some combinations...")
# Third degree gotten from indices
mat[:, number_of_rows - indices_t_Ncols: number_of_rows] = standardized_testx[:, indices_t_deg[0]] * standardized_testx[:, indices_t_deg[1]] * standardized_testx[:, indices_t_deg[2]]  

m2 = np.mean(mat, axis=0)
centered_mat = mat - m2
centered_mat[mat==0] = 0

print("Freeing memory")
del(standardized_testx, stdX_Ncols)
del(indices_s_deg, indices_s_Ncols, indices_t_deg, indices_t_Ncols)
del(mat, DATA_TEST_PATH, testx, centered_testx)

lens = [(x,len(x)) for x in set(dir()) - set(dir(__builtins__))]
testout = sorted(lens, key=lambda l: l[1])
print(testout[::-1])

centered_mat[centered_mat==0]=float('nan')
stdev=np.nanstd(centered_mat,axis=0)
centered_mat[centered_mat==float('nan')]=0
standardized_testmat = centered_mat / stdev

print("Freeing memory")
del(centered_mat, stdev)

num_samples = len(standardized_testmat)
final_testx = np.c_[np.ones(num_samples), standardized_testmat]

(738,)
(568238, 30)
Creating indices...
Computing first degree...
Computing second degree with combinations...
Computing from degree 3 to 10 without combinations...
Computing third degree with some combinations...
Freeing memory
[('create_csv_submission', 21), ('degrees_number', 14), ('predict_labels', 14), ('number_of_rows', 14), ('load_csv_data', 13), ('__builtins__', 12), ('centered_mat', 12), ('max_t_degree', 12), ('compute_loss', 12), ('num_samples', 11), ('__builtin__', 11), ('start_index', 11), ('split_data', 10), ('stdevtrain', 10), ('end_index', 9), ('logistic', 8), ('division', 8), ('ids_test', 8), ('tX_test', 7), ('degrees', 7), ('weights', 7), ('testout', 7), ('start', 5), ('irls', 4), ('_i12', 4), ('_iii', 4), ('time', 4), ('lens', 4), ('quit', 4), ('_i11', 4), ('_i14', 4), ('_i10', 4), ('exit', 4), ('_i13', 4), ('_i3', 3), ('plt', 3), ('_i8', 3), ('_i7', 3), ('_dh', 3), ('_i2', 3), ('___', 3), ('_ih', 3), ('_i1', 3), ('_oh', 3), ('_sh', 3), ('_i6', 3), ('csv', 3), ('_ii',

In [15]:
OUTPUT_PATH = 'deneme8.csv' # TODO: fill in desired name of output file for submission
y_pred = -predict_labels(weights, final_testx)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

ValueError: shapes (568238,1011) and (738,) not aligned: 1011 (dim 1) != 738 (dim 0)

In [16]:
end = time.time()
print("New time for everything:", end - start, "seconds.")

New time for everything: 1160.2428402900696 seconds.
