In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import time
%load_ext autoreload
%autoreload 2

In [2]:
# Wall-clock timestamp taken before loading; `start` is not read again in the visible cells.
start = time.time()
# Star import: presumably the source of load_csv_data (and of predict_labels /
# create_csv_submission used in later cells) — none of them is defined in this notebook.
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # path to the training CSV, relative to the working directory
# y: labels, tX: raw feature matrix, ids: sample identifiers (as returned by load_csv_data).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)


In [3]:
def split_data(x, y, ratio,seed=1):
    """Randomly partition (x, y) into a training part and a testing part.

    The global numpy RNG is seeded with `seed`, the row indices are shuffled,
    and the first floor(ratio * n_rows) shuffled rows form the training part;
    the remaining rows form the testing part.

    Returns:
        (x_training, x_testing, y_training, y_testing)
    """
    np.random.seed(seed)

    n_rows = x.shape[0]
    shuffled = np.random.permutation(n_rows)
    cut = int(np.floor(ratio * n_rows))

    train_rows, test_rows = shuffled[:cut], shuffled[cut:]
    return x[train_rows], x[test_rows], y[train_rows], y[test_rows]

In [4]:
# Clean and standardize the raw training features.
# -999 (presumably the dataset's missing-value sentinel) is replaced by 0 in place,
# then every column is centered on its mean `m`.
tX[tX==-999] = 0
m = np.mean(tX, axis=0)
centered_tX = tX - m

# Entries that are exactly 0 after centering are masked as NaN so that
# np.nanstd leaves them out of the per-column standard deviation.
centered_tX[centered_tX==0] = float('nan')
stdevtrain = np.nanstd(centered_tX, axis=0)
# NaN never compares equal to anything (including itself), so the mask has to be
# undone with np.isnan; an `== float('nan')` test would leave the NaNs in the data.
centered_tX[np.isnan(centered_tX)] = 0
standardized_tX = centered_tX / stdevtrain

# Polynomial feature expansion of the standardized training matrix.
# Column layout of `mat`:
#   [first degree | pairwise products | powers 3..10 of each feature | selected triple products]
n = len(standardized_tX)
d = len(standardized_tX[0])

print("Creating indices...")
# Degree 2: every unordered pair of feature indices (squares included),
# stored as a 2 x n_pairs array: row 0 = larger index, row 1 = smaller index.
indices_s_deg = np.array(
    [[t, i] for i in range(d) for t in range(i, d)]
).T

# Degree 3: unordered triples taken from the first `max_t_degree` features only;
# pure cubes are skipped because the per-feature power block already contains them.
max_t_degree = 15
indices_t_deg = np.array(
    [[j, t, i]
     for i in range(max_t_degree)
     for t in range(i, max_t_degree)
     for j in range(t, max_t_degree)
     if not (i == t == j)]
).T

degrees = range(3,11)
# One block of plain features plus one block per power in `degrees`.
degrees_number = len(degrees) + 1
stdX_Ncols = standardized_tX.shape[1]
indices_s_Ncols = indices_s_deg.shape[1]
indices_t_Ncols = indices_t_deg.shape[1]

# Despite its name this is the number of COLUMNS of the expanded matrix.
number_of_rows = indices_s_Ncols + degrees_number * stdX_Ncols + indices_t_Ncols

mat = np.zeros((n, number_of_rows))

print("Computing first degree...")
mat[:, :stdX_Ncols] = standardized_tX

print("Computing second degree with combinations...")
mat[:, stdX_Ncols:stdX_Ncols + indices_s_Ncols] = (
    standardized_tX[:, indices_s_deg[0]] * standardized_tX[:, indices_s_deg[1]]
)

print("Computing from degree 3 to 10 without combinations...")
for i in degrees:
    # The block for power i sits (i - 2) feature-widths after the pairwise block offset.
    start_index = indices_s_Ncols + (i - 2) * stdX_Ncols
    end_index = start_index + stdX_Ncols
    mat[:, start_index:end_index] = np.power(standardized_tX, i)

print("Computing third degree with some combinations...")
# The triple products fill the trailing `indices_t_Ncols` columns.
mat[:, number_of_rows - indices_t_Ncols:] = (
    standardized_tX[:, indices_t_deg[0]]
    * standardized_tX[:, indices_t_deg[1]]
    * standardized_tX[:, indices_t_deg[2]]
)

# Standardize the expanded training matrix and prepend the bias column.
# `m2` and `stdev` are kept because the test features are scaled with them later.
m2 = np.mean(mat, axis=0)
centered_mat = mat - m2
# Entries that were exactly 0 before centering are kept at 0 (not shifted by the mean).
centered_mat[mat==0] = 0

# Mask zeros as NaN so np.nanstd ignores them in the per-column standard deviation.
centered_mat[centered_mat==0] = float('nan')
stdev = np.nanstd(centered_mat, axis=0)
# NaN never compares equal to itself, so `== float('nan')` matches nothing;
# np.isnan is required to turn the masked entries back into zeros.
centered_mat[np.isnan(centered_mat)] = 0
standardized_mat = centered_mat / stdev

num_samples = len(standardized_mat)
# Final design matrix: a leading column of ones (intercept) followed by the features.
tx = np.c_[np.ones(num_samples), standardized_mat]

Creating indices...
Computing first degree...
Computing second degree with combinations...
Computing from degree 3 to 10 without combinations...
Computing third degree with some combinations...


In [5]:
def compute_loss(y, tx, w):
    """Return the halved mean squared error of the linear model `tx @ w`.

    The value is (e^T e) / (2 * N) with e = y - tx @ w and N = len(tx).
    """
    residual = y - np.dot(tx, w)
    n_samples = len(tx)
    squared_error = np.dot(residual.transpose(), residual)
    return squared_error / (2 * n_samples)

In [6]:
from __future__ import division
import numpy as np 
def logistic(a):
    """Element-wise logistic (sigmoid) function 1 / (1 + exp(-a))."""
    denominator = 1 + np.exp(-a)
    return 1.0 / denominator
def irls(X, y, eps=50000, max_iters=20):
    """Fit logistic regression with damped Newton (IRLS-style) updates.

    Each iteration solves ``XSX @ step = grad`` where ``XSX = X^T diag(pi * (1 - pi)) X``
    and ``grad`` is the MEAN gradient of the logistic loss, then moves
    ``theta`` by ``-eps * step``. Because ``grad`` is divided by the number of
    samples while ``XSX`` is not, ``eps`` acts as a step scale relative to the
    sample count. The training accuracy is printed after every update.

    Args:
        X: 2-D numpy array of shape (n_samples, n_features).
        y: 1-D array of labels; the gradient and the accuracy check (np.sign)
           treat them as signed labels — assumes values in {-1, +1}, TODO confirm.
        eps: initial step scale; halved after iterations 5, 10, 15, ...
        max_iters: number of update iterations.

    Returns:
        The weight vector theta (1-D array of length n_features).

    Raises:
        numpy.linalg.LinAlgError: if XSX is singular.
    """
    y = np.asarray(y)
    n_samples = len(X)
    theta = np.zeros(X.shape[1])
    for iteration in range(max_iters):
        a = np.dot(X, theta)
        pi = logistic(a)
        # Row-scale X by pi * (1 - pi) to build X^T S X without forming S.
        SX = X * (pi - pi*pi).reshape(-1,1)
        XSX = np.dot(X.T,SX)

        # Mean gradient over all samples, computed in one matrix product
        # instead of a Python loop over the rows.
        grad = -np.dot(X.T, y * logistic(-y * a)) / n_samples

        theta = theta - eps * np.linalg.solve(XSX, grad)
        # Fraction of samples whose predicted sign matches the label.
        print(np.mean(y == np.sign(np.dot(X, theta))))

        # Halve the step scale every fifth iteration (but not at iteration 0).
        if iteration % 5 == 0 and iteration != 0:
            eps = eps * 0.5
    return theta

def reg_irls(X, y, eps=100000, lamda=10**-8, max_iters=15):
    """Fit regularized logistic regression with damped Newton (IRLS-style) updates.

    Each iteration solves ``XSX @ step = grad`` with
    ``XSX = X^T diag(pi * (1 - pi)) X + lamda * I`` and ``grad`` the MEAN gradient
    of the logistic loss, then applies
    ``theta <- theta - eps * step - eps * lamda * theta``.
    The training accuracy is printed after every update.

    Args:
        X: 2-D numpy array of shape (n_samples, n_features).
        y: 1-D array of labels; the gradient and the accuracy check (np.sign)
           treat them as signed labels — assumes values in {-1, +1}, TODO confirm.
        eps: initial step scale; halved after iterations 5, 10, ...
        lamda: regularization strength (added to the diagonal of XSX and used
            as a shrinkage factor on theta).
        max_iters: number of update iterations.

    Returns:
        The weight vector theta (1-D array of length n_features).

    Raises:
        numpy.linalg.LinAlgError: if the regularized XSX is singular.
    """
    y = np.asarray(y)
    n_samples = len(X)
    n_features = X.shape[1]
    theta = np.zeros(n_features)
    for iteration in range(max_iters):
        a = np.dot(X, theta)
        pi = logistic(a)
        # Row-scale X by pi * (1 - pi) to build X^T S X without forming S.
        SX = X * (pi - pi*pi).reshape(-1,1)
        XSX = np.dot(X.T,SX) + lamda * np.eye(n_features)

        # Mean gradient over all samples, computed in one matrix product
        # instead of a Python loop over the rows.
        grad = -np.dot(X.T, y * logistic(-y * a)) / n_samples

        theta = theta - eps * np.linalg.solve(XSX, grad) - eps * lamda * theta
        # Fraction of samples whose predicted sign matches the label.
        print(np.mean(y == np.sign(np.dot(X, theta))))

        # Halve the step scale every fifth iteration (but not at iteration 0).
        if iteration % 5 == 0 and iteration != 0:
            eps = eps * 0.5
    return theta

In [None]:
print("Freeing memory")
# Only intermediates that no later cell reads or deletes are removed here:
#  - `stdev` is still needed to scale the test features,
#  - `mat`, `DATA_TRAIN_PATH` and `ids` are deleted by the test-preprocessing cell,
#    so deleting them here too would make that cell raise NameError,
#  - `testx` / `testy` do not exist yet at this point, so `del` on them raises NameError.
del(centered_mat, centered_tX, standardized_mat, standardized_tX, stdX_Ncols)
del(indices_s_deg, indices_s_Ncols, indices_t_deg, indices_t_Ncols)

# List the remaining user-defined names, longest first.
# NOTE(review): len(x) is the length of the variable NAME (x is a string from dir()),
# not the size of the object it refers to — confirm this is the intended metric.
lens = [(x,len(x)) for x in set(dir()) - set(dir(__builtins__))]
testout = sorted(lens, key=lambda l: l[1])
print(testout[::-1])

In [7]:
def least_squares(y, tx):
    """Solve the least-squares normal equations (tx^T tx) w = tx^T y.

    The system is solved directly with np.linalg.solve rather than by forming
    the explicit inverse, which is cheaper and numerically more accurate.

    Args:
        y: target vector of length N.
        tx: design matrix (numpy array) of shape (N, D).

    Returns:
        The optimal weight vector w of length D (only the weights; no loss is returned).

    Raises:
        numpy.linalg.LinAlgError: if tx^T tx is singular.
    """
    gram = np.dot(tx.T, tx)
    moment = np.dot(tx.T, y)
    return np.linalg.solve(gram, moment)

In [8]:
def ridge_regression(y, tx, lamb):
    """Solve ridge regression: (tx^T tx + 2 * lamb * N * I) w = tx^T y.

    The system is solved directly with np.linalg.solve rather than by forming
    the explicit inverse.

    Args:
        y: target vector of length N.
        tx: design matrix (numpy array) of shape (N, D).
        lamb: regularization strength; it is scaled by 2 * N before being
            added to the diagonal.

    Returns:
        The weight vector w of length D.

    Raises:
        numpy.linalg.LinAlgError: if the regularized system is singular.
    """
    gram = np.dot(tx.T, tx)
    penalty = 2 * lamb * len(tx) * np.eye(tx.shape[1])
    moment = np.dot(tx.T, y)
    return np.linalg.solve(gram + penalty, moment)

In [9]:
def compute_gradient(y, tx, w):
    """Gradient of the halved MSE loss with respect to w.

    Returns -(tx^T e) / N with e = y - tx @ w and N = len(tx).
    The division is applied to the float array, so the result does not depend
    on how the interpreter divides two integers.
    """
    e = y - np.dot(tx, w)
    return -np.dot(tx.T, e) / len(tx)

In [10]:
def least_squares_GD(y, tx, gamma, max_iters):
    """Run gradient descent on the MSE loss, starting from the zero vector.

    The step size is divided by 1.005 at every iteration before the update.
    The loss recorded for an iteration is the one measured BEFORE that
    iteration's update. One progress line is printed per iteration.

    Returns:
        (losses, ws): the list of per-iteration losses and the list of weight
        vectors, beginning with the initial zero vector.
    """
    w = np.zeros(tx.shape[1])
    ws, losses = [w], []
    step = gamma
    last_iter = max_iters - 1

    for n_iter in range(max_iters):
        # Loss and gradient at the current weights.
        loss = compute_loss(y, tx, w)
        grad = compute_gradient(y, tx, w)

        # Decay the step size, then take the descent step.
        step = step / 1.005
        w = w - step * grad

        ws.append(np.copy(w))
        losses.append(loss)
        print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=last_iter, l=loss, w0=w[0], w1=w[1]))

    return losses, ws

In [None]:
# Gradient-descent fit on the first fold.
# NOTE(review): x1 / y1 are only assigned by the np.delete fold cell further down;
# on a fresh kernel this cell raises NameError unless that cell has been run first.
max_iters = 500
gamma = 0.025
gradient_losses, gradient_w = least_squares_GD(y1, x1, gamma, max_iters)

In [None]:
# Closed-form least-squares fit on the first fold.
# NOTE(review): x1 / y1 are assigned by the np.delete fold cell further down; run that cell first.
least_squares_w=least_squares(y1,x1)

In [None]:
# Ridge-regression fit on the first fold with a fixed regularization strength.
# NOTE(review): x1 / y1 are assigned by the np.delete fold cell further down; run that cell first.
lamb=0.00075
ridge_w=ridge_regression(y1, x1, lamb)

In [11]:
# Row indices of five consecutive 50000-row blocks (covering rows 0..249999).
# Each list is the set of rows REMOVED to form one training subset.
# Note that `d` and `e` rebind names that earlier cells may have used for other values.
a = list(range(200000, 250000))
b = list(range(150000, 200000))
c = list(range(100000, 150000))
d = list(range(50000, 100000))
e = list(range(0, 50000))


In [12]:
# Five training subsets: subset k is the full design matrix / label vector
# with the k-th block of row indices (a, b, c, d, e) removed.
x1, x2, x3, x4, x5 = [
    np.delete(tx, held_out_rows, axis=0) for held_out_rows in (a, b, c, d, e)
]
y1, y2, y3, y4, y5 = [
    np.delete(y, held_out_rows, axis=0) for held_out_rows in (a, b, c, d, e)
]

In [13]:
# Fit one regularized logistic model per training subset, printing the subset
# number before each fit, then average the five weight vectors.
fold_weights = []
fold_data = [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x5, y5)]
for fold_number, (x_fold, y_fold) in enumerate(fold_data, start=1):
    print(fold_number)
    fold_weights.append(reg_irls(x_fold, y_fold))

(reg_logistic_w1, reg_logistic_w2, reg_logistic_w3,
 reg_logistic_w4, reg_logistic_w5) = fold_weights

# Final model: element-wise mean of the per-subset weights.
reg_logistic_w = (
    reg_logistic_w1 + reg_logistic_w2 + reg_logistic_w3
    + reg_logistic_w4 + reg_logistic_w5
) / 5

1
0.831255
0.83283
0.83489
0.836345
0.83785
0.83883
0.839195
0.83941
0.83966
0.839865
0.840065
0.839985
0.83996
0.839925




0.84002
2
0.83209
0.833725
0.835805
0.837335
0.83819
0.83903
0.83941
0.83934
0.839585
0.83977
0.839985
0.839875
0.839865
0.8399
0.84001
3
0.83092
0.832745
0.83501
0.836465
0.837715
0.83835
0.83864
0.83901
0.83902
0.8391
0.839245
0.83931
0.83933
0.8394
0.839455
4
0.831585
0.833315
0.835325
0.83676
0.838035
0.838905
0.83914
0.83956
0.83977
0.839845
0.84006
0.840055
0.840145
0.84024
0.840295
5
0.83185
0.83351
0.83554
0.83706
0.838285
0.839295
0.839495
0.83968
0.839745
0.83974
0.83968
0.83971
0.839785
0.83982
0.83986


In [None]:
# Unregularized IRLS fit.
# NOTE(review): this passes `tX` (the raw feature matrix, only -999 -> 0 replaced),
# not `tx` (the expanded, standardized design matrix that reg_irls is trained on) —
# confirm which one is intended.
logistic_w=irls(tX,y)

In [None]:
# Path to the TEST CSV. NOTE(review): this is an absolute, machine-specific path,
# unlike the relative training path — adjust it before running elsewhere.
DATA_TEST_PATH = '/Users/alperkose/Desktop/test.csv' # TODO: download test data and supply path here
# The first returned value (the labels slot) is discarded for the test set.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Release the large training arrays before building the test features.
# Each `del` raises NameError if the name has already been removed.
print("Freeing memory")
del(mat)
del(DATA_TRAIN_PATH)
del(y)
del(tx)
del(ids)
# Weights used for the submission: the average of the five subset models.
weights = reg_logistic_w
# `testx` is another name for the same object as `tX_test` (no copy is made),
# so the in-place replacement below is applied to `tX_test` as well.
testx = tX_test
# Same -999 -> 0 replacement that was applied to the training features.
testx[testx==-999] = 0
# Center with the TRAINING column means `m`, not with test-set statistics.
centered_testx = testx - m
# NOTE(review): -999 was already replaced in `testx` two statements above, so this
# mask selects nothing and the assignment has no effect — confirm the intent.
centered_testx[testx==-999] = 0
# Scale with the TRAINING standard deviations so train and test share one transform.
standardized_testx = centered_testx / stdevtrain

# Polynomial feature expansion of the standardized test matrix.
# Column layout of `mat`:
#   [first degree | pairwise products | powers 3..10 of each feature | selected triple products]
n = len(standardized_testx)
d = len(standardized_testx[0])

print("Creating indices...")
# Degree 2: every unordered pair of feature indices (squares included),
# stored as a 2 x n_pairs array: row 0 = larger index, row 1 = smaller index.
indices_s_deg = np.array(
    [[t, i] for i in range(d) for t in range(i, d)]
).T

# Degree 3: unordered triples taken from the first `max_t_degree` features only;
# pure cubes are skipped because the per-feature power block already contains them.
max_t_degree = 15
indices_t_deg = np.array(
    [[j, t, i]
     for i in range(max_t_degree)
     for t in range(i, max_t_degree)
     for j in range(t, max_t_degree)
     if not (i == t == j)]
).T

degrees = range(3,11)
# One block of plain features plus one block per power in `degrees`.
degrees_number = len(degrees) + 1
stdX_Ncols = standardized_testx.shape[1]
indices_s_Ncols = indices_s_deg.shape[1]
indices_t_Ncols = indices_t_deg.shape[1]

# Despite its name this is the number of COLUMNS of the expanded matrix.
number_of_rows = indices_s_Ncols + degrees_number * stdX_Ncols + indices_t_Ncols

mat = np.zeros((n, number_of_rows))

print("Computing first degree...")
mat[:, :stdX_Ncols] = standardized_testx

print("Computing second degree with combinations...")
mat[:, stdX_Ncols:stdX_Ncols + indices_s_Ncols] = (
    standardized_testx[:, indices_s_deg[0]] * standardized_testx[:, indices_s_deg[1]]
)

print("Computing from degree 3 to 10 without combinations...")
for i in degrees:
    # The block for power i sits (i - 2) feature-widths after the pairwise block offset.
    start_index = indices_s_Ncols + (i - 2) * stdX_Ncols
    end_index = start_index + stdX_Ncols
    mat[:, start_index:end_index] = np.power(standardized_testx, i)

print("Computing third degree with some combinations...")
# The triple products fill the trailing `indices_t_Ncols` columns.
mat[:, number_of_rows - indices_t_Ncols:] = (
    standardized_testx[:, indices_t_deg[0]]
    * standardized_testx[:, indices_t_deg[1]]
    * standardized_testx[:, indices_t_deg[2]]
)
        
# Center the expanded test matrix with the TRAINING column means `m2`.
centered_mat = mat - m2
# Entries that were exactly 0 before centering are kept at 0 (not shifted by the mean).
centered_mat[mat==0] = 0

# The uncentered expansion is no longer needed once `centered_mat` exists.
print("Freeing the matrix memory again")
del(mat)

# Scale with the TRAINING standard deviations `stdev` (no test-set statistics are used).
standardized_testmat = centered_mat / stdev

num_samples = len(standardized_testmat)
# Prepend the column of ones (intercept), matching the layout of the training matrix.
final_testx = np.c_[np.ones(num_samples), standardized_testmat]
# Rebind `tX_test` to the fully expanded matrix. `testx` still refers to the raw
# test matrix, and re-running the test-preprocessing cell after this point would
# start from the expanded matrix instead of the raw one.
tX_test = final_testx

In [None]:
# Destination of the submission CSV. NOTE(review): absolute, machine-specific path.
OUTPUT_PATH = '/Users/alperkose/Desktop/deneme8.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): the predictions are NEGATED before being written. reg_irls reports
# accuracy with np.sign(X @ theta) compared directly against y, so flipping the sign
# here looks suspicious — confirm against predict_labels' sign convention.
y_pred = -predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Fraction of samples whose predicted sign equals the reference label.
# NOTE(review): `testy` is not assigned anywhere in the visible notebook, and `testx`
# refers to the raw test matrix while `reg_logistic_w` was fitted on the expanded
# design matrix (bias + polynomial features), so the column counts differ —
# this cell cannot run as written; confirm which data it was meant to evaluate.
aq=np.dot(testx,reg_logistic_w)  
y_guess=np.sign(aq)
trueness=sum((y_guess==testy))/len(testy)
print(trueness)