In [1]:
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [2]:
data_dir = 'data558spring2020'

# Load the data
x_train = np.load(os.path.join(data_dir, 'train_features.npy'))
y_train = np.load(os.path.join(data_dir, 'train_labels.npy'))
x_valid = np.load(os.path.join(data_dir, 'val_features.npy'))
y_valid = np.load(os.path.join(data_dir, 'val_labels.npy'))
x_test = np.load(os.path.join(data_dir, 'test_features.npy'))
# y_test = np.load(os.path.join(data_dir, 'test_labels.npy'))


FileNotFoundError: [Errno 2] No such file or directory: 'data558spring2020/train_features.npy'

In [3]:
x_train = x_train1[y_train1 < 2]
y_train = y_train1[y_train1 < 2]
x_valid = x_valid1[y_valid1 < 2]
y_valid = y_valid1[y_valid1 < 2]


# Standardize the data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.fit_transform(x_valid)
x_test = scaler.fit_transform(x_test1)


In [4]:
# Keep track of the number of samples and dimension of each sample 
n_train = len(y_train) 
n_test = x_test1.shape[0] 
d = np.size(x_train, 1)


In [5]:
lr_cv = LogisticRegressionCV(penalty='l2', fit_intercept=False, tol=10e-8, max_iter=1000, cv=5) 
lr_cv.fit(x_train, y_train) 
optimal_lambda = 1/(2*lr_cv.C_[0]*len(x_train)) 
print('Optimal C=', lr_cv.C_[0]) 
print('Optimal lambda=', optimal_lambda)

Optimal C= 0.005994842503189409
Optimal lambda= 0.08340502686000296


In [6]:
# hyperparameters to feed into the functions
lamda = optimal_lambda
L = (np.max(np.linalg.eig(1/len(y_train)*x_train.T@x_train)[0])) + lamda
eta_0 = 1/L

In [7]:
def Fx_logisticReg(b, lamda=0.5,X=x_train,y=y_train):
    h = X @ b * y
    return (1/len(y)) * sum(np.log(1 + np.exp(-h))) + (lamda * np.linalg.norm(b)**2)

In [8]:
def computegrad(b, lamda,X=x_train,y=y_train):
    h = X @ b * y 
    return 2*lamda*b - (1/len(y) * (X.T @ (y / (np.exp(h) + 1))))

In [9]:
def backtracking(beta, lamda, X, y, eta=1, alpha=0.5, betaparam=0.8, maxiter=100): 
    grad_beta = computegrad(b=beta, lamda=lamda, X=X, y=y) 
    norm_grad_beta = np.linalg.norm(grad_beta) 
    found_eta = 0 
    num_iters = 0 
    while found_eta == 0 and num_iters < maxiter: 
        if Fx_logisticReg(beta - eta * grad_beta,lamda=lamda,X=X,y=y) <  Fx_logisticReg(beta, lamda=lamda, X=X, y=y) - alpha * eta * norm_grad_beta ** 2: 
            found_eta = 1 
        elif num_iters == maxiter: 
            raise ('Max number of iterations of backtracking line search reached') 
        else: 
            eta *= betaparam 
            num_iters += 1 
        
    return eta

In [10]:
def myclassifier(eta, epsilon,lamda, X=x_train, y=y_train):
    """
    Function for the fast gradient descent for the exponential loss algorithm
    """
    eta_l = eta
    beta = np.zeros(X.shape[1])
    theta = np.zeros(X.shape[1])
    beta_vals = beta
    
    grad_theta = computegrad(theta,lamda=lamda,X=X,y=y) 
    grad_beta = computegrad(beta,lamda=lamda,X=X,y=y)
    
    num_iters = 0
    while np.linalg.norm(grad_beta) > epsilon:
        eta_l = backtracking(beta=beta, lamda = lamda, X=X, y=y, eta=eta_l, alpha=0.5, betaparam=0.8, maxiter=100)
        beta_new = theta - eta_l * grad_theta
        theta = beta_new + (num_iters/(num_iters+3)) * (beta_new - beta)
        
        beta_vals = np.vstack((beta_vals, beta))
        
        grad_theta = computegrad(theta,lamda=lamda,X=X,y=y)
        grad_beta = computegrad(beta,lamda=lamda,X=X,y=y)
        
        beta = beta_new
        num_iters += 1
        
        
    return beta_vals

In [11]:
def y_predicted(X,b, threshold=0.5):
    """
    Calculates Predicted y_hat values for the given X predictor variables and b weights
    """
    y_hat = (X @ b)
    y_pred = np.piecewise(y_hat,[y_hat >= threshold, y_hat <threshold],[lambda x: 1, lambda x: 0])
    
    return y_pred

In [12]:
def compute_misclassification_error(b, X, y): 
    """
    Computes the missclassification error of the SVM for the calculated weights from the fast gradient descent
    
    Parameters:
    -------------
    y: response variables in the form of numpy.array
    X: numpy.array of preditor variables 
    b: numpy.arry of the weights
    
    Returns:
    -------------
    The missclassification value of y_pred != y_actual
    """
    y_pred = y_predicted(X=X, b=b)
    return np.mean(y_pred != y)

In [14]:
def sensitivity_specificity(X, y_actual, b, threshold=0.5):
    """
    Calculates Sensitivity and Specificity for the predicted y_hat values from
    the provided X predictor variables and b weights compared to the y_actual
    """
    yhat = y_predicted(X=X,b=b, threshold=threshold)
    
    TP = np.sum(yhat[yhat == 1] == y_actual[yhat==1])
    TN = np.sum(yhat[yhat == 0] == y_actual[yhat==0])
    FP = np.sum(yhat[yhat == 1] != y_actual[yhat==1])
    FN = np.sum(yhat[yhat == 0] != y_actual[yhat==0])

    sensitivity = TP/(TP+FN) 
    specificity = TN/(TN+FP)
    
    return [sensitivity, specificity]

In [15]:
def print_misclassification_err(b, names=('training', 'validation'), xdata=(x_train, x_valid), ydata=(y_train, y_valid)):
    for (name, x, y) in zip(names, xdata, ydata): 
        error = compute_misclassification_error(b, X=x, y=y) 
        sensitivity, specificity = sensitivity_specificity(X=x, y_actual=y, b=b)
        print(name + ' data')
        print('Misclassification error: %0.3f'% (100*error)) 
        print('Sensitivity of: %0.3f %%'% (100*sensitivity))
        print('Specificity of: %0.3f \n'% (100*specificity))

In [16]:
# getting the betas for the training set
betas_tr = myclassifier(eta=eta_0, epsilon=5.1**-3, lamda=lamda)

# printing values for classification error, sensitivity, and specificity
print_misclassification_err(betas_tr[-1])


training data
Misclassification error: 3.100
Sensitivity of: 100.000 %
Specificity of: 93.800 

validation data
Misclassification error: 5.500
Sensitivity of: 98.000 %
Specificity of: 91.000 



In [17]:
predicted1 = y_predicted(x_test, betas_tr[-1])

In [18]:
submission = pd.DataFrame(np.real(predicted1), columns=['Category'])
submission.index.name='Id'

In [None]:
submission.to_csv('submission1.csv')