# Logistic Regression Assignment
Implement the Logistic Regression learning by gradient ascent as described in class.
Before using logistic regression, be sure to normalize the variables of the training set
to have zero mean and standard deviation 1, and to do the exact same transformation to
the test set, using the mean and standard deviation of the training set

#### Import dependencies

In [62]:
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

#### Load data

In [45]:
# TODO

#### Utility functions

In [46]:
def get_gisette():
    path = "data/gisette/"   
    
    train_x = np.loadtxt(path+"gisette_train.data")
    train_y = np.loadtxt(path+"gisette_train.labels")
    
    valid_x = np.loadtxt(path+"gisette_valid.data")
    valid_y = np.loadtxt(path+"gisette_valid.labels")
    
    test_x = np.loadtxt(path+"gisette_test.data")

    return train_x, train_y, valid_x, valid_y, test_x


def get_hill_valley():
    path = "data/hill-valley/"   
    
    train_x = np.loadtxt(path+"X.dat")
    train_y = np.loadtxt(path+"Y.dat")
    
    test_x = np.loadtxt(path+"Xtest.dat")
    test_y = np.loadtxt(path+"Ytest.dat")
    
    return train_x, train_y, test_x, test_y


def get_dexter():
    path = "data/dexter/"   
    
    def parse_X(file_path):
        def gen():
            with open(file_path) as f:
                for line in f:
                    dict = {}
                    for pair in line.split(" "):
                        if ":" in pair:
                            key, value = pair.split(":")
                            dict[key] = value
                    yield dict
        df = pd.DataFrame(gen())
        df.fillna(0, inplace=True)
        return csr_matrix(df.values.astype("int"))
    
    train_x = parse_X(path+"dexter_train.data")
    train_y = np.loadtxt(path+"dexter_train.labels")
    
    valid_x = parse_X(path+"dexter_valid.data")
    valid_y = np.loadtxt(path+"dexter_valid.labels")
    
    test_x = parse_X(path+"dexter_test.data")
    
    return train_x, train_y, valid_x, valid_y, test_x

In [47]:
def normalize(train, *args):
    mean = np.average(train, axis=0)
    standard_deviation = np.std(train, axis=0)
    columns = train, *args
    return tuple(np.divide(column-mean, standard_deviation, where=standard_deviation!=0)
                 for column in columns)

In [66]:
def logistic_regression(x, train_y, test_x, test_y):
    x = np.hstack((np.ones((x.shape[0], 1)), x))
    #iterations = 300
    iterations = 100
    lmb = 0.0001
    learning_rate = .01
    w = np.zeros(x.shape[1])
    scores = []
    for _ in range(iterations):
        w = w - learning_rate * lmb * w - learning_rate / x.shape[0] * log_likelihood_derivative(w, x, train_y)
        predicted_y = np.round(predict(x, w))
        scores.append(accuracy_score(train_y, predicted_y))
    plt.plot(range(1,iterations + 1), scores)
    plt.show()

def log_likelihood_derivative(w, x, y):
    y_hat = x.dot(w)
    result = ((y - np.exp(y_hat)) / (1 + np.exp(y_hat)))
    result = np.sum(x.T*result, axis=1)
    return result

def predict(x, w):
    y = 1/(1 + np.exp(x.dot(w)))
    return y

#logistic_regression(train_x, train_y, valid_x, valid_y)




ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Gisette

In [54]:
train_x, train_y, valid_x, valid_y, test_x = get_gisette()
train_x, valid_x, test_x = normalize(train_x, valid_x, test_x)
train_y[train_y==-1] = 0

In [50]:
logistic_regression(train_x, train_y, valid_x, valid_y)


[ 5.00000000e-03  1.51994153e-05  1.86458532e-05 ...  4.88508565e-04
 -2.52076466e-07  1.03010236e-03]
[ 9.34335334e-03  4.39183292e-05  4.99099366e-05 ...  1.07238783e-03
 -3.35037101e-06  2.26041294e-03]
[ 1.29648766e-02  8.69039185e-05  9.63795628e-05 ...  1.76319022e-03
 -1.13986163e-05  3.70659623e-03]
[ 1.59023179e-02  1.41067166e-04  1.57649405e-04 ...  2.55664359e-03
 -2.37687093e-05  5.34629826e-03]
[ 1.82972080e-02  1.99271725e-04  2.30143339e-04 ...  3.43354822e-03
 -3.63071568e-05  7.12689812e-03]
[ 2.03210501e-02  2.54669465e-04  3.09829056e-04 ...  4.37058517e-03
 -4.43889703e-05  8.99513501e-03]
[ 2.21118908e-02  3.04073348e-04  3.94002384e-04 ...  5.34843113e-03
 -4.57098038e-05  1.09136899e-02]
[ 2.37604576e-02  3.47229631e-04  4.81088267e-04 ...  6.35355804e-03
 -4.01122633e-05  1.28606312e-02]
[ 2.53213856e-02  3.85014515e-04  5.70078621e-04 ...  7.37709888e-03
 -2.83764982e-05  1.48239282e-02]
[ 2.68268348e-02  4.18429933e-04  6.60258889e-04 ...  8.41335263e-03
 -1.

