In [1]:
"""
ML lab No2
Homework on Machine Learning Technologies.
Applying the logistic regression method using different optimize functions.
"""

"\nML lab No1\nHomework on Machine Learning Technologies.\nImplementing of stochastic gradient descent\nand Adam's optimization algorithms using numpy library.\n"

Task:
1.	Apply the logistic regression method using the functions in the notebook «Logistic Regression as a Neural Network – BP alg.ipynb” to predict the biological response of a molecule. Use 75% of the sample to train the model, and the rest of the data to estimate its accuracy.
2.	Modify optimize() function to implement the stochastic gradient descent (SGD) method. Apply it to solve the problem from p.1.
3.	For two modifications of gradient descent (pp. 1 and 2), plot the learning curves (dependence of the value of the loss function on the iteration number), apply models with different values of the learning rate (at least 5 different learning rates). How does it affect the accuracy of the model?
4.	*(not nesessary) Implement the Adam optimization method using the numpy library and compare the accuracy of the model trained with it with the models trained by the classic GD and SGD algorithms.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# logistic regression method functions copied from
# JNB 'Logistic Regression as a Neural Network - BP alg'):

def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """
    s = 1./(1.+np.exp(-z))
    
    return s

def initialize_with_zeros(dim):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.
    
    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)
    
    Returns:
    w -- initialized vector of shape (dim, 1)
    b -- initialized scalar (corresponds to the bias)
    """
    w = np.zeros((dim,1))
    b = 0.
    
    return w, b

def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size which equals the number of features
    b -- bias, a scalar
    X -- data 
    Y -- true "label" vector (containing 0 and 1) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dw -- gradient of the loss with respect to w, thus same shape as w
    db -- gradient of the loss with respect to b, thus same shape as b
    
    """
    m = X.shape[1]
    #print('number of objects = ',len(X))
    
    # FORWARD PROPAGATION (FROM X TO COST)
    A = sigmoid(np.dot(w.T,X)+b )                                 # compute activation
    cost = -(1./m)*np.sum(Y*np.log(A)+(1-Y)*np.log(1-A),axis=1)   # compute cost
    
    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1./m)*np.dot(X,(A-Y).T)
    db = (1./m)*np.sum(A-Y,axis=1)
#     if np.sum(w) == 0:
#         print('A shape: ', A.shape)
#         print('X shape: ', X.shape)
#         print('Y shape: ', Y.shape)
#         print('dw shape: ', dw.shape)
#         print('db shape: ', db.shape)

    grads = {"dw": dw,
             "db": db}
    
    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """
    This function optimizes w and b by running a gradient descent algorithm
    
    Arguments:
    w -- weights, a numpy array 
    b -- bias, a scalar
    X -- data 
    Y -- true "label" vector (containing 0 and 1), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the loss every 100 steps
    
    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.
    
    """
    
    costs = []
    
    for i in range(num_iterations):
                
        # Cost and gradient calculation 
        grads, cost = propagate(w,b,X,Y)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule
        w -=learning_rate*dw
        b -=learning_rate*db
        
        # Record the costs
        if i % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs

def predict(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)
    
    Arguments:
    w -- weights, a numpy array 
    b -- bias, a scalar
    X -- data 
    
    Returns:
    Y_prediction -- a numpy array (vector) containing all predictions (0/1) for the examples in X
    '''
    
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    
    # Compute vector "A" predicting the probabilities 
    A = sigmoid(np.dot(w.T,X)+b)
    
    for i in range(A.shape[1]):
        
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        if (A[0,i]<=0.5):
            Y_prediction[0][i]=0
        else:
            Y_prediction[0][i]=1
    
    return Y_prediction

def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000,
          learning_rate = 0.5, print_cost = False, optimize = optimize):
    """
    Builds the logistic regression model by calling the function we've implemented previously
    
    Arguments:
    X_train -- training set represented by a numpy array 
    Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
    X_test -- test set represented by a numpy array 
    Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- Set to true to print the cost every 100 iterations
    
    Returns:
    d -- dictionary containing information about the model.
    """
    
    # initialize parameters with zeros 
    w, b = initialize_with_zeros(X_train.shape[0])

    # Gradient descent
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    
    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]
    
    # Predict test/train set examples
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return d

In [3]:
dir = "D:/Work/Data_files/working_dir/"
file = "bioresponse.csv"

df = pd.read_csv(dir + file)
df.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# create train/test splitted samples of required shape:

y = df['Activity']
X = df.drop('Activity', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)

X_train = np.array(X_train.T)
X_test = np.array(X_test.T)
y_train = np.array([y_train])
y_test = np.array([y_test])

In [12]:
# 1. Logistic regression method using the functions
# in the notebook «Logistic Regression...”

d = model(X_train, y_train, X_test, y_test, num_iterations = 2000, learning_rate = 0.001, print_cost = True)

Cost after iteration 0: 0.693147
Cost after iteration 100: 0.680294
Cost after iteration 200: 0.673197
Cost after iteration 300: 0.666863
Cost after iteration 400: 0.660968
Cost after iteration 500: 0.655450
Cost after iteration 600: 0.650269
Cost after iteration 700: 0.645391
Cost after iteration 800: 0.640787
Cost after iteration 900: 0.636431
Cost after iteration 1000: 0.632302
Cost after iteration 1100: 0.628379
Cost after iteration 1200: 0.624646
Cost after iteration 1300: 0.621086
Cost after iteration 1400: 0.617688
Cost after iteration 1500: 0.614437
Cost after iteration 1600: 0.611324
Cost after iteration 1700: 0.608337
Cost after iteration 1800: 0.605469
Cost after iteration 1900: 0.602711
train accuracy: 72.69818698897973 %
test accuracy: 71.53518123667376 %


In [8]:
# 2. Modify optimize() function to implement
# the stochastic gradient descent (SGD) method

def propagate_SGD(w, b, X, Y, iteration):
    """
    Implement the cost function and its partial gradient

    Arguments:
    w -- weights, a numpy array of size which equals the number of features
    b -- bias, a scalar
    X -- data 
    Y -- true "label" vector (containing 0 and 1) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dw -- partial gradient of the loss with respect to w, thus same shape as w
    db -- partial gradient of the loss with respect to b, thus same shape as b
    
    """
    m = X.shape[1]
    
    # FORWARD PROPAGATION (FROM X TO COST)
    if iteration ==  0:
        # compute activation on full batch:
        A = sigmoid(np.dot(w.T, X) + b)
    else:
        # change previously computed activation:
        A = sigmoid(np.dot(w.T, X) + b)
    cost = -(1./m) * np.sum(Y * np.log(A) +
                            (1-Y) * np.log(1-A), axis=1)
    
    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1./m)*np.dot(X[iteration%len(w)],(A-Y).T)
    db = (1./m)*np.sum(A-Y,axis=1)

#    if np.sum(w) == 0:
#         print('A shape: ', A.shape)
#         print('A: ', A)
#         print('X shape: ', X.shape)
#         print('Y shape: ', Y.shape)
#         print('Y: ', Y)
#         print('dw shape: ', dw.shape)
#         print('dw: ', dw)
#         print('db shape: ', db.shape)
#         print('db: ', db)
#         print('w[iteration%len(w)][0]: ', w[iteration%len(w)][0])
#         print('len(X[iteration]): ', len(X[iteration]))

    grads = {"dw": dw,
             "db": db}
    
    return grads, cost

def optimize_SGD(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """
    This function optimizes w and b by running a
    stochastic gradient descent algorithm
    
    Arguments:
    w -- weights, a numpy array 
    b -- bias, a scalar
    X -- data 
    Y -- true "label" vector (containing 0 and 1), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the loss every 100 steps
    
    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.
    
    """
    
    costs = []
    
    for i in range(num_iterations):
                
        # Cost and gradient calculation 
        grads, cost = propagate_SGD(w, b, X, Y, i)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule
        w[i%len(w)][0] -= learning_rate * dw
        b -= learning_rate * db
        
        # Record the costs
        if i % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 'step' training iterations:
        step = 10000
        if print_cost and i % step == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs


In [9]:
SGD_mod = model(X_train, y_train, X_test, y_test, num_iterations = 100000,
                learning_rate = 0.01, print_cost = True, optimize = optimize_SGD)

Cost after iteration 0: 0.693147
Cost after iteration 2000: 0.687737
Cost after iteration 4000: 0.686731
Cost after iteration 6000: 0.685735
Cost after iteration 8000: 0.684768
Cost after iteration 10000: 0.683406
Cost after iteration 12000: 0.682291
Cost after iteration 14000: 0.681344
Cost after iteration 16000: 0.680450
Cost after iteration 18000: 0.679386
Cost after iteration 20000: 0.678534
Cost after iteration 22000: 0.677686
Cost after iteration 24000: 0.676860
Cost after iteration 26000: 0.675674
Cost after iteration 28000: 0.674743
Cost after iteration 30000: 0.673932
Cost after iteration 32000: 0.673030
Cost after iteration 34000: 0.672214
Cost after iteration 36000: 0.671467
Cost after iteration 38000: 0.670726
Cost after iteration 40000: 0.670001
Cost after iteration 42000: 0.668943
Cost after iteration 44000: 0.668137
Cost after iteration 46000: 0.667424
Cost after iteration 48000: 0.666612
Cost after iteration 50000: 0.665883
Cost after iteration 52000: 0.665214
Cost afte