In [1]:
# Import Modules/Libraries
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

%matplotlib inline
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

np.random.seed(1)

import warnings
#suppress warnings with numpy for sigmoid function
warnings.filterwarnings('ignore')

# Logistic Regression

In [2]:
# Load datasets
train = pd.read_csv('../datasets/clean/cleaned_train.csv', index_col = False)
test = pd.read_csv('../datasets/clean/cleaned_test.csv', index_col = False)

In [40]:
train.head(3)

Unnamed: 0,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Charms,Flying,Gryffindor,Hufflepuff,Ravenclaw,Slytherin
0,-1.014194,0.878628,1.010346,0.377371,1.021139,0.345639,0.512444,0.219633,1.204553,-0.50033,0,0,1,0
1,-1.137535,-1.36569,1.133455,-2.109573,-0.540256,-1.204191,0.258503,0.653769,-1.002983,-1.386928,0,0,0,1
2,-0.780078,1.261379,0.776671,0.718622,1.828915,1.005195,0.133871,1.314249,1.825184,0.086673,0,0,1,0


In [41]:
test.head(3)

Unnamed: 0,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Charms,Flying
0,1.26139,0.360013,-1.260955,1.16528,-0.289017,-0.96542,0.281215,0.377241,-0.126065,-0.364411
1,-0.789243,0.349486,0.785819,0.768919,1.536298,1.007714,0.790127,0.406955,1.375558,-0.493877
2,0.539127,-1.403671,-0.540051,0.352583,-0.833736,0.868643,-1.927101,-2.122591,-1.095106,1.825147


In [42]:
#Split X (independent variables) with the target value Y
target_columns = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
X_train = np.array(train.drop(columns = target_columns))
y_train = np.array(train[target_columns])

In [84]:
print ('The shape of X_train is: ' + str(X_train.shape))
print ('The shape of y_train is: ' + str(y_train.shape))
print ('We have m = %d training examples' % (len(y_train)))

The shape of X_train is: (1600, 10)
The shape of y_train is: (1600, 4)
We have m = 1600 training examples


In [85]:
X_train[0, :]

array([-1.01419368,  0.87862839,  1.01034582,  0.37737107,  1.02113913,
        0.34563861,  0.51244445,  0.21963306,  1.20455291, -0.50032993])

In [86]:
# To Access the ith column 
y_train[:, 0]

array([0, 0, 0, ..., 1, 0, 0])

### Sigmoid function

For logistic regression, the model is represented as

$$ f_{\mathbf{w},b}(x) = g(\mathbf{w}\cdot \mathbf{x} + b)$$
where function $g$ is the sigmoid function. The sigmoid function is defined as:

$$g(z) = \frac{1}{1+e^{-z}}$$

In [87]:
def sigmoid(z):
    """
    Compute the sigmoid of z
    Args:
        z (ndarray): A scalar, numpy array of any size.
    Returns:
        g (ndarray): sigmoid(z), with the same shape as z
    """
    return 1 / (1 + np.exp(-z))

In [88]:
print ("sigmoid(0) = " + str(sigmoid(0)))

sigmoid(0) = 0.5


### Cost function for logistic regression


Recall that for logistic regression, the cost function is of the form 

$$ J(\mathbf{w},b) = \frac{1}{m}\sum_{i=0}^{m-1} \left[ loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)}) \right] \tag{1}$$

where
* m is the number of training examples in the dataset


* $loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)})$ is the cost for a single data point, which is - 

    $$loss(f_{\mathbf{w},b}(\mathbf{x}^{(i)}), y^{(i)}) = (-y^{(i)} \log\left(f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) - \left( 1 - y^{(i)}\right) \log \left( 1 - f_{\mathbf{w},b}\left( \mathbf{x}^{(i)} \right) \right) \tag{2}$$
    
    
*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)})$ is the model's prediction, while $y^{(i)}$, which is the actual label

*  $f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = g(\mathbf{w} \cdot \mathbf{x^{(i)}} + b)$ where function $g$ is the sigmoid function.
    * It might be helpful to first calculate an intermediate variable $z_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w} \cdot \mathbf{x^{(i)}} + b = w_0x^{(i)}_0 + ... + w_{n-1}x^{(i)}_{n-1} + b$ where $n$ is the number of features, before calculating $f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = g(z_{\mathbf{w},b}(\mathbf{x}^{(i)}))$

In [141]:
def compute_cost(X, y, w, b, lambda_= 1):
    """
    Computes the cost over all examples
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (array_like Shape (m,)) target value 
      w : (array_like Shape (n,)) Values of parameters of the model      
      b : scalar Values of bias parameter of the model
      lambda_: unused placeholder
    Returns:
      total_cost: (scalar)         cost 
    """

    m, n = X.shape
    
    cost = 0
    for i in range(m):
        z = np.dot(X[i],w) + b
        f_wb = sigmoid(z)
        cost += -y[i]*np.log(f_wb) - (1-y[i])*np.log(1-f_wb)
    total_cost = cost/m
    

    return total_cost

In [142]:
(1-y_train[:, 2])

array([0, 1, 0, ..., 1, 1, 1])

In [143]:
m, n = X_train.shape

# Compute and display cost with w initialized to zeroes
initial_w = np.zeros(n)
initial_b = 0.
cost = compute_cost(X_train, y_train[:, 0], initial_w, initial_b)
print('Cost at initial w (zeros): {:.3f}'.format(cost))

Cost at initial w (zeros): 0.693


###  Gradient for logistic regression

In this section, you will implement the gradient for logistic regression.

Recall that the gradient descent algorithm is:

$$\begin{align*}& \text{repeat until convergence:} \; \lbrace \newline \; & b := b -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial b} \newline       \; & w_j := w_j -  \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j} \tag{1}  \; & \text{for j := 0..n-1}\newline & \rbrace\end{align*}$$

where, parameters $b$, $w_j$ are all updated simultaniously

compute $\frac{\partial J(\mathbf{w},b)}{\partial w}$, $\frac{\partial J(\mathbf{w},b)}{\partial b}$ from equations (2) and (3) below.

$$
\frac{\partial J(\mathbf{w},b)}{\partial b}  = \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - \mathbf{y}^{(i)}) \tag{2}
$$
$$
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  = \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - \mathbf{y}^{(i)})x_{j}^{(i)} \tag{3}
$$

In [99]:
def compute_gradient(X, y, w, b, lambda_=None): 
    """
    Computes the gradient for logistic regression 
 
    Args:
      X : (ndarray Shape (m,n)) variable such as house size 
      y : (array_like Shape (m,1)) actual value 
      w : (array_like Shape (n,1)) values of parameters of the model      
      b : (scalar)                 value of parameter of the model 
      lambda_: unused placeholder.
    Returns
      dj_dw: (array_like Shape (n,1)) The gradient of the cost w.r.t. the parameters w. 
      dj_db: (scalar)                The gradient of the cost w.r.t. the parameter b. 
    """
    m, n = X.shape
    dj_dw = np.zeros(w.shape)
    dj_db = 0.

    z = np.dot(X, w) + b
    f_wb = sigmoid(z)
    
    for j in range(n):
        dj_dw[j] = (np.sum(np.dot(f_wb - y, X.T[j])))
            
    dj_dw = dj_dw / m
    dj_db = np.sum(f_wb - y) / m
        
    return dj_db, dj_dw

### Learning parameters using gradient descent 


In [100]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X :    (array_like Shape (m, n)
      y :    (array_like Shape (m,))
      w_in : (array_like Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)                 Initial value of parameter of the model
      cost_function:                  function to compute cost
      alpha : (float)                 Learning rate
      num_iters : (int)               number of iterations to run gradient descent
      lambda_ (scalar, float)         regularization constant
      
    Returns:
      w : (array_like Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """
    
    # number of training examples
    m = len(X)
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w_history = []
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)   

        # Update Parameters using w, b, alpha and gradient
        w_in = w_in - alpha * dj_dw
        b_in = b_in - alpha * dj_db
       
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            cost =  cost_function(X, y, w_in, b_in, lambda_)
            J_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")
        
    return w_in, b_in, J_history, w_history #return w and J,w history for graphing

## Fit 

In [102]:
intial_w = 0.01 * (np.random.rand(n).reshape(-1,1) - 0.5)
initial_b = -8


# Some gradient descent settings
iterations = 10000
alpha = 0.01

w,b, J_history,_ = gradient_descent(X_train ,y_train[:, 3], initial_w, initial_b, 
                                   compute_cost, compute_gradient, alpha, iterations, 0)

Iteration    0: Cost     1.50   
Iteration 1000: Cost     0.09   
Iteration 2000: Cost     0.07   
Iteration 3000: Cost     0.06   
Iteration 4000: Cost     0.06   
Iteration 5000: Cost     0.06   
Iteration 6000: Cost     0.06   
Iteration 7000: Cost     0.06   
Iteration 8000: Cost     0.06   
Iteration 9000: Cost     0.06   
Iteration 9999: Cost     0.05   


## Predict

In [55]:
def predict(X, w, b): 
    """
    Predict whether the label is 0 or 1 using learned logistic
    regression parameters w
    
    Args:
    X : (ndarray Shape (m, n))
    w : (array_like Shape (n,))      Parameters of the model
    b : (scalar, float)              Parameter of the model

    Returns:
    p: (ndarray (m,1))
        The predictions for X using a threshold at 0.5
    """
    # number of training examples
    m, n = X.shape
    p = np.zeros(m)

    f_wb = sigmoid(np.dot(X, w) + b)
    for i, prob in enumerate(f_wb):
        p[i] = 1 if prob >= 0.50 else 0
    return p

In [56]:
#Compute accuracy on our training set
p = predict(X_train, w,b)
print('Train Accuracy: %f'%(np.mean(p == y_train[:, 3]) * 100))

Train Accuracy: 96.062500


----------

# Make a class out of it...

In [179]:
class LogisticRegression:
    """
        A class to perform Logistic Regression
    """
    def __init__(self):
        """
            Define attributes which will be passed later
        """
        # train data and label
        self.X = None
        self.y = None
        
        # m: number of observations
        self.m = None
        
        # n: number of independent variables (X)
        self.n = None
        
        # store historic value of cost function. init as +infinity
        self.costs = [np.inf]

        self.J_history = []
        self.w_history = []
        
        # parameteres (weights)
        self.w = None
        self.b = None
        
        # regularization constant lambda_ (scalar, float)
        self.lambda_ = None

        
    def sigmoid(self, z):
        """
        Compute the sigmoid of z
        Args:
            z (ndarray): A scalar, numpy array of any size.
        Returns:
            g (ndarray): sigmoid(z), with the same shape as z
        """
        return 1 / (1 + np.exp(-z))

    
    def compute_cost(self, X, y, w, b):
        """
        Computes the cost over all examples
        Args:
          X : (ndarray Shape (m,n)) data, m examples by n features
          y : (array_like Shape (m,)) target value 
          w : (array_like Shape (n,)) Values of parameters of the model      
          b : scalar Values of bias parameter of the model
          lambda_: unused placeholder
        Returns:
          total_cost: (scalar)         cost 
        """

        m, n = X.shape

        z = np.dot(X, w) + b
        pred = self.sigmoid(z)

        pred[pred == 1] = 1-1e-9 #hard cap max threshold
        
        
        cost = np.dot(-y, np.log(pred)) - (np.dot(1 - y, np.log(1 - pred)))
        total_cost = np.sum(cost) / m
    
        reg_cost = np.sum(np.dot(w, w.T))
        total_cost = total_cost + (self.lambda_/(2 * m)) * reg_cost
        
        return total_cost

    
    def compute_gradient(self, X, y, w, b, lambda_): 
        """
        Computes the gradient for logistic regression 

        Args:
          X : (ndarray Shape (m,n)) variable such as house size 
          y : (array_like Shape (m,1)) actual value 
          w : (array_like Shape (n,1)) values of parameters of the model      
          b : (scalar)                 value of parameter of the model 
          lambda_: unused placeholder.
        Returns
          dj_dw: (array_like Shape (n,1)) The gradient of the cost w.r.t. the parameters w. 
          dj_db: (scalar)                The gradient of the cost w.r.t. the parameter b. 
        """
        m, n = X.shape
        dj_dw = np.zeros(w.shape)
        dj_db = 0.0

        z = np.dot(X, w) + b
        f_wb = self.sigmoid(z)

        for j in range(n):
            dj_dw[j] = (np.sum(np.dot(f_wb - y, X.T[j])))

        dj_dw = dj_dw / m
        dj_dw += np.dot((lambda_ / m), w) # add regularization
        dj_db = np.sum(f_wb - y) / m

        return dj_db, dj_dw


    def gradient_descent(self, cost_function, gradient_function, alpha, num_iters, show_every): 
        """
        Performs batch gradient descent to learn theta. Updates theta by taking 
        num_iters gradient steps with learning rate alpha

        Args:
          X :    (array_like Shape (m, n)
          y :    (array_like Shape (m,))
          w_in : (array_like Shape (n,))  Initial values of parameters of the model
          b_in : (scalar)                 Initial value of parameter of the model
          cost_function:                  function to compute cost
          alpha : (float)                 Learning rate
          num_iters : (int)               number of iterations to run gradient descent
          lambda_ (scalar, float)         regularization constant

        Returns:
          w : (array_like Shape (n,)) Updated values of parameters of the model after
              running gradient descent
          b : (scalar)                Updated value of parameter of the model after
              running gradient descent
        """
        
        for i in range(num_iters):

            # Calculate the gradient and update the parameters
            dj_db, dj_dw = gradient_function(self.X, self.y, self.w, self.b, self.lambda_)   

            # Update Parameters using w, b, alpha and gradient
            self.w = self.w - alpha * dj_dw               
            self.b = self.b - alpha * dj_db              

            # Save cost J at each iteration
            if i<100000:      # prevent resource exhaustion 
                cost =  cost_function(self.X, self.y, self.w, self.b)

            # Print cost every at intervals 10 times or as many iterations if < 10
            if i % show_every == 0 or i == num_iters-1:
                self.J_history.append(cost)
#                 self.w_history.append(self.w)
                print(f"Iteration {i:4}: Cost {float(self.J_history[-1]):8.2f}   ")
    
    
    def fit(self, X, y, alpha=0.001, iterations=1500, show_every=None, lambda_= 1):
        """
        setup attributes and apply training
        """
        
        # train data and label
        self.X = X
        self.y = y
        
        # m: number of observations
        self.m, self.n = X.shape

        # regularization coefficient
#         self.reg = lambda_
        self.lambda_ = lambda_
        
        # init weights if first call
        if type(self.w) is not np.ndarray:
            self.w = 0.01 * (np.random.rand(n).reshape(-1,1) - 0.5)
        if self.b == None:
            self.b = -8
        if show_every == None:
            show_every = iterations // 10 
            
        # Perform Gradient Descent
        self.gradient_descent(self.compute_cost, self.compute_gradient, alpha, iterations, show_every)

    
    def predict(self, X): 
        """
        Predict whether the label is 0 or 1 using learned logistic
        regression parameters w

        Args:
        X : (ndarray Shape (m, n))
        w : (array_like Shape (n,))      Parameters of the model
        b : (scalar, float)              Parameter of the model

        Returns:
        p: (ndarray (m,1))
            The predictions for X using a threshold at 0.5
        """
        
        # Check if number of features matches our model
        
        
        # number of training examples
        m, n = X.shape
        p = np.zeros(m)

        f_wb = self.sigmoid(np.dot(X, self.w) + self.b)
        for i, prob in enumerate(f_wb):
            p[i] = 1 if prob >= 0.50 else 0
        return p

In [202]:
lr = LogisticRegression()

In [203]:
lr.fit(X_train, y_train[: , 0], alpha=0.00003, iterations=100, lambda_ = 1)

Iteration    0: Cost     1.61   
Iteration   10: Cost     1.30   
Iteration   20: Cost     1.00   
Iteration   30: Cost     0.71   
Iteration   40: Cost     0.44   
Iteration   50: Cost     0.24   
Iteration   60: Cost     0.13   
Iteration   70: Cost     0.09   
Iteration   80: Cost     0.08   
Iteration   90: Cost     0.08   
Iteration   99: Cost     0.08   


In [204]:
lr.fit(X_train, y_train[: , 0], alpha=0.000001, iterations=100, lambda_ = 1)

Iteration    0: Cost     0.08   
Iteration   10: Cost     0.08   
Iteration   20: Cost     0.08   
Iteration   30: Cost     0.08   
Iteration   40: Cost     0.08   
Iteration   50: Cost     0.08   
Iteration   60: Cost     0.08   
Iteration   70: Cost     0.08   
Iteration   80: Cost     0.08   
Iteration   90: Cost     0.08   
Iteration   99: Cost     0.08   


In [206]:
#Compute accuracy on our training set
p = lr.predict(X_train)
print('Train Accuracy: %f'%(np.mean(p == y_train[:, 0]) * 100))

Train Accuracy: 99.187500


#### Testing...

# Support Multiclass 

In [149]:
class MultipleLogisticRegression:

    def __init__(self):
    
        # C: number of categories for Y
        # -> create C models and train them each one at teh time
        self.c = None
        
        
        self.models = []
        
    def softmax():
        """
        takes the predicted values from the sub_models
        """