# Exercise 2 - Python: Logistic Regression

## *Part One*: Logistic regression without regularization

Beginning with package imports, data loading, and initial visualization

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.discrete.discrete_model as sm  # For comparing answers
from scipy import optimize  # Discovering optimal parameters
from sklearn.linear_model import LogisticRegression  # Regularized comparison model

# Applying the ggplot style sheet to the figures drawn below
plt.style.use('ggplot')

# Rendering matplotlib figures inline, beneath the cell that creates them
%matplotlib inline

In [None]:
# Directory holding the exercise data files.  The original location remains
# the default; set the EX2_DATA_DIR environment variable to point elsewhere
# without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator because file names
# are concatenated directly onto path (here and when loading ex2data2.txt).
path = os.path.join(
    os.environ.get(
        'EX2_DATA_DIR',
        'C:/Users/JeffM/Documents/Projects/Machine Learning/machine-learning-ex2/ex2/'),
    '')

df = pd.read_csv(path+'ex2data1.txt', header=None, names=['Exam1Score', 'Exam2Score', 'Admitted'])

# Inserting additional column for the intercept
df['x0'] = 1

X = df[['x0', 'Exam1Score', 'Exam2Score']]
y = df['Admitted']

# An array of 0s for starting values of theta to be used in many functions
initialTheta = np.zeros(3)

df.head()

In [None]:
# Scatter plot of both exam scores, colored by admission outcome
sns.lmplot(data=df, x='Exam1Score', y='Exam2Score', hue='Admitted',
           markers=["x", "o"], fit_reg=False)
plt.xlabel('Exam 1 Score')
plt.ylabel('Exam 2 Score')

### Sigmoid Function

$g(z) = \frac{1}{1+e^{-z}}$

Converts $z$ into a value between 0 and 1

In [None]:
def sigmoid(z):
    """Converts z into a value between 0 and 1 using the logistic function

    The computation is element-wise, so z may be a scalar, a NumPy
    array, or a pandas Series.
    """
    denominator = 1 + np.exp(-z)
    return 1/denominator

# Plotting values to validate the function is working correctly
zValues = np.arange(-10, 10)
plt.plot(zValues, sigmoid(zValues))

### Logistic Regression Hypothesis

$h_\theta(x) = g(\theta^Tx)$

- Notation:

    - $g$: Sigmoid function

    - $\theta^T$: Transposed parameters
       
        - E.g.: $\theta = \begin{bmatrix} \theta_0 \\ \theta_1 \\ \vdots \\ \theta_n \end{bmatrix}$, so $\theta^T = \begin{bmatrix} \theta_0 & \theta_1 & \cdots & \theta_n \end{bmatrix}$

In [None]:
def logistic_hypothesis(theta, x):
    """Returns the logistic regression hypothesis for x given theta

    The linear combination x.dot(theta) is passed through the sigmoid.
    """
    linearCombination = x.dot(theta)
    return sigmoid(linearCombination)

# Checking the hypothesis on the first rows of X with the starting thetas
firstRows = X.head()
logistic_hypothesis(theta=initialTheta, x=firstRows)

### Cost Function

$J(\theta) = \frac{1}{m} \sum_{i=1}^m[-y^{(i)}log(h_\theta(x^{(i)})) - (1-y^{(i)})log(1-h_\theta(x^{(i)}))]$

- Notation:

    - $m$: Number of records

    - $h_\theta$: Logistic hypothesis $(h)$ given specific values of $\theta$ for parameters
    
    - $i$: Index of the record (e.g. if $i = 46$, then the 46th row)

In [None]:
def cost_function(theta, X, y):
    """Computes cost for logistic regression

    Parameters:
        theta: Parameter values, one per column of X
        X: Feature matrix, including the intercept column
        y: Labels (0 or 1), one per row of X

    Returns:
        The average cross-entropy cost over all records
    """
    m = y.size
    h = logistic_hypothesis(theta, X)

    # The sigmoid saturates to exactly 0 or 1 for large |z|, which makes
    # log(0) = -inf and 0*(-inf) = nan.  Keeping h strictly inside (0, 1)
    # keeps the cost finite without changing it for unsaturated values.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)

    error = np.sum(-y*np.log(h)-(1-y)*np.log(1-h))
    J = (1/m)*error
    return(J)

# Cost at the starting values of theta
cost_function(initialTheta, X, y)

### Gradient

$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)}$

- Notation:

    - $\partial$: Partial derivative
    
    - $J(\theta)$: Cost given $\theta$

    - $m$: Number of records
    
    - $h_\theta$: Logistic hypothesis $(h)$ given specific values of $\theta$ for parameters
    
    - $i$: Index of the record (e.g. if $i = 46$, then the 46th row)
    
We won't actually be using this function to find the optimal values of $\theta_j$, so this is just illustrating the gradient

In [None]:
def gradient(theta, X, y):
    """Computes the gradient for logistic regression

    Accepts pandas objects as well as NumPy arrays for X and y, the
    same inputs cost_function works with.

    Parameters:
        theta: Parameter values, one per column of X
        X: Feature matrix, including the intercept column
        y: Labels (0 or 1), one per row of X

    Returns:
        NumPy array holding the partial derivative of the cost with
        respect to each theta
    """
    m = y.size
    h = logistic_hypothesis(theta, X)

    # Subtracting first keeps pandas' index alignment when both are Series;
    # converting afterwards lets plain arrays through as well
    residuals = np.asarray(h - y)
    return (1/m)*np.dot(np.asarray(X).T, residuals)

# Gradient at the starting values of theta
gradient(initialTheta, X, y)

Finding the optimal values of $\theta_j$ for the cost function using scipy's fmin function from their optimize suite.  This is similar to MATLAB's fminunc function.

In [None]:
# Minimizing the cost function, starting the search from initialTheta
# X and y are forwarded to cost_function through args
optimalTheta = optimize.fmin(cost_function, initialTheta, args=(X, y))

# Pretty printing the obtained values for theta
print('\nOptimal Thetas:')
for index, value in enumerate(optimalTheta):
    print('Theta', index, ':', value)

print('\nCost:', cost_function(optimalTheta, X, y))

Comparing the obtained parameters to what statsmodels provides

Using statsmodels instead of scikit-learn due to scikit-learn automatically regularizing the parameters.  Part one focuses on unregularized logistic regression, and part two focuses on regularized logistic regression.

In [None]:
# Fitting the logit model on the same X and y
logitResults = sm.Logit(y, X).fit()

# Keeping only the fitted parameters for display
model = logitResults.params
model

Those are very close!

Calculating the class probability and generating predictions of acceptance using values of $\theta_j$ obtained from the optimization function

In [None]:
# Class probability for every record under the optimized thetas
df['ClassProbability'] = logistic_hypothesis(optimalTheta, X)

# Records with a class probability above 0.5 are predicted as admitted
isAdmitted = df['ClassProbability'] > 0.5
df['Prediction'] = np.where(isAdmitted, 1, 0)

df.head()

Plotting the decision boundary over the data

In [None]:
# Plotting the data
sns.lmplot(data=df, x='Exam1Score', y='Exam2Score', hue='Admitted',
           markers=["x", "o"], fit_reg=False, legend=False)

# The boundary is the line where theta0 + theta1*x1 + theta2*x2 = 0
# (a class probability of exactly 0.5), solved here for x2
theta0, theta1, theta2 = optimalTheta
decisionX = np.array([X['Exam1Score'].min(), X['Exam1Score'].max()])
decisionY = (-1/theta2)*(theta0 + theta1*decisionX)
plt.plot(decisionX, decisionY, label='Decision Boundary',
         linestyle='--', color='black', alpha=0.8)

# Adjusting the legend location
plt.legend(loc=2, bbox_to_anchor=(1.05, 0.6), borderaxespad=0.)

---

## *Part Two*: Logistic regression with regularization

In [None]:
df = (
    pd.read_csv(path + 'ex2data2.txt', header=None,
                names=['Test1', 'Test2', 'Accepted'])
    .assign(x0=1)  # Additional column of ones for the intercept
)

# Features (intercept first) and labels for part two
featureColumns = ['x0', 'Test1', 'Test2']
X = df[featureColumns]
y = df['Accepted']

df.head()

In [None]:
# Scatter plot of both microchip tests, colored by acceptance
sns.lmplot(data=df, x='Test1', y='Test2', hue='Accepted',
           markers=['x', 'o'], fit_reg=False)
plt.xlabel('Microchip Test 1')
plt.ylabel('Microchip Test 2')

### Regularized Cost Function

$J(\theta) = \frac{1}{m} \sum_{i=1}^m[-y^{(i)}log(h_\theta(x^{(i)})) - (1-y^{(i)})log(1-h_\theta(x^{(i)}))] + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2$

**Note:** $\theta_0$ should not be regularized

In [None]:
def regularized_cost(theta, X, y, C):
    """Computes cost for a regularized logistic regression

    Parameters:
        theta: Parameter values, one per column of X
        X: Feature matrix, including the intercept column
        y: Labels (0 or 1), one per row of X
        C: Regularization strength; 0 gives the unregularized cost

    Returns:
        The unregularized cost plus (C/(2m)) times the sum of the
        squared thetas, excluding the first theta (the intercept)
    """
    m = y.size

    # Reusing the unregularized cost rather than repeating its formula
    unregularizedCost = cost_function(theta, X, y)

    # Calculating the regularization penalty
    # Avoiding the regularization penalty for the first theta
    regularization = (C/(2*m))*np.sum(np.square(theta[1:]))

    return unregularizedCost + regularization

# Testing how cost differs with regularization
# Using part 1 optimal thetas to test if the function is working
# If using initialTheta, there would be no impact since the values are 0
for penaltyStrength in (0, 100):
    print(regularized_cost(optimalTheta, X, y, penaltyStrength))

### Regularized Gradient

$\frac{\partial J(\theta)}{\partial \theta_j} = \Big(\frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)}\Big) + \frac{\lambda}{m}\theta_j \hspace{0.5cm}$for $j \geq 1$

In [None]:
def regularized_gradient(theta, X, y, C):
    """Computes the gradient for a regularized logistic regression

    Adds the penalty term (C/m)*theta_j to every component of the
    unregularized gradient except the first (the intercept), which is
    left unregularized just as in regularized_cost.

    Parameters:
        theta: Parameter values, one per column of X
        X: Feature matrix, including the intercept column
        y: Labels (0 or 1), one per row of X
        C: Regularization strength; 0 gives the unregularized gradient

    Returns:
        NumPy array holding the partial derivative of the regularized
        cost with respect to each theta
    """
    m = y.size

    # The multiplication produces a new array, so theta itself is not modified
    penalty = (C/m)*np.asarray(theta, dtype=float)
    penalty[0] = 0  # Avoiding the regularization penalty for the first theta

    return gradient(theta, X, y) + penalty

# Testing how the gradient differs with regularization
# Using part 1 optimal thetas, as was done for the regularized cost
# With C=0 the result matches the unregularized gradient
print(regularized_gradient(optimalTheta, X, y, 0))
print(regularized_gradient(optimalTheta, X, y, 100))

In [None]:
# Fitting the logit model on the part two X and y
logitResults = sm.Logit(y, X).fit()

# Keeping only the fitted parameters for display
model = logitResults.params
model

In [None]:
# scikit-learn adds its own intercept by default (fit_intercept=True), so the
# x0 column of ones is left out; keeping it would duplicate the intercept,
# and coef_ on its own would not show the fitted intercept.
# Note that scikit-learn's C is the inverse of the regularization strength,
# unlike the C argument of regularized_cost, which multiplies the penalty.
model = LogisticRegression(penalty='l2', C=1.0)
model.fit(X.drop(columns='x0'), y)

# Intercept first, followed by the Test1 and Test2 coefficients
np.append(model.intercept_, model.coef_)