In [None]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

%matplotlib inline
# Notebook-wide matplotlib defaults: figure size and how images are rendered
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Named colours for plotting; cmap_light wraps cmap_list as a ListedColormap
cmap_list = ['orange', 'cyan', 'cornflowerblue']
cmap_bold = ['darkorange', 'c', 'darkblue']
cmap_light = ListedColormap(cmap_list)

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Let's make some data

We want a problem that is not linearly separable to show how poorly a linear classification would work here.

Note later that centering at zero is helpful for us.  We will make two classes (K = 2), but you could add more or fewer if you wish by adjusting the variable K.  Feel free to add more dimensions, but it's harder to view beyond 2.

In [None]:
np.random.seed(20)  # fixed seed so the generated points are reproducible

N = 150 # number of points per class
D = 2 # dimensionality
K = 2 # number of classes

X = np.zeros((N*K, D))            # one row per point: (x, y) coordinates
y = np.zeros(N*K, dtype='uint8')  # integer class label of each point

slopeDiff = 1      # slope grows by this much from one class to the next
offsetDiff = 0.25  # intercept grows by this much from one class to the next

# Every class is a noisy straight line sampled at the same x positions
xx = np.linspace(0, 1, N)

for j in range(K):
    ix = range(N*j, N*(j+1))  # rows of X / y that belong to class j
    # Slope, noise width and intercept all increase with the class index
    y_line = (1 + slopeDiff*j)*xx + np.random.randn(N)*(0.2 + j*0.05) + j*offsetDiff
    X[ix] = np.c_[xx, y_line]
    y[ix] = j

# Targets laid out like the [N*K x K] score matrix used in training: each of
# the K columns repeats the class label.  The shape follows K (not D), so it
# still lines up with the scores if the two values are changed independently.
yy = np.tile(y.astype(float)[:, np.newaxis], (1, K))

# Scatter the generated points, coloured by class label
x_coords, y_coords = X[:, 0], X[:, 1]
plt.scatter(x_coords, y_coords, c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(-0.5, 1.5)
plt.ylim(-0.5, 4)
plt.show()

# Create a logistic regression classifier

We saw that we can perform linear regression on this data sample, so this is a good opportunity to try out logistic regression.  We can use more or less the same approach as before.  In logistic regression, we're going to be trying to model the output of our function as 0 (e.g., low probability to match our data) vs 1 (e.g., high probability to match our data).


This time we will pass the output of our linear model through an activation function.  This should not change things drastically, but we can compare and contrast the result with linear regression.

But we need to study some activation functions first.  Let's see what we can find out about the following options:
  * Rectified Linear Unit (ReLU)
  * Sigmoid
  * SoftPlus
  * Leaky ReLU
  * Exponential Linear Unit (ELU)

In [None]:
# define our NN activation function here
def activation(i, input):
    """Apply the activation function selected by ``i`` element-wise.

    Parameters
    ----------
    i : int
        Which activation to use:
        0 = ReLU, 1 = sigmoid, 2 = SoftPlus,
        3 = leaky ReLU (slope 0.1 for negative inputs),
        4 = ELU (exponential linear unit, alpha = 1).
        Any other value selects the identity and ``input`` is returned as is.
    input : scalar or array_like
        Pre-activation value(s).

    Returns
    -------
    numpy scalar or ndarray
        The activated value(s), with the same shape as ``input``.
    """
    if i not in (0, 1, 2, 3, 4):
        # Unknown selector: behave as a linear (identity) activation
        return input

    # Accept plain Python lists and scalars as well as arrays
    x = np.asarray(input)

    if i == 0:
        # Rectified linear unit (ReLU)
        output = np.maximum(0, x)
    elif i == 1:
        # Sigmoid, written as exp(-softplus(-x)) so that exp() never
        # overflows for large negative inputs
        output = np.exp(-np.logaddexp(0, -1.0 * x))
    elif i == 2:
        # SoftPlus, log(1 + exp(x)); logaddexp stays finite for large x
        # where the naive formula overflows to inf
        output = np.logaddexp(0, x)
    elif i == 3:
        # Leaky ReLU: positive inputs pass through unchanged, negative
        # inputs are scaled by 0.1
        output = np.maximum(0.1 * x, x)
    else:
        # ELU: x for x > 0, exp(x) - 1 otherwise.  The argument of expm1 is
        # clamped to <= 0 so the branch that is discarded cannot overflow.
        output = np.where(x > 0, x, np.expm1(np.minimum(x, 0)))

    return output

# Draw every activation over the same input range so they can be compared
ix = np.linspace(-4, 4, 200)

curve_specs = (
    (0, "red", "ReLU"),
    (1, "blue", "Sigmoid"),
    (2, "green", "SoftPlus"),
    (3, "magenta", "Leaky ReLU"),
    (4, "orange", "ELU"),
)
for act_id, curve_color, curve_label in curve_specs:
    plt.plot(ix, activation(act_id, ix), color=curve_color, label=curve_label)

plt.legend()
plt.grid(True)
plt.show()

In [None]:
#Train a Linear Classifier

# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))

# some hyperparameters
step_size = 0.1
reg = 1e-3 # regularization strength

iAct = 0 # choice of activation function (selector passed to activation())

# Threshold used to turn an activated score into a class label
aThresh = 1e-2
if (iAct == 1) or (iAct == 2):
    aThresh = 0.5

Niter = 2000

# gradient descent loop
num_examples = X.shape[0]

for i in range(Niter):

    # evaluate class scores, [N x K]
    # fhat = W^TX + b
    fhat = np.dot(X, W) + b

    scores = activation(iAct, fhat)

    # Data loss: squared distance between the activated scores and the
    # targets.  This is the quantity whose gradient is taken below, so the
    # number printed here tracks what the update step is actually reducing.
    data_loss = np.square(scores - yy).sum() / num_examples

    # L2 regularization on the weights
    reg_loss = 0.5*reg*np.sum(W*W)

    # Total loss is the squared-distance term plus the L2 reg loss
    loss = data_loss + reg_loss

    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # gradient of the data loss with respect to the scores
    # NOTE(review): the derivative of the activation function is not applied
    # here, so the scores are treated as if they were linear in fhat --
    # confirm that this simplification is intended.
    dscores = 2.0 * (scores - yy)
    dscores /= num_examples

    # backpropagate the gradient to the parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)

    dW += reg*W # regularization gradient

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db

In [None]:
# Show the learned parameters so it is clear what the fit produced
print("Weight matrix:\n", W)
print("\nBias matrix:\n", b)


# Forward pass with the final parameters, then threshold column 0 of the
# activated scores to get a 0/1 label per training point
scores = activation(iAct, X.dot(W) + b)
predicted_class = (scores[:, 0] > aThresh).astype(np.int64)
print(predicted_class)

# Fraction of training points whose predicted label equals the true label
train_accuracy = np.mean(predicted_class == y)
print('\n training accuracy: %.2f' % train_accuracy)



In [None]:
# Evaluate the fitted model along the vertical line x = 0.5
X_test = np.full(N, 0.5)
Y_test = np.linspace(-0.5, 2.5, N)

# Activated score of output column 0 at each sampled point
line = activation(iAct, W[0, 0]*X_test + W[1, 0]*Y_test + b[0, 0])

plt.plot(Y_test, line)

In [None]:
# Let's look to see what we got!
# The two weights and the bias of output column 0 define a surface over the
# (x, y) plane; evaluate it on a regular grid.
X_test = np.linspace(-0.5, 1.5, N)
Y_test = np.linspace(-0.5, 2.5, N)
Xt, Yt = np.meshgrid(X_test, Y_test)

# Activated score of output column 0 at every grid point
Z1 = activation(iAct, W[0, 0]*Xt + W[1, 0]*Yt + b[0, 0])

# Filled contours show the score, the single contour line marks the
# classification threshold, and the training points are drawn on top
cp = plt.contourf(Xt, Yt, Z1, levels=100, cmap=plt.cm.Wistia, alpha=0.9)
plt.colorbar(cp)
plt.contour(Xt, Yt, Z1, levels=[aThresh])
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)

plt.xlim(-0.5, 1.5)
plt.ylim(-0.5, 2.5)
plt.show()

# Reflection, part 1:

OK, that's pretty close to what we expected!  We should have found slope = 1.1 and offset = -0.1.  But the noise in the problem will definitely give us some stochasticity in our results.  We should always make plots to validate that we're getting what we expect.  Let's do that here.

