In [None]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


%matplotlib inline
# Plot defaults: figure size, nearest-neighbour image interpolation, gray image colormap
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Colour palettes: a light ListedColormap for filled regions and a list of bold colour names
cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
cmap_bold = ['darkorange', 'c', 'darkblue']

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Let's make some data

We saw a linear classification problem that did NOT work out.  To make life a bit easier, let's try a linear regression problem and then extend what we learned to a linear classification problem that is more or less linearly separable.  


In [None]:
# Synthetic 1-D regression data: y = 1.1*x - 0.1 plus Gaussian noise (sigma = 0.2).
np.random.seed(10)  # fixed seed so the noise, and therefore the fit, is reproducible

N = 100 # number of sample points

X = np.linspace(0.0, 1, N)
# One noise value per sample point, drawn in a single vectorised call.
y = 1.1*X + np.random.randn(N)*0.2 - 0.1

# `c` is a single fixed colour, so no colormap is passed: matplotlib ignores
# `cmap` (and warns) when `c` is not an array of values to be colour-mapped.
plt.scatter(X, y, c="red", s=40)
plt.xlim([-0.5,1.5])
plt.ylim([-0.5,1.5])
plt.xlabel("X")
plt.ylabel("y")
plt.show()

# Let's perform linear regression on this data

This is a pretty straightforward example!  We want a linear function with two parameters

$f(X) = W_1 \cdot X_1 + b$

Here W is a weight and b is what's known as a bias variable.  You can just think of these as the slope and offset.  We also have the "targets" or truth labels for the data.  This is just the variable on the y-axis.

Let's do this "by hand" using mean squared error first, using gradient descent.  Then we can revisit the problem to see if we get the "right" answer!

In [None]:
# Linear regression fitted by batch gradient descent on
# mean squared error plus an L2 penalty on the slope.

# starting guess for the parameters (fixed values, not random)
W = 0.5
b = 0

# some hyperparameters
step_size = 0.5  # why not make this 1.0?
reg = 1e-3 # regularization strength

Niter = 200

# Normalise by the size of the data actually being fitted instead of the
# global N, which another cell rebinds; the two agree when cells run in order.
num_examples = X.shape[0]

# gradient descent loop
for i in range(Niter):

    # evaluate function values: fhat = W*X + b
    fhat = W*X + b
    residual = y - fhat

    # mean squared error over the data set
    data_loss = np.sum(residual**2) / num_examples

    # This is L2 regularization, see Lecture 5 (only W is penalised, not b)
    reg_loss = 0.5*reg*W*W

    # Total loss is the sum
    loss = reg_loss + data_loss

    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # gradients of the data loss with respect to W and b
    dW = -2*np.sum(X*residual) / num_examples
    db = -2*np.sum(residual) / num_examples

    dW += reg*W # regularization gradient

    # step against the gradient
    W -= step_size*dW
    b -= step_size*db

In [None]:
# Report the fitted slope (W) and offset (b)
for heading, value in (("Slope:\n", W), ("\nOffset:\n", b)):
    print(heading, value)


# Reflection, part 1:

OK, that's pretty close to what we expected!  We should have found slope = 1.1 and offset = -0.1.  But the noise in the problem will definitely give us some stochasticity in our results.  We should always make plots to validate that we're getting what we expect.  Let's do that here.


In [None]:
# Overlay the fitted line on the data to see if the fit makes sense.
# `c` is a single fixed colour, so no colormap is passed: matplotlib ignores
# `cmap` (and warns) when `c` is not an array of values to be colour-mapped.
plt.scatter(X, y, c="red", s=40)
plt.xlim([-0.5,1.5])
plt.ylim([-0.5,1.5])
plt.xlabel("X")
plt.ylabel("y")

# model prediction at every sample point; the residual plot cell reuses `line`
line = X*W + b
plt.plot(X,line,color="blue")
plt.show()

In [None]:
# Now let's make a plot of the residuals!
# Here we use a useful package called seaborn.
# Per the seaborn documentation, residplot regresses y on x itself and draws
# a scatterplot of the residuals of that fit; lowess=True additionally fits a
# LOWESS smoother to the residual scatter, which makes any trend easier to see.
# x is `line`, the model prediction computed in the previous cell, so the
# residuals are shown against the predicted value.
# NOTE(review): the lowess option is documented as requiring statsmodels — confirm it is installed.

import seaborn as sns

sns.residplot(x=line, y=y, lowess=True, color="b")

# Reflection, part 2:

That was a fair amount of code to write for a simple linear regression.  If you recall the lecture slides, you'll note that in the case of mean square error loss we have an analytic solution.  On your homework you'll get the chance to use linear algebra to solve this problem in <5 lines!!




# Moving on to linear classification!

Now let's "invent" some data that should be more or less linearly separable.  In this way we can revisit the approach that gave us only a very marginal result on the earlier linear classification problem.

In [None]:
# Build K bands of N points each: band j is a noisy straight line whose
# offset grows by offsetDiff and whose noise width grows by 0.05 per class.
np.random.seed(20)

N = 150 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes

X = np.zeros((N*K,D))             # one row per point: (x, y) coordinates
y = np.zeros(N*K, dtype='uint8')  # class label of each row

slopeDiff = 0
offsetDiff = 1.0

for j in range(K):
    rows = slice(N*j, N*(j+1))    # rows of X / y that belong to class j
    xx = np.linspace(0,1,N)
    noise = np.random.randn(N)*(0.2+j*0.05)
    yy = (1+slopeDiff*j)*xx + noise + j*offsetDiff
    X[rows] = np.column_stack((xx, yy))
    y[rows] = j

# points coloured by class label
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(-0.5, 1.5)
plt.ylim(-0.5, 4)
plt.show()

In [None]:
# Train a linear (softmax) classifier with batch gradient descent

# initialize parameters: small random weights, zero biases
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

Niter = 200

num_examples = X.shape[0]
# row index used to pick out each example's true-class entry
example_idx = np.arange(num_examples)

# gradient descent loop
for i in range(Niter):

    # evaluate class scores, [N x K]: fhat = XW + b
    scores = np.dot(X, W) + b

    # softmax converts scores to class probabilities:
    # prob1 = exp(score1)/{exp(score1)+exp(score2)+...}
    # Subtracting each row's maximum first leaves the probabilities unchanged
    # (the common factor cancels) but keeps exp() from overflowing.
    shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    # sometimes cross-entropy is called "log loss"
    correct_logprobs = -np.log(probs[example_idx, y])
    data_loss = np.sum(correct_logprobs)/num_examples

    # This is L2 regularization, see Lecture 5
    reg_loss = 0.5*reg*np.sum(W*W)

    # Total loss is the average cross-entropy plus the L2 reg loss
    loss = data_loss + reg_loss

    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # gradient on the scores: (probs - one_hot(y)) / num_examples.
    # Work on a copy so `probs` keeps holding the actual probabilities.
    dscores = probs.copy()
    dscores[example_idx, y] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)

    dW += reg*W # regularization gradient

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db

In [None]:
# Show the learned parameters
for heading, value in (("Weight matrix:\n", W), ("\nBias matrix:\n", b)):
    print(heading, value)


# Training-set accuracy: predict the highest-scoring class for every point
# and report the fraction that matches the true label.
scores = X.dot(W) + b
predicted_class = scores.argmax(axis=1)
accuracy = np.mean(predicted_class == y)
print('\n training accuracy: %.2f' % accuracy)


In [None]:
#Let's look to see what we got!
# The class-0 score, W[0,0]*x + W[1,0]*y + b[0,0], defines a plane over (x, y)

#make a grid of test points covering the plotted range
xyMnx = 1.2  # not used in this cell; retained in case another cell refers to it
X_test = np.linspace(-0.5,1.5,N)
Y_test = np.linspace(-0.5,4,N)

Xt, Yt = np.meshgrid(X_test, Y_test)

#score of the first class (column 0 of W and b) evaluated on the grid
Z1 = Xt*W[0,0] + Yt*W[1,0] + b[0,0]


#this line follows the Z1=0 contour: solve W[0,0]*x + W[1,0]*y + b[0,0] = 0 for y.
# Only the x-coordinates (column 0 of X) go into the formula.  Using the whole
# 2-D X would also push the y-coordinates through it and make plt.plot draw a
# second, meaningless line.
x_coords = X[:, 0]
line1 = (W[0,0]*x_coords + b[0,0])/(-1*W[1,0])


cp = plt.contourf(Xt, Yt, Z1,10, cmap=plt.cm.plasma, alpha=0.9)
plt.colorbar(cp)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.plot(x_coords,line1,color="blue")
plt.xlim([-0.5,1.5])
plt.ylim([-0.5,4])
plt.show()

In [None]:
# Let's make some comparisons!
# Both lines are evaluated on the x-coordinates only (column 0 of X).  Using
# the whole 2-D X makes every plot call draw two lines, one of them built
# from the y-coordinates, and forces the legend to be assembled by hand.
x_coords = X[:, 0]

# This line lies midway between the noiseless centre lines of class 0 and class 1
line2 = (1+slopeDiff*0.5)*x_coords + offsetDiff*0.5

# Zero contour of the class-0 score, computed here from W and b so that this
# cell does not depend on the shape of a previously stored array.
line1 = (W[0,0]*x_coords + b[0,0])/(-1*W[1,0])

fig = plt.figure()
ax = fig.add_subplot(111)

# Why is line1 not equal to line2??
ax.plot(x_coords,line2,color='r',label="Equidistant")
ax.plot(x_coords,line1,color='b',label="Linear Regression")
ax.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)

# one handle per labelled line, in plotting order
ax.legend()
ax.set_xlim([-0.5,1.5])
ax.set_ylim([-0.5,4])
plt.show()

In [None]:
# Assign every point of a fine grid to the class with the largest score.
# Filling the grid by that label shows the region of space claimed by each
# class; the training points are drawn on top.

h = 0.02  # grid spacing
x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1

grid_x = np.arange(x_min, x_max, h)
grid_y = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(grid_x, grid_y)

# score all grid points at once, then keep the winning class index per point
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = (grid_points.dot(W) + b).argmax(axis=1).reshape(xx.shape)

fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=cmap_light, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Finally, let's take a look at what we're really doing here.
This linear classifier gives us a way to map a position in 2D space into a 1D space with "as optimal as possible" separation between classes.  We can envision this by using our function $f(X) = W^TX+b$ to calculate a single test statistic value for each entry/event.  We have three classes and can now just histogram each class to see the separation.

In [None]:
# Split the class-0 score (column 0 of `scores`) into one column per true
# class: rows [0, N) are class 0, [N, 2N) class 1 and [2N, 3N) class 2.
Xplt = np.column_stack([scores[k*N:(k+1)*N, 0] for k in range(3)])

# one histogram series per class, 30 bins
plt.hist(Xplt,30)
plt.show()