# Machine Learning (Summer 2018)

## Practice Session 10

June, 19th 2018

Ulf Krumnack

Institute of Cognitive Science
University of Osnabrück

## Plan for the next sessions

* New exercises: Sheet 11
* Classifiers

# Classifiers

* datasets
* k nearest neighbor
* lines
* linear classifiers

## Generating a dataset

A dataset for classification consists of two parts:
* a list of feature vectors, usually denoted by $x$
* a list of corresponding class labels, usually denoted as $c$, $y$, or $t$

Exercise:
1. Generate a 2-dimensional dataset consisting of two classes (positive and negative examples),
   both parts being normally distributed (use `np.random.multivariate_normal`). The result should be of shape (N,3), with the last column `data[:,-1]` providing the labels (either 0 or 1).
2. Plot your dataset, showing both classes in different colors.

In [None]:
# Simple version: just points, no class labels

import numpy as np
import matplotlib.pyplot as plt

# Class 0: 50 samples centred at the origin; variance 1 along x and 6 along y.
n0 = 50
mean0 = [0, 0]
cov0 = [[1, 0], [0, 6]]

# Class 1: 40 samples centred at (6, 6) with the same covariance as class 0.
n1 = 40
mean1 = [6, 6]
cov1 = [[1, 0], [0, 6]]

# Each draw is transposed so that it unpacks into one array of x values
# and one array of y values.
x0, y0 = np.random.multivariate_normal(mean0, cov0, n0).T
x1, y1 = np.random.multivariate_normal(mean1, cov1, n1).T

# One scatter call per class; this version stores no class labels.
plt.figure()
plt.axis('equal')
plt.scatter(x0, y0)
plt.scatter(x1, y1)
plt.show()

In [None]:
# Second version: dataset with class labels

import numpy as np
import matplotlib.pyplot as plt

# Class 0 (label 0): 50 samples centred at the origin.
n0 = 50
mean0 = [0, 0]
cov0 = [[1, 0], [0, 12]]

# Class 1 (label 1): 40 samples centred at (6, 10), same covariance as class 0.
n1 = 40
mean1 = [6, 10]
cov1 = [[1, 0], [0, 12]]

# Create dataset of shape (n0+n1,3)
#data=
### BEGIN SOLUTION
# Append a constant label column (0 or 1) to each class' samples ...
neg = np.hstack([np.random.multivariate_normal(mean0, cov0, n0),
                 np.zeros((n0,1))])
pos = np.hstack([np.random.multivariate_normal(mean1, cov1, n1),
                 np.ones((n1,1))])

# ... and stack both classes: negatives first, positives after.
data = np.vstack([neg,pos])
### END SOLUTION

assert data.shape == (n0+n1,3), "data has invalid shape {}".format(data.shape)

plt.figure()
plt.axis('equal')
### BEGIN SOLUTION
# The label column (data[:,2]) is used as the colour value of each point.
plt.scatter(data[:,0],data[:,1], c=data[:,2])
### END SOLUTION
plt.show()

## Nearest Neighbor Classification

*Exercises:*
1. Implement a Euclidean distance function (`euclidean_distance`).
1. Implement a function `nearest_neighbor` that finds the nearest neighbor in your dataset for a given point `p`.
1. plot your result (indicating the point and the nearest neighbor). Try different coordinates for `p`

In [None]:
import numpy as np

### BEGIN SOLUTION
def euclidean_distance(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    Both arguments may be single points or stacks of points (array-likes,
    including plain Python lists); they are broadcast against each other
    and the distance is taken along the last axis. Passing a stack of
    shape (N, D) and a point of shape (D,) therefore yields N distances.
    """
    # Coerce both operands so that two plain lists work as well.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(((y - x) ** 2).sum(axis=-1))


def euclidean_distance2(x, y):
    """Same contract as ``euclidean_distance``, computed via ``np.linalg.norm``."""
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.linalg.norm(x - y, axis=-1)
### END SOLUTION

# Two test points whose coordinate differences are 3 and 4, so their
# Euclidean distance is exactly 5.
p = np.asarray([1,3])
q = np.asarray([4,7])

# Check your results for the points (1,3) and (4,7) - distance should be 5.
assert np.round(euclidean_distance(p,q), 3) == 5., "distance between {} and {} is wrong: {}".format(p,q,euclidean_distance(p,q))
# The function must also accept several points at once as first argument:
# the distances from p and from q to q are 5 and 0.
assert np.all(euclidean_distance([p,q],q) == [5,0]) , "distance can not combine arrays with scalars"


In [None]:
def nearest_neighbor(data, predict):
    """Return the row of ``data`` whose features lie closest to ``predict``.

    data is of shape (N,3): data[i,0:2] are features, data[i,2] is the value.
    predict is of shape (2,): the features of a new data point.

    The full row (features and value) is returned. On equal distances the
    earliest row wins; ``None`` is returned if ``data`` has no rows.
    """
    ### BEGIN SOLUTION
    best_row = None
    best_dist = float("inf")
    for row in data:
        # The last column is the label and takes no part in the distance.
        current = euclidean_distance(row[:-1], predict)
        if not current < best_dist:
            continue
        best_row, best_dist = row, current
    return best_row
    ### END SOLUTION
    
# The one-liner (requires an appropriate distance function!):
def nearest_neighbor2(data, predict):
    """Vectorised nearest-neighbor lookup; returns the closest row of ``data``.

    Relies on ``euclidean_distance`` accepting all feature rows at once
    and returning one distance per row.
    """
    features = data[:, :-1]  # drop the label column
    distances = euclidean_distance(features, predict)
    closest = np.argmin(distances)
    return data[closest]


In [None]:
# Query point and its nearest neighbor; nn is a full data row, i.e. the
# two features followed by the label.
p = np.asarray((3,4))
nn = nearest_neighbor2(data,p)

plt.figure()
plt.title("new point {} -> nearest neighbor {}".format(p,nn))
plt.axis('equal')
### BEGIN SOLUTION
# Dataset coloured by label, query point as red star, neighbor as green dot.
plt.scatter(data[:,0],data[:,1], c=data[:,2])
plt.plot(*p, '*', c='red')
# nn[:-1] strips the label so that only the two coordinates are plotted.
plt.plot(*nn[:-1], 'o', c='green')
### END SOLUTION

Exercise: Now implement $k$-nearest neighbor.

Hint: you may use a list to collect neighbors and `sorted()` to find the nearest ones.

Question:
* does increasing $k$ mean that the accuracy goes up?

In [None]:
def k_nearest_neighbors(data, predict, k=3):
    """
    Return the k rows of data whose features are closest to predict.

    data of shape (N,3)
    predict of shape (2,)
    k - the number of neighbors

    The result is a list of at most k full rows (features and label),
    ordered by increasing distance. Rows at equal distance keep their
    order from data.
    """
    ### BEGIN SOLUTION
    distances = [(euclidean_distance(d[:-1], predict), d) for d in data]
    # Sort on the distance only. Sorting the pairs themselves would fall
    # back to comparing the row arrays whenever two distances tie, which
    # raises "truth value of an array is ambiguous".
    distances.sort(key=lambda pair: pair[0])
    return [row for _, row in distances[:k]]
    ### END SOLUTION

# The one-liner ;-)
def k_nearest_neighbors2(data, predict, k=3):
    """Vectorised k-nearest-neighbor lookup.

    Returns the (at most) k rows of ``data`` with the smallest distance
    between their features and ``predict``, nearest first.
    """
    features = data[:, :-1]  # drop the label column
    order = np.argsort(euclidean_distance(features, predict))
    nearest_indices = order[:k]
    return data[nearest_indices]

    
# Query point and number of neighbors to look up in the dataset.
p = np.asarray((3,5))
k = 5
neighbors = k_nearest_neighbors2(data,p,k)

plt.figure()
plt.title("new point {} -> {} nearest neighbors".format(p,k))
plt.axis('equal')
# Dataset coloured by label, query point as red star.
plt.scatter(data[:,0],data[:,1], c=data[:,2])
plt.plot(*p, '*', c='red')
### BEGIN SOLUTION
# Mark every neighbor; nn[:-1] strips the label from the row.
for nn in neighbors:
    plt.plot(*nn[:-1], 'o', c='green')
### END SOLUTION
plt.show()

<a id="lines"></a>
## Lines

Lines (and hyperplanes) play a crucial role in many machine learning approaches (e.g. as linear separatrices). 

In school, lines are usually represented as functions

$$y = m\cdot x + y_0$$

Exercise:
1. Plot a line using matplotlib (on the interval [-10,10])
1. What do the two parameters $m$ and $y_0$ specify?
1. Where does the line intersect with the $x$- and the $y$-axis?
1. How to check if a point $\vec{p}=(x,y)$ is on/above/below the line?

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# Slope and y-intercept of the line y = m*x + y0.
m = .5
y0 = 3

### BEGIN SOLUTION
# Two sample points (the interval ends) are enough to draw a straight line.
x = np.linspace(-10,10,2)
y = m * x + y0

plt.figure()
plt.ylim([-10,10])
plt.plot(x, y)
plt.show()
### END SOLUTION

ad 2. What do the two parameters $m$ and $y_0$ specify?

The value $m$ is the slope of the line, i.e. the ratio $\frac{\Delta y}{\Delta x}$, and $y_0$ is the vertical offset where the line intersects the $y$-axis.

ad 3. Where does the line intersect with the $x$- and the $y$-axis?

Solve $0 = m\cdot x + y_0$ to get the intersection with the $x$-axis: $x=-y_0/m$ (for $m\neq 0$).
Hence the intersection points are $(-y_0/m,0)$ and $(0,y_0)$.

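ad 4. How to check if a point $\vec{p}=(x,y)$ is on/above/below the line?

The point is on the line iff its coordinates fulfill the equation $y = m\cdot x + y_0$. It lies above the line iff $y > m\cdot x + y_0$ and below the line iff $y < m\cdot x + y_0$.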

#### A more general description of a line

However, this representation has some disadvantages:
* it can not express vertical lines
* it is not obvious how to generalize to more dimensions

Hence one uses a more general form:

$$ a\cdot x + b\cdot y + c = 0 $$

Exercises:
1. Draw the line for the given values of $a,b,c$. Also try different values.
1. What parameters do you have to choose for horizontal and vertical lines? Can you draw them with your code?
1. Use the values $m$ and $y_0$ from the previous example to initialize $a,b,c$ to get the same line as in that example.
1. There are many triples $(a,b,c)$ that describe the same line. Can you find two of them? Can you give a criterion to check if two triples are equivalent?

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# Coefficients of the line a*x + b*y + c = 0.
a = 2
b = -2
c = 0

### BEGIN SOLUTION
def my_line(a, b, c, lo=-10, hi=10):
    """Return two points (x, y) of the line a*x + b*y + c = 0.

    The line is solved for the coordinate with the larger coefficient
    (in absolute value), so horizontal (a == 0) and vertical (b == 0)
    lines both work and no division by a small number occurs.

    a, b, c - coefficients of the line equation
    lo, hi  - interval of the free coordinate (default [-10, 10])

    Returns a pair of arrays (x, y), each with the two end points.
    Raises ValueError if a and b are both zero, as the equation then
    does not describe a line.
    """
    if a == 0 and b == 0:
        raise ValueError("a and b must not both be zero")
    if abs(b) > abs(a):
        # Mostly horizontal: x is the free coordinate, solve for y.
        x = np.linspace(lo, hi, 2)
        y = -(a*x + c)/b
    else:
        # Mostly vertical: y is the free coordinate, solve for x.
        y = np.linspace(lo, hi, 2)
        x = -(b*y + c)/a
    return x, y

# Two points on the line a*x + b*y + c = 0, as computed by my_line.
x,y = my_line(a,b,c)
### END SOLUTION

plt.figure()
plt.ylim([-10,10])
plt.plot(x, y)
plt.show()

#### A line specified by a normal vector

Using vector notation, $\vec{n} = (a,b)$ and $\vec{p} = (x,y)$ one can state the equation

$$ a\cdot x + b\cdot y + c = 0 $$

more compactly as

$$\langle \vec{n},\vec{p}\rangle + c = 0$$

where $\langle \_,\_ \rangle$ denotes the inner product (dot product).

Exercises:
1. Show that $\vec{n}$ is a normal vector, i.e., that it is orthogonal to the line.
1. Can you locate the point $\vec{p}_0$ on the line that is closest to the origin?
1. Plot the line and the point $\vec{p}_0$ on the line.
1. What interpretation can be given to the value $c$?


ad 1. Take two points $\vec{p}_1$ and $\vec{p}_2$ on the line. The difference $(\vec{p}_2-\vec{p}_1)$ will then point into the direction of the line. Compute
\begin{align}
\langle \vec{n},(\vec{p}_2-\vec{p}_1)\rangle
&= \langle \vec{n},\vec{p}_2\rangle - \langle \vec{n},\vec{p}_1\rangle \\
&= (\langle \vec{n},\vec{p}_2\rangle + c) - (\langle \vec{n},\vec{p}_1\rangle + c) \\
&= 0-0 \\
&= 0
\end{align}

ad 2. The connection of $\vec{p}_0$ and the origin $\vec{0}$ is orthogonal to the line and hence
 $\vec{p}_0 = s\cdot\vec{n}$ for some $s\in\mathbb{R}$. Compute $s$: as $\vec{p}_0$ is on the line, we know
 
 $$0 = \langle \vec{n},\vec{p}_0\rangle + c = \langle \vec{n},s\cdot\vec{n}\rangle +c = s\cdot \langle \vec{n},\vec{n}\rangle +c = s\cdot \|\vec{n}\|_2^2 +c $$
and hence
 $$\vec{p}_0 = s\cdot\vec{n} = -c\frac{\vec{n}}{\|\vec{n}\|_2^2}$$

In [None]:
# ad 3.
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# Normal vector n and offset c of the line <n,p> + c = 0.
n = np.asarray([.3,.5])
c = 3

# Compute point p0 = ...
### BEGIN SOLUTION
# Point of the line closest to the origin: p0 = -c * n / ||n||^2.
p0 = -c * n/n.dot(n)
### END SOLUTION

def my_line2(n, c, lo=-10, hi=10):
    """Return two points (x, y) of the line <n, p> + c = 0.

    The line is solved for the coordinate whose normal component is
    larger (in absolute value), so horizontal and vertical lines both
    work and no division by a small number occurs.

    n      - normal vector of the line (two components)
    c      - offset of the line equation
    lo, hi - interval of the free coordinate (default [-10, 10])

    Returns a pair of arrays (x, y), each with the two end points.
    Raises ValueError if n is the zero vector, as the equation then
    does not describe a line.
    """
    if n[0] == 0 and n[1] == 0:
        raise ValueError("the normal vector n must not be the zero vector")
    if abs(n[1]) > abs(n[0]):
        # Mostly horizontal: x is the free coordinate, solve for y.
        x = np.linspace(lo, hi, 2)
        y = -(n[0]*x + c)/n[1]
    else:
        # Mostly vertical: y is the free coordinate, solve for x.
        y = np.linspace(lo, hi, 2)
        x = -(n[1]*y + c)/n[0]
    return x, y

# Two points on the line <n,p> + c = 0.
x,y = my_line2(n,c)

plt.figure()
plt.axes().set_aspect('equal')
plt.ylim([-10,10])
plt.plot(x, y)
# Mark p0 (the point of the line closest to the origin) and label it
# with its coordinates.
plt.plot(*p0,'*')
plt.text(*p0,'({:4.2f},{:4.2f})'.format(p0[0],p0[1]))
plt.plot(0,0, '*k') # origin
# Double-headed arrow between origin and p0. The (empty) annotation text
# is passed positionally because its keyword name changed from `s` to
# `text` between Matplotlib versions.
plt.annotate('', xy=p0, xytext=(0,0), arrowprops=dict(arrowstyle='<->'))
# The distance of the line from the origin is |c| / ||n||; without the
# absolute value a negative c would be shown as a negative distance.
plt.text(*(.5*p0),'d={:4.2f}'.format(abs(c)/np.sqrt(n.dot(n))))
plt.show()

ad 4. What interpretation can be given to the value $c$?

From the formula

$$\vec{p}_0 = -c\frac{\vec{n}}{\|\vec{n}\|_2^2}$$
 
we can derive

$$ d := \|\vec{p}_0\|_2 = \left\|-c\frac{\vec{n}}{\|\vec{n}\|_2^2}\right\|_2
= |c|\cdot \frac{\|\vec{n}\|_2}{\|\vec{n}\|_2^2} = \frac{|c|}{\|\vec{n}\|_2}$$

that is, we can recover the closest distance $d$ of the line from the origin by dividing $|c|$ by $\|\vec{n}\|_2$. In the special case of a normalized $\vec{n}$ (i.e., $\|\vec{n}\| = 1$), the value $|c|$ provides this distance directly.

#### A line specified by normal vector and point

Instead of providing the value $c$ one could specify a line by the normal $\vec{n}$ and one point $\vec{p}$ on that line.

Exercises:
1. How can you recover the value $c$ from $\vec{n}$ and $\vec{p}$?
1. Plot the point $\vec{p}$, the normal $\vec{n}$, the line, the origin, and the point $\vec{p}_0$ into one graph.

ad 1. As $\vec{p}$ is on the line, it has to fulfill

$$\langle \vec{n},\vec{p}\rangle + c = 0$$

and hence

$$c = - \langle \vec{n},\vec{p}\rangle$$


In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# The line is given by one point p on it and its normal vector n.
p = np.array([1, 3])
n = np.array([1, -3])

### BEGIN SOLUTION
# p fulfills <n,p> + c = 0, hence c = -<n,p>.
c = -n.dot(p)
# Point of the line closest to the origin: p0 = -c * n / ||n||^2.
p0 = -c * n/n.dot(n)
o = np.zeros(2) # the origin
### END SOLUTION

x,y = my_line2(n,c)

plt.figure()
plt.axes().set_aspect('equal')
plt.ylim([-10,10])
plt.plot(*o,'*k')
plt.plot(*p,'or')
# The normal vector, drawn as an arrow starting at p.
plt.arrow(*p, *n, fc='m', ec='m', head_width=.3, head_length=.4)
plt.plot(x, y)
# zip(o,p0) pairs up the x and the y coordinates, giving the segment
# from the origin to p0.
plt.plot(*zip(o,p0),'g')
plt.show()

### The higher dimensional case

* A $D$-dimensional space is separated into two parts by a hyperplane
  (i.e. a $(D-1)$-dimensional subspace)
* A hyperplane can be described by a point and a normal vector.
* In a $2$-dimensional space, a hyperplane is just a $1$-dimensional subspace (i.e. a line).
* In a $3$-dimensional space, a hyperplane is just a $2$-dimensional subspace (i.e. a plane).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# The plane is given by one point on it and its normal vector.
point  = np.array([1, 2, 3])
normal = np.array([1, 1, 2])

# a plane is a*x+b*y+c*z+d=0
# [a,b,c] is the normal. Thus, we have to calculate
# d and we're set
d = -point.dot(normal)

# create x,y
xx, yy = np.meshgrid(range(10), range(10))

# calculate corresponding z (the plane equation solved for z; this
# requires normal[2] != 0, i.e. a plane that is not vertical)
z = (-normal[0] * xx - normal[1] * yy - d) * 1. /normal[2]

# plot the surface
# Create the 3D axes with add_subplot: passing `projection` to
# Figure.gca() is no longer supported by current Matplotlib releases.
plt3d = plt.figure().add_subplot(111, projection='3d')
plt3d.plot_surface(xx, yy, z)
plt.show()

## Euclidean classifier

*Exercise*:
1. Implement the Euclidean classifier
1. Apply it to your dataset
1. Visualize the result
1. Classify some datapoint and add it to your plot 

In [None]:
def euclidean(data):
    """Fit the Euclidean (nearest class mean) classifier to ``data``.

    Column 2 of ``data`` holds the class label (0 or 1); all columns but
    the last are used as features.

    Returns the pair (w, p): w is the vector from the mean of class 0 to
    the mean of class 1, p is the point halfway between the two means.
    """
    ### BEGIN SOLUTION
    labels = data[:, 2]
    points = data[:, :-1]
    center0 = points[labels == 0].mean(axis=0)
    center1 = points[labels == 1].mean(axis=0)

    normal = center1 - center0
    midpoint = center0 + 0.5 * normal
    ### END SOLUTION
    return normal, midpoint

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# Normal vector n and a point p of the separating line, as returned by
# euclidean().
n, p = euclidean(data)

plt.figure()
# NOTE(review): both calls below request equal scaling of the axes;
# presumably one of them suffices - confirm.
plt.axes().set_aspect('equal')
plt.axis('equal')

### BEGIN SOLUTION
# Convert (normal, point) to the form <n,x> + c = 0 used by my_line2.
c = -n.dot(p)
# p0 is only needed by the commented-out plot command below.
p0 = -c * n/n.dot(n)
o = np.zeros(2) # the origin
x,y = my_line2(n,c)

plt.plot(*o,'*k')
plt.plot(*p,'or')
# The normal vector, drawn as an arrow starting at p.
plt.arrow(*p, *n, fc='m', ec='m', head_width=.3, head_length=.4)
plt.scatter(data[:,0],data[:,1], c=data[:,2])
plt.plot(x, y)
#plt.plot(*zip(o,p0),'g')
### END SOLUTION
plt.show()

## LDA

*Exercise*:
1. Implement the LDA (ML-09, slide 11) Hint: you may use `np.cov`, `np.linalg.inv`, and `np.dot` (`@`)
1. Apply it to your dataset (make sure your dataset fulfills the conditions)
1. Visualize the result
1. Classify some datapoint and add it to your plot 

In [None]:
def LDA(data):
    """Fit a two-class linear discriminant (LDA) to ``data``.

    Column 2 of ``data`` holds the class label (0 or 1); all columns but
    the last are used as features. LDA assumes that both classes share
    one covariance matrix, so that matrix is estimated from both classes
    together (pooled within-class covariance) rather than from class 0
    alone.

    Returns the pair (w, p): w = sigma^-1 (mean1 - mean0) is the normal
    vector of the separating hyperplane, p is the point halfway between
    the two class means.
    """
    ### BEGIN SOLUTION
    features0 = data[data[:,2]==0][:,:-1]
    mean0 = features0.mean(0)

    features1 = data[data[:,2]==1][:,:-1]
    mean1 = features1.mean(0)

    # Pooled covariance: sum the scatter of each class around its own
    # mean and divide by N - 2 (one degree of freedom per estimated mean).
    centered0 = features0 - mean0
    centered1 = features1 - mean1
    scatter = centered0.T @ centered0 + centered1.T @ centered1
    sigma = scatter / (len(centered0) + len(centered1) - 2)

    w = np.linalg.inv(sigma) @ (mean1 - mean0)
    p =  0.5 * (mean1+mean0)
    ### END SOLUTION

    return w,p

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

# Normal vector n and a point p of the separating line, as returned by LDA().
n, p = LDA(data)

plt.figure()
# NOTE(review): both calls below request equal scaling of the axes;
# presumably one of them suffices - confirm.
plt.axes().set_aspect('equal')
plt.axis('equal')
### BEGIN SOLUTION

# Convert (normal, point) to the form <n,x> + c = 0 used by my_line2.
c = -n.dot(p)
# p0 is only needed by the commented-out plot command below.
p0 = -c * n/n.dot(n)
o = np.zeros(2) # the origin

x,y = my_line2(n,c)

plt.plot(*o,'*k')
plt.plot(*p,'or')
# The normal vector, drawn as an arrow starting at p.
plt.arrow(*p, *n, fc='m', ec='m', head_width=.3, head_length=.4)
plt.scatter(data[:,0],data[:,1], c=data[:,2])
plt.plot(x, y)
#plt.plot(*zip(o,p0),'g')
### END SOLUTION
plt.show()