# Lab 5: Clustering

In [None]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

from scipy.cluster import hierarchy



In this lab we will explore both k-means clustering and hierarchical clustering. First we will work with some synthetic data that we generate, and in the lab assignment you will work with baseball data. 

### K-Means Clustering

First we generate some synthetic data to work with. 

Here we randomly generate some values by sampling from a standard Normal distribution. There are 50 observations, with 2 features each. 

We then shift some of the observations so that the data contain two groups. For the first 25 observations, we increase the first feature by 3 and decrease the second feature by 4. The last 25 observations are left unchanged. 

In [None]:
# Generate data: 50 observations with 2 standard-normal features each.
np.random.seed(2)
X = np.random.standard_normal((50,2))

# Raw data before any shift. Column 0 goes on the horizontal axis so that this
# figure matches the 'X'/'Y' column labels used for the DataFrame below.
fig, ax = plt.subplots()
ax.scatter(X[:,0], X[:,1])
ax.set(title='Synthetic data before shifting', xlabel='X', ylabel='Y')
plt.show()

# Move the first 25 observations away from the rest:
# +3 on the first feature and -4 on the second feature.
X[:25,0] += 3
X[:25,1] -= 4

# Same view after the shift; the two groups should now be visibly separated.
fig, ax = plt.subplots()
ax.scatter(X[:,0], X[:,1])
ax.set(title='Synthetic data after shifting the first 25 observations',
       xlabel='X', ylabel='Y')
plt.show()

# Save the shifted data to disk (the row index is written as the first column).
df = pd.DataFrame(X, columns=['X', 'Y'])
df.to_csv('sample-data.csv')

# Keep the preview as the cell's last expression so it is actually displayed.
df.head()

#### First we will try K-means with K = 2

In [None]:
# K-means with two clusters, restarted from 20 different initializations.
# random_state pins the initializations so that re-running this cell on its
# own gives the same labels, instead of depending on the current state of
# NumPy's global random generator.
km1 = KMeans(n_clusters=2, n_init=20, random_state=2)
km1.fit(X)

In [None]:
# Cluster label assigned to each observation by the K=2 fit.
km1.labels_

See plot for K=2 below.

#### Now we will try K = 3

In [None]:
# K-means with three clusters, restarted from 20 different initializations.
# The seed is handed to the estimator through random_state rather than by
# reseeding NumPy's global generator, so this cell no longer changes random
# state that other cells might rely on.
km2 = KMeans(n_clusters=3, n_init=20, random_state=4)
km2.fit(X)

In [None]:
# Count how many observations were assigned to each label in the K=3 fit.
cluster_sizes = pd.Series(km2.labels_).value_counts()
cluster_sizes

In [None]:
# Coordinates of the fitted cluster centers for the K=3 model.
km2.cluster_centers_

In [None]:
# Cluster label assigned to each observation by the K=3 fit.
km2.labels_

In [None]:
# Sum of squared distances of samples to their closest cluster center
# (the within-cluster sum of squares that K-means minimizes).
km2.inertia_

Let's visualize the clusterings with k=2 and k=3, and show the cluster centroids. 

In [None]:
# Side-by-side view of the two K-means fits: observations are colored by their
# assigned label and each fitted centroid is marked with a black '+'.
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,5))

panels = [
    (ax1, km1, 'K-Means Clustering Results with K=2'),
    (ax2, km2, 'K-Means Clustering Results with K=3'),
]

for panel_ax, model, panel_title in panels:
    centers = model.cluster_centers_
    panel_ax.scatter(X[:,0], X[:,1], s=40, c=model.labels_, cmap=plt.cm.prism)
    panel_ax.set_title(panel_title)
    panel_ax.scatter(centers[:,0], centers[:,1], marker='+', s=100, c='k', linewidth=2)

### Hierarchical Clustering

For hierarchical clustering, we will use scipy instead of sklearn.

We will try three types of linkages. 

In [None]:
# One dendrogram per linkage method, stacked vertically.
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

panels = [
    ('complete', 'Complete Linkage', ax1),
    ('average', 'Average Linkage', ax2),
    ('single', 'Single Linkage', ax3),
]

for method, title, ax in panels:
    # hierarchy.linkage accepts the raw observation matrix directly and uses
    # Euclidean distance between observations by default.
    linkage_matrix = hierarchy.linkage(X, method=method)
    # color_threshold=0 draws every link in the same color.
    hierarchy.dendrogram(linkage_matrix, ax=ax, color_threshold=0)
    ax.set_title(title)

We can get cluster assignments by cutting the tree. A strength of hierarchical clustering is that we can choose different numbers of clusters from the tree. Here we show cluster assignments with two clusters and with three clusters. 

In [None]:
# Build the complete-linkage tree, then cut it so that two clusters remain.
complete_tree = hierarchy.complete(X)
cuts_two = hierarchy.cut_tree(complete_tree, n_clusters=[2])
print(cuts_two)

In [None]:
# Build the complete-linkage tree, then cut it so that three clusters remain.
complete_tree = hierarchy.complete(X)
cuts_three = hierarchy.cut_tree(complete_tree, n_clusters=[3])
print(cuts_three)

# Lab Assignment

For your lab assignment, you will do clustering with the baseball data. Carry out the following steps. 
* Use just the features 'Hits' and 'PutOuts' from the Hitters data.
* Scale the features. 
* Create a scatterplot showing 'Hits' vs. 'PutOuts'.
* Do K-Means clustering with k=2.
* Do K-Means clustering with k=3.
* Create two new scatterplots showing the clusterings with k=2 and k=3.