In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import make_moons
from scipy.cluster.hierarchy import dendrogram
from IPython.core.display import Image

### Agglomerative Clustering Types
 * Moons

https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering

In [None]:
# Baseline: run KMeans (k=2) on the two-moons points and colour by cluster.
# Column labels are passed as a flat list; a nested list ([["x", "y"]]) would
# make pandas build a MultiIndex instead of two plain string columns.
df = pd.DataFrame(make_moons(noise=0.05)[0], columns=["x", "y"])
df["color"] = KMeans(n_clusters=2).fit_predict(df[["x", "y"]])
df.plot.scatter(x="x", y="y", c=df["color"], vmin=-1)

In [None]:
# Build the 2-cluster model, fit it on the moon coordinates (fit() returns the
# estimator itself, so `km` is the fitted model), then display the centroids.
km = KMeans(n_clusters=2)
km.fit(df[["x", "y"]])
km.cluster_centers_

In [None]:
# ward linkage (default)
# Flat column labels: a nested list would create MultiIndex columns.
df = pd.DataFrame(make_moons(noise=0.05)[0], columns=["x", "y"])
df["color"] = AgglomerativeClustering(n_clusters=2, linkage="ward").fit_predict(df[["x", "y"]])
df.plot.scatter(x="x", y="y", c=df["color"], vmin=-1)

In [None]:
# single linkage
# Flat column labels: a nested list would create MultiIndex columns.
df = pd.DataFrame(make_moons(noise=0.05)[0], columns=["x", "y"])
df["color"] = AgglomerativeClustering(n_clusters=2, linkage="single").fit_predict(df[["x", "y"]])
df.plot.scatter(x="x", y="y", c=df["color"], vmin=-1)

In [None]:
# average linkage
# Flat column labels: a nested list would create MultiIndex columns.
df = pd.DataFrame(make_moons(noise=0.05)[0], columns=["x", "y"])
df["color"] = AgglomerativeClustering(n_clusters=2, linkage="average").fit_predict(df[["x", "y"]])
df.plot.scatter(x="x", y="y", c=df["color"], vmin=-1)

In [None]:
# complete linkage
# Flat column labels: a nested list would create MultiIndex columns.
df = pd.DataFrame(make_moons(noise=0.05)[0], columns=["x", "y"])
df["color"] = AgglomerativeClustering(n_clusters=2, linkage="complete").fit_predict(df[["x", "y"]])
df.plot.scatter(x="x", y="y", c=df["color"], vmin=-1)

## Clustering Counties (Agglomerative)

In [None]:
# Load counties.geojson; later cells use its NAME column and the
# forest / crops / pasture / developed columns. Note: this rebinds `df`,
# replacing the moons DataFrame from the cells above.
df = gpd.read_file("counties.geojson")

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# Cluster the counties into 4 groups using four feature columns.
# compute_distances=True makes the fitted model expose `distances_`,
# which the linkage-matrix cells further down rely on.
c = AgglomerativeClustering(n_clusters=4, compute_distances=True)
groups = c.fit_predict(
    df[["forest", "crops", "pasture", "developed"]]
)
# Colour each county on the map by its cluster label.
df.plot(column=groups, cmap="tab10")

### Tree Recursion: Node Count

In [None]:
# Display the reference figure children.png (rendered 400 wide).
Image("children.png", width=400)

In [None]:
# c.children_

In [None]:
# First row of children_: the pair of node indices joined by the first merge.
# Indices below len(df) are leaves, i.e. rows of df.
c.children_[0]

In [None]:
# Show rows 44 and 62 side by side — presumably the two leaves reported by
# c.children_[0] above; confirm against that cell's output.
df.iloc[[44,62]]

In [None]:
# Number of rows in df = number of samples the model was fit on, i.e. the
# number of leaves in the cluster tree.
print(len(df))

In [None]:
# Children of node 73: a non-leaf node i is stored at row i - len(df)
# of children_, because the first len(df) node ids are the leaves.
c.children_[73-len(df)]

In [None]:
# Full merge table: one row (left child, right child) per non-leaf node.
c.children_

In [None]:
# Show children.png again for reference while writing the recursion below.
Image("children.png", width=400)

In [None]:
def node_count(node_idx, children=None, n_leaves=None):
    """Return how many nodes are in the subtree rooted at ``node_idx``.

    Leaves and internal (merge) nodes are both counted, so a leaf yields 1
    and a merge of two leaves yields 3.

    Parameters
    ----------
    node_idx : int
        Node id. Ids below ``n_leaves`` are leaves; id ``i >= n_leaves`` is
        the merge stored at ``children[i - n_leaves]``.
    children : sequence of (left, right) pairs, optional
        Merge table. Defaults to the notebook's fitted model, ``c.children_``.
    n_leaves : int, optional
        Number of leaves. Defaults to ``len(df)``.

    Returns
    -------
    int
        Total number of nodes in the subtree, including the root itself.
    """
    # Fall back to the notebook globals so existing one-argument calls
    # keep working unchanged.
    if children is None:
        children = c.children_
    if n_leaves is None:
        n_leaves = len(df)

    # print(node_idx)
    if node_idx < n_leaves:
        # A leaf is a subtree of exactly one node.
        return 1

    left, right = children[node_idx - n_leaves]
    # Nodes below the left child + nodes below the right child + this node.
    return (
        node_count(left, children, n_leaves)
        + node_count(right, children, n_leaves)
        + 1
    )

# Try it on node 72 — presumably the first merged node (assumes len(df) == 72;
# check the printed length above).
node_count(72)

In [None]:
# The highest node id, len(df) + len(c.children_) - 1, is the last merge
# (the root), so this counts every node in the tree.
node_count(len(df) + len(c.children_) - 1)

In [None]:
# c.children_

## Linkage Matrix

In [None]:
# Display the reference figure linkage.png (rendered 400 wide).
Image("linkage.png", width=400)

In [None]:
# Turn the merge distances into a single-column 2-D array so they can be
# concatenated next to children_ (column 3 of the linkage matrix).
distances = c.distances_.reshape(-1, 1)
# distances  (uncomment to inspect)

In [None]:
# Column 4 of a SciPy linkage matrix is the number of ORIGINAL observations
# (leaves) in each merged cluster, not the total number of tree nodes.
# Every merge has exactly two children, so a subtree with k leaves contains
# 2k - 1 nodes; invert that to recover k from node_count's result.
counts = [
    (node_count(node_idx) + 1) // 2
    for node_idx in range(len(df), len(df) + len(c.children_))
]
# Single-column 2-D array, ready to be concatenated with the other columns.
counts = np.array(counts).reshape(-1, 1)

In [None]:
# Assemble the linkage matrix by placing the pieces side by side:
#   columns 1 & 2 -> c.children_ (the two merged node ids)
#   column 3      -> distances
#   column 4      -> counts
linkage = np.hstack((c.children_, distances, counts))

linkage

## Dendrogram

In [None]:
# Draw the hierarchy described by `linkage` on a wide figure, labelling each
# leaf with the corresponding value from the NAME column.
fig, ax = plt.subplots(figsize =(12,4))
dendrogram(linkage, labels=df["NAME"].values, ax=ax)
ax.tick_params(labelsize=13)
# Bare None as the last expression: the cell shows the figure but no Out[] value.
None

In [None]:
# Display the county table to look up values for the names in the dendrogram.
df

In [None]:
# Index a copy of the table by NAME so rows can be selected by county name
# with .loc; df itself is left unchanged.
df1= df.set_index("NAME")

In [None]:
# Compare the last two counties in the dendrogram to see how similar
# their rows are.
df1.loc[["Wood County", "Marquette County"]]

In [None]:
# Put the same two counties next to the first county in the dendrogram
# to see how much they differ from it.
df1.loc[["Wood County", "Marquette County", "Bayfield County"]]

## Redraw an image more simply, using only 4 colors

   - capital.jpg: https://en.wikipedia.org/wiki/Madison,_Wisconsin

In [None]:
# NOTE(review): plt is already imported in the notebook's first cell; this
# repeated import is redundant but harmless.
import matplotlib.pyplot as plt
# Read the photo into an array, print its shape, then display it.
img = plt.imread("capital.jpg")
print(img.shape)
plt.imshow(img)

In [None]:
# img

In [None]:
# Flatten the image into a table with 3 columns — one row per pixel, assuming
# the last axis of img holds 3 colour channels (see the shape printed above).
tbl = img.reshape(-1, 3)
tbl

In [None]:
# Cluster the rows of tbl into 4 groups; each centroid becomes one colour
# of the simplified image. Note: this rebinds `km` from the moons section.
km = KMeans(4)
km.fit(tbl)

In [None]:
# The 4 centroids: one row per cluster, one column per column of tbl.
km.cluster_centers_

In [None]:
# Show the centroids as a 1-row image (a colour palette). Dividing by 255
# rescales to floats for imshow — assumes pixel values are 0-255; TODO confirm.
plt.imshow(km.cluster_centers_.reshape(1, -1, 3)/255)

In [None]:
# Cluster label for every row of tbl. Note: this rebinds `groups`, which
# previously held the county cluster labels.
groups = km.predict(tbl)
groups

In [None]:
# Indexing demo: a list of row indices selects centroid rows, repeats allowed
# (here row 0 twice, then row 1 three times).
km.cluster_centers_[[0,0,1,1,1]]

In [None]:
# Same indexing trick with the labels: replace every row of tbl by the
# centroid of the cluster it was assigned to.
img2 = km.cluster_centers_[groups]
img2

In [None]:
# Restore the original image shape and display the 4-colour version.
# Dividing by 255 rescales for imshow — assumes 0-255 pixel values; TODO confirm.
plt.imshow(img2.reshape(img.shape)/255)