In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas
import plotly.figure_factory as ff

In [2]:
# Read the iris measurements from the CSV file into a DataFrame and preview it
file = "new_iris_data.csv"
df_iris = pd.read_csv(filepath_or_buffer=file)
df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [3]:
# Standardize every column of df_iris with StandardScaler,
# then print the first five scaled rows as a sanity check.

scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)
print(iris_scaled[:5])

[[-0.90068117 -1.3412724   1.03205722 -1.31297673]
 [-1.14301691 -1.3412724  -0.1249576  -1.31297673]
 [-1.38535265 -1.39813811  0.33784833 -1.31297673]
 [-1.50652052 -1.2844067   0.10644536 -1.31297673]
 [-1.02184904 -1.3412724   1.26346019 -1.31297673]]


In [4]:
# Initialize the PCA model, configured to keep two principal components
# (the model is only fitted in a later cell).
pca = PCA(n_components=2)

In [13]:
# Fit the PCA model on the standardized data and project that data onto
# the two principal components.
iris_pca = pca.fit_transform(iris_scaled)

In [14]:
# Wrap the PCA output in a DataFrame, one named column per component

pc_columns = ["principal component 1", "principal component 2"]
df_iris_pca = pd.DataFrame(iris_pca, columns=pc_columns)
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [15]:
# With the data reduced by PCA, hierarchical clustering starts with a dendrogram.
# color_threshold=0 makes all the branches the same color.

# Build the dendrogram, size it, and render it
dendrogram_size = {"width": 800, "height": 500}
fig = ff.create_dendrogram(df_iris_pca, color_threshold=0)
fig.update_layout(**dendrogram_size)
fig.show()

In [16]:
# Decide how many clusters to cut the hierarchy into.
# Remember: the higher the horizontal lines in the dendrogram, the less
# similarity there is between the clusters they join.
# The iris dataset is known to contain three clusters, so n_clusters=3 is
# requested directly instead of choosing a distance cutoff by eye (the original
# note suggested a cutoff of about five, which seems high in terms of distances;
# picking that line is one of the difficulties when using a dendrogram).
#
# Fit only on the two principal-component columns. A later cell adds a "class"
# column to df_iris_pca; selecting the features explicitly keeps this cell
# idempotent, so re-running it never clusters on previously assigned labels.

agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(df_iris_pca[["principal component 1", "principal component 2"]])

In [17]:
# The previous cell set up the model: because this is a dataset we are already
# familiar with, three clustered groups were expected, so 3 was passed as the
# n_clusters parameter, and the model was fit against the df_iris_pca DataFrame.
# Add a class column that will be used to identify the clusters
# (note: this mutates df_iris_pca in place).

df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [18]:
# Finally, plot the hierarchical clustering result: the two principal components
# on the axes, points grouped by their assigned "class" label.

scatter_kwargs = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**scatter_kwargs)

## Re-run the algorithm using different cutoffs from the dendrogram.

In [22]:
# Re-run the clustering with a different number of clusters (6 here; other
# values such as 4 can be tried the same way).
#
# Fit only on the two principal-component columns: by this point df_iris_pca
# also holds the "class" labels from the previous run, and passing the whole
# DataFrame would feed those old labels back in as a third feature, distorting
# the new clustering.
agg = AgglomerativeClustering(n_clusters=6)
model = agg.fit(df_iris_pca[["principal component 1", "principal component 2"]])

In [23]:
# Overwrite the "class" column with the labels from the re-fitted model
# (in-place update of df_iris_pca), then preview the result.
df_iris_pca["class"] = model.labels_
df_iris_pca.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0


In [24]:
# Plot the re-clustered data: principal components on the axes,
# points grouped by the updated "class" label.
scatter_kwargs = {
    "x": "principal component 1",
    "y": "principal component 2",
    "hover_cols": ["class"],
    "by": "class",
}
df_iris_pca.hvplot.scatter(**scatter_kwargs)