In [None]:
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.stats import norm, gaussian_kde
import KDEpy
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

In [None]:
# Load the CSV (relative path) into `data`; the bare name on the last line
# makes the notebook render the frame as this cell's output.
DATA_CSV = 'data_KDE_2D.csv'
data = pd.read_csv(DATA_CSV)
data

In [None]:
# Plain-text dump of the summary statistics followed by the leading rows.
# The headings carry explicit \r\n breaks, exactly as printed before.
for heading, table in (
    ("Data Describe:\r\n", data.describe()),
    ("\r\nData Head:\r\n", data.head()),
):
    print(heading, table)

# Positional row numbers 0..n-1 (not used inside this cell).
xval = np.arange(data.shape[0])

# Raw scatter of the two coordinate columns.
plt.scatter(data['X'], data['Y'])

#### Find an appropriate k for KMeans clustering, following the method described on [Towards Data Science](https://towardsdatascience.com/elbow-method-is-not-sufficient-to-find-best-k-in-k-means-clustering-fc820da0631d)

In [None]:
# Elbow search: wrap a seeded KMeans estimator in yellowbrick's
# KElbowVisualizer and let it sweep the k range given below.
ELBOW_K_RANGE = (2, 10)

km = KMeans(random_state=42)
visualizer = KElbowVisualizer(km, k=ELBOW_K_RANGE)

# Fit on the full frame, then finalize and render the elbow figure.
visualizer.fit(data)
visualizer.show()

### No clear elbow point can be found! Let's try the silhouette method. (Judging the data points by eye, k should be around 2.)

In [None]:
# Silhouette plots for k = 2..7, one panel per k on a 3x2 grid.
fig, ax = plt.subplots(3, 2, figsize=(15,8))

# ax.flat walks the grid row by row, so k=2 lands top-left and k=7
# bottom-right -- the same panel order as before, without index arithmetic.
for k, panel in zip(range(2, 8), ax.flat):
    # One KMeans instance per candidate number of clusters.
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)

    # Wrap the estimator in a SilhouetteVisualizer bound to this panel and fit it.
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
    visualizer.fit(data)

    # fit() only draws the silhouettes; finalize() applies the title, axis
    # labels and legend (yellowbrick Visualizer API), so each panel shows
    # which k it belongs to.
    visualizer.finalize()

# Keep the per-panel titles and axis labels from overlapping.
fig.tight_layout()

"""
The following conditions should be checked to pick the right ‘K’ using the Silhouette plots:

1.  For a particular K, every cluster should have a Silhouette score above the average score of the dataset (represented by the red dotted line).
    The x-axis represents the Silhouette score. (This must hold for all clusters!)
2.  There should not be wide fluctuations in the size of the clusters. The width of each cluster represents its number of data points.
    K = 2 gives the best fit here, since for the other values of K the cluster sizes vary noticeably.
"""

In [None]:
# So take k=2

# This cell writes a 'label' column back into `data`. Drop it (if present)
# before fitting so that re-running the cell does not cluster on its own
# previous output; on a first run nothing is dropped.
features = data.drop(columns='label', errors='ignore')

# Seeded like the other KMeans instances in this notebook so the label
# assignment is reproducible between runs.
kmean = KMeans(n_clusters=2, random_state=42).fit(features)
labels = kmean.predict(features)
data['label'] = labels

# Scatter of the points coloured by their assigned cluster.
plt.scatter(data['X'], data['Y'], cmap='viridis', c=labels)
plt.show()

In [None]:
x = data['X']
y = data['Y']

# Generate 2D histogram (H contains the count).
# np.histogram2d bins x along axis 0 and y along axis 1,
# i.e. H[i, j] is the count for x-bin i and y-bin j.
H, xedges, yedges = np.histogram2d(x, y, bins=10)

# Create 3D plot.
# np.meshgrid defaults to 'xy' indexing: X varies along axis 1 and Y along
# axis 0 -- the opposite axis order to H.
X, Y = np.meshgrid(xedges, yedges)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Count')
ax.set_title('Histogram')
# Plot H transposed so counts line up with the meshgrid axes; passing H
# directly swaps the X and Y directions of the surface (silently, because
# the bin grid is square). Each count is anchored at its bin's lower edges,
# hence the [:-1, :-1] slices on the edge grids.
ax.plot_surface(X[:-1, :-1], Y[:-1, :-1], H.T, cmap='viridis')
ax.view_init(elev=45, azim=320)

plt.show()