In [378]:
# You will use data set 1 (with no missing values) for simple clustering.  
# Select 2 clustering algorithms (or more) from a library. 
# Run your algorithms on the data set. 
# Explore and compare using interactive visualizations. 
# Discuss how the results differ between algorithms, how each algorithm's parameters affect the outcome, what is missing, etc.

In [379]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AgglomerativeClustering
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import widgetbox, column, row
from bokeh.models import Select, CustomJS, ColumnDataSource, HoverTool
from bokeh.models.widgets import Select

In [380]:
# Configure Bokeh to render plots inline in the notebook; must run before any show() call.
output_notebook()

In [381]:
# Prepare Dataset
attr = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']
channel = ['Horeca', 'Retail']
x_train = pd.read_csv("Wholesale customers data.csv")

# Following HW2, the 'Region' column is dropped: most rows fall in the 'Other'
# region, so it carries little information and only adds a dimension.
# 'Channel' is categorical, unlike the spending columns, so it is split off
# into its own series instead of being fed to the clustering algorithms.
x_train.pop('Region')
x_channel = x_train.pop('Channel')

# Derive sizes from the data instead of hard-coding 440 rows / 6 attributes.
n_samples, n_attrs = x_train.shape
MAX_CLUSTERS = 10  # the plotting cell's colour palette has exactly 10 entries
SEED = 0           # fixed seed so KMeans colours are stable across re-runs

# One polyline per customer for the parallel-coordinates plot:
# x holds attribute positions 1..n_attrs, y holds the customer's values.
# Plain lists (not range objects) so the data source can serialize them.
x = [list(range(1, n_attrs + 1)) for _ in range(n_samples)]
y = x_train.values.tolist()
color = ['blue'] * n_samples


def _label_table(make_model, data, max_clusters):
    """Return a len(data) x max_clusters nested list of cluster labels.

    Column ``c - 1`` holds the labels produced by ``make_model(c)`` for
    ``c`` in ``2..max_clusters``; column 0 stays all zeros, i.e. every row
    belongs to the single cluster 0 when one cluster is requested.
    Labels are converted to plain Python ints via ``tolist()``.
    """
    table = np.zeros((len(data), max_clusters), dtype=int)
    for n_clusters in range(2, max_clusters + 1):
        table[:, n_clusters - 1] = make_model(n_clusters).fit_predict(data)
    return table.tolist()


# Placeholder table of zeros (kept for compatibility with the original cell).
k = [[0] * MAX_CLUSTERS for _ in range(n_samples)]

# KMeans
k1 = _label_table(lambda n: KMeans(n_clusters=n, random_state=SEED),
                  x_train, MAX_CLUSTERS)
# Agglomerative
k2 = _label_table(lambda n: AgglomerativeClustering(n_clusters=n),
                  x_train, MAX_CLUSTERS)

# Only element 0 is read by the JS callback: it stores the currently selected
# column of the label table (number of clusters minus one).
counter = [0] * n_samples

# 'k' is the label table currently displayed; it starts out as the KMeans one.
source = ColumnDataSource(data={'x':x, 'y':y, 'color':color, 'k':k1, 'k1':k1, 'k2':k2, 'c':counter})

In [382]:
# Parallel Coordinates
# Each customer is drawn as one polyline across the attribute positions; the
# line colour encodes the cluster assigned by the selected algorithm / k.
plot = figure(plot_width=800, plot_height=600, x_axis_label = 'Attribute', y_axis_label = 'Number')
plot.background_fill_color = "lightgrey"
plot.background_fill_alpha = 0.2

plot.multi_line(xs='x', ys='y', color='color', alpha=0.4, line_width=2, source=source)

# Shared by both dropdowns: cb_obj.value is either an algorithm name or the
# requested number of clusters as a string.
#  - algorithm name  -> point data['k'] at that algorithm's label table
#  - number "1".."10" -> store the matching column index (n - 1) in data['c'][0]
# Afterwards every line is recoloured from the active table and column.
callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var color = data['color'];
    var palette = ["blue","red","green","yellow","darkred","purple","black","indigo","orange","cyan"];
    var choice = cb_obj.value;

    if (choice === "KMeans") {
        data['k'] = data['k1'];
    } else if (choice === "Agglomerative") {
        data['k'] = data['k2'];
    } else {
        var n = parseInt(choice, 10);
        if (!isNaN(n) && n >= 1 && n <= palette.length) {
            data['c'][0] = n - 1;
        }
    }

    var column = data['c'][0];
    var labels = data['k'];
    for (var i = 0; i < color.length; i++) {
        color[i] = palette[labels[i][column]];
    }
    source.change.emit();
""")

select1 = Select(title="Clustering Algorithm:", value="KMeans", options=["KMeans", "Agglomerative"])
select2 = Select(title="Number of Clusters:", value="1", options=['1','2','3','4','5','6','7','8','9','10'])
select1.js_on_change('value', callback)
select2.js_on_change('value', callback)

show(column(row(widgetbox(select1,width=200),widgetbox(select2,width=200)),plot))