In [25]:
# You will use data set 1 (with no missing values) for simple clustering.  
# Select 2 clustering algorithms (or more) from a library. 
# Run your algorithms on the data set. 
# Explore and compare using interactive visualizations. 
# Discuss how the results differ, what each algorithm's parameters affect, what is missing, etc.

In [26]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AgglomerativeClustering
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import widgetbox, column, row
from bokeh.models import Select, CustomJS, ColumnDataSource, HoverTool
from bokeh.models.widgets import Select, CheckboxGroup

In [27]:
# Configure Bokeh to render plots inline in the notebook; the later show() calls rely on this.
output_notebook()

In [28]:
# Prepare Dataset
attr = ['Channel','Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']
channel = ['Horeca', 'Retail']
x_train = pd.read_csv("Wholesale customers data.csv")

# Following HW2, 'Region' is dropped: most rows fall into the 'Other' region,
# so it adds a dimension without adding much information.
# 'Channel' is a categorical label rather than a spending amount, so it is
# kept aside (x_channel) and not fed to the clustering algorithms.
x_train.pop('Region')
x_channel = x_train.pop('Channel')

# Derive sizes from the data instead of hard-coding 440 rows / 6 attributes.
n_rows, n_attrs = x_train.shape
MAX_CLUSTERS = 10   # column n-1 of a label table holds the labels for n clusters
SEED = 0            # fixed so KMeans yields the same clusters on every re-run

# One polyline per customer: x is the attribute position (1..n_attrs),
# y is the customer's spending on each attribute.
# Real lists (not range objects) so the data source serialises on Python 3.
x = [list(range(1, n_attrs + 1)) for _ in range(n_rows)]
y = x_train.values.tolist()
color = ['blue'] * n_rows

# Currently selected label-table column (only element 0 is read by the callbacks).
counter = [0] * n_rows


def _cluster_label_table(make_model):
    """Return an n_rows x MAX_CLUSTERS table of cluster labels.

    make_model(n) must return an estimator exposing fit_predict and configured
    for n clusters. Column n-1 holds the labels for n clusters; column 0 stays
    all zeros, i.e. the trivial single-cluster solution.
    """
    table = [[0] * MAX_CLUSTERS for _ in range(n_rows)]
    for n_clusters in range(2, MAX_CLUSTERS + 1):
        labels = make_model(n_clusters).fit_predict(x_train)
        for row, label in zip(table, labels):
            # Plain int instead of a numpy scalar keeps the table JSON-friendly.
            row[n_clusters - 1] = int(label)
    return table


# All-zero placeholder; the data source below starts out showing the KMeans table.
k = [[0] * MAX_CLUSTERS for _ in range(n_rows)]
# KMeans
k1 = _cluster_label_table(lambda n: KMeans(n_clusters=n, random_state=SEED))
# Agglomerative
k2 = _cluster_label_table(lambda n: AgglomerativeClustering(n_clusters=n))

source = ColumnDataSource(data={'x':x, 'y':y, 'color':color, 'k':k1, 'k1':k1, 'k2':k2, 'c':counter})

In [29]:
# Parallel Coordinates
plot = figure(plot_width=800, plot_height=600, x_axis_label = 'Attribute', y_axis_label = 'Number')
plot.background_fill_color = "lightgrey"
plot.background_fill_alpha = 0.2

plot.multi_line(xs='x', ys='y', color='color', alpha=0.4, line_width=2.5, source=source)

select1 = Select(title="Clustering Algorithm:", value="KMeans", options=["KMeans", "Agglomerative"])
select2 = Select(title="Number of Clusters:", value="1", options=['1','2','3','4','5','6','7','8','9','10'])

# One callback serves both widgets. It reads the current value of each select
# directly, so the result never depends on which widget fired or on state
# stashed inside the data source.
callback = CustomJS(args=dict(source=source, algo_select=select1, count_select=select2), code="""
    var data = source.data;
    var color = data['color'];
    // Palette index == cluster label.
    var palette = ["blue","red","green","yellow","darkred","purple","black","indigo","orange","cyan"];
    // 'k1' holds the KMeans labels, 'k2' the Agglomerative labels.
    var labels = (algo_select.value === "Agglomerative") ? data['k2'] : data['k1'];
    // Column n-1 of a label table holds the labels for n clusters.
    var column = parseInt(count_select.value, 10) - 1;
    for (var i = 0; i < color.length; i++) {
        color[i] = palette[labels[i][column]];
    }
    source.change.emit();
""")

select1.js_on_change('value', callback)
select2.js_on_change('value', callback)

show(column(plot,row(widgetbox(select1,width=250),widgetbox(select2,width=200))))

In [30]:
# This plot is called Parallel Coordinates.
# I like it because it provides a general view of the clustering of the data.
# Each line represents one data point with multiple attributes: the X-axis is the attribute and the Y-axis is its value.
# Different clusters are drawn in different colors.
# However, this is a broad view of the whole data set.
# It can be too general at times, so we may miss some important details.

In [31]:
# So, to narrow the scope, we look at the relation between each pair of attributes.
# We can draw the data as scatter points and color them by cluster.

# Reload the full table: this plot also needs the 'Channel' column, which was
# removed from x_train before clustering.
x_train = pd.read_csv("Wholesale customers data.csv")
n_points = len(x_train)

# One glyph (and one pair of columns 'x<slot>'/'y<slot>') per cluster label.
# Slot s (1-based) shows the points whose cluster label is s-1.
cluster_colors = ["blue", "red", "green", "yellow", "darkred",
                  "purple", "black", "indigo", "orange", "cyan"]

# Raw attribute columns, used by the callback as the pool of axis choices and
# for the channel filter.
data = {name: x_train[name].tolist() for name in attr}
# Cluster label tables: 'k1' is KMeans, 'k2' is Agglomerative.
data['k1'] = k1
data['k2'] = k2

# Every slot starts empty (NaN points are not drawn) ...
for slot in range(1, len(cluster_colors) + 1):
    data['x%d' % slot] = [float('nan')] * n_points
    data['y%d' % slot] = [float('nan')] * n_points
# ... except slot 1, which initially holds all points (one cluster, Fresh vs Milk).
data['x1'] = list(data['Fresh'])
data['y1'] = list(data['Milk'])

source2 = ColumnDataSource(data=data)

plot2 = figure(plot_width=800, plot_height=600, x_axis_label='Fresh', y_axis_label='Milk')
plot2.background_fill_color = "lightgrey"
plot2.background_fill_alpha = 0.2

for slot, shade in enumerate(cluster_colors, start=1):
    plot2.circle('x%d' % slot, 'y%d' % slot, source=source2, size=5, color=shade, alpha=0.5)

x_select = Select(title="X Axis:", value="Fresh", options=attr[1:])
y_select = Select(title="Y Axis:", value="Milk", options=attr[1:])
t_select = Select(title="Clustering Algorithm:", value="KMeans", options=["KMeans", "Agglomerative"])
n_select = Select(title="Number of Clusters:", value="1", options=['1','2','3','4','5','6','7','8','9','10'])
filters = CheckboxGroup(
    labels=["Channel: Horeca", "Channel: Retail"],
    active=[0, 1],width=150)

# A single callback rebuilds the plot from the current state of ALL widgets.
# Whichever widget fires, the axis choice, algorithm, cluster count and channel
# filter are all applied together, so no selection is lost when another changes.
scatter_callback = CustomJS(
    args=dict(source=source2, x_select=x_select, y_select=y_select,
              t_select=t_select, n_select=n_select, filters=filters,
              x_axis=plot2.xaxis[0], y_axis=plot2.yaxis[0]),
    code="""
    var data = source.data;
    var n_slots = %d;
    var xs = data[x_select.value];
    var ys = data[y_select.value];
    // 'k1' holds the KMeans labels, 'k2' the Agglomerative labels.
    var labels = (t_select.value === "Agglomerative") ? data['k2'] : data['k1'];
    // Column n-1 of a label table holds the labels for n clusters.
    var column = parseInt(n_select.value, 10) - 1;
    var active = filters.active;
    for (var i = 0; i < xs.length; i++) {
        // Clear the point from every slot, then re-add it to the matching one.
        for (var slot = 1; slot <= n_slots; slot++) {
            data['x' + slot][i] = NaN;
            data['y' + slot][i] = NaN;
        }
        // Checkbox 0 is Horeca (Channel == 1), checkbox 1 is Retail (Channel == 2).
        if (active.indexOf(data['Channel'][i] - 1) === -1) {
            continue;
        }
        var hit = labels[i][column] + 1;
        data['x' + hit][i] = xs[i];
        data['y' + hit][i] = ys[i];
    }
    // Keep the axis labels in step with the selected attributes.
    x_axis.axis_label = x_select.value;
    y_axis.axis_label = y_select.value;
    source.change.emit();
""" % len(cluster_colors))

# The previous per-widget callback names remain available as aliases.
x_callback = y_callback = callback1 = callback2 = scatter_callback

x_select.js_on_change('value', scatter_callback)
y_select.js_on_change('value', scatter_callback)
t_select.js_on_change('value', scatter_callback)
n_select.js_on_change('value', scatter_callback)
filters.js_on_change('active', scatter_callback)

show(column(plot2,row(x_select, y_select), row(t_select, n_select, filters)))

In [32]:
# Scatter points are more interesting, and they look better.
# We also have more options now: we can choose any 2 attributes that catch our attention.
# But a scatter plot cannot show the relation among 3 or more attributes, so we should interpret it with care.