In [194]:
import pandas, numpy, tensorflow, math
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import widgetbox, column, gridplot, row, layout
from bokeh.models import Select, CustomJS, ColumnDataSource, Circle, Line, HoverTool
from bokeh.models.widgets import CheckboxGroup, DataTable, DateFormatter, TableColumn

In [195]:
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [196]:
# Prepare dataset
#
# Two CSV files are loaded: the complete data (`origin_df`) and a copy with
# missing values (`missing_df`). Each is turned into a plain dict of columns
# that backs a Bokeh ColumnDataSource.
attr = ['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']
region = ['Lisbon', 'Oporto', 'Other']
channel = ['Horeca', 'Retail']
origin_df = pandas.read_csv("Wholesale customers data.csv")
missing_df = pandas.read_csv("Wholesale customers data-missing.csv")


def _as_source_columns(frame):
    """Build the column dict for a ColumnDataSource from ``frame``.

    The result holds:
      * 'x'         -- 1-based row index, sized from the frame itself rather
                       than a hard-coded row count;
      * '<name>'    -- the values that are displayed (the JS callbacks
                       overwrite these in place when filtering);
      * '_<name>'   -- an independent, untouched copy used by the callbacks
                       to restore the displayed values.
    """
    columns = {'x': list(range(1, len(frame) + 1))}
    for name in attr:
        # Two separate tolist() calls: the display column and its backup must
        # not share the same list object.
        columns[name] = frame[name].tolist()
        columns['_' + name] = frame[name].tolist()
    return columns


missing_source = _as_source_columns(missing_df)
origin_source = _as_source_columns(origin_df)

# Kept for backward compatibility with code that refers to the shared index.
x = missing_source['x']

source1 = ColumnDataSource(data=missing_source)
source2 = ColumnDataSource(data=origin_source)

In [197]:
# First, get a general view of the whole dataset: draw every spending
# attribute against the row index and look for relations between attributes.
# The data can be divided by region and by channel with the checkboxes;
# the interaction is implemented with a JavaScript callback.

plot = figure(plot_width=800, plot_height=400,
              x_axis_label='Index', y_axis_label='Value')
# Axis ranges are fixed by hand for this dataset.
plot.x_range.start = 0
plot.x_range.end = 440
plot.y_range.start = 0
plot.y_range.end = 120000
plot.xgrid.visible = False

# (source column, colour, legend text) for each spending attribute.
series_styles = [
    ('Fresh', "red", 'FRESH'),
    ('Milk', "gold", 'MILK'),
    ('Grocery', "aqua", 'GROCERY'),
    ('Frozen', "green", 'FROZEN'),
    ('Detergents_Paper', "navy", 'DETERGENTS PAPER'),
    ('Delicassen', "purple", 'DELICASSEN'),
]
for column_name, colour, legend_text in series_styles:
    plot.circle('x', column_name, source=source1, size=5,
                color=colour, alpha=0.5, legend=legend_text)

plot.legend.location = "top_left"
plot.legend.click_policy = "hide"

# Checkbox callback. For every row, the displayed values are restored from the
# '_<name>' backup columns, unless the row belongs to a region/channel whose
# checkbox is unchecked, in which case they are set to NaN so the glyph is not
# drawn. All JS variables are declared with `var`: undeclared assignments
# create globals shared between callbacks and fail in strict mode.
callback = CustomJS(args=dict(source=source1), code="""
    var data = source.data;
    var columns = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen'];

    // Checkbox index -> [column, code] of the rows that checkbox controls.
    var rules = [
        ['Region', 1],
        ['Region', 2],
        ['Region', 3],
        ['Channel', 1],
        ['Channel', 2]
    ];
    var active = cb_obj.active;
    var n = data['x'].length;

    for (var i = 0; i < n; i++) {
        // A row is hidden if any unchecked box matches it.
        var hidden = false;
        for (var j = 0; j < rules.length; j++) {
            if (active.indexOf(j) == -1 && data[rules[j][0]][i] == rules[j][1]) {
                hidden = true;
                break;
            }
        }
        for (var c = 0; c < columns.length; c++) {
            var name = columns[c];
            data[name][i] = hidden ? NaN : data['_' + name][i];
        }
    }
    source.change.emit();
""")

filters = CheckboxGroup(
    labels=["Region: Lisbon", "Region: Oporto", "Region: Other", "Channel: Horeca", "Channel: Retail"],
    active=[0, 1, 2, 3, 4], width=150)
filters.js_on_change('active', callback)

show(row(filters, plot))

In [198]:
# Clicking a legend entry hides/shows the selected type of data,
# and the checkboxes filter the points by region and channel.
# However, the points still look messy...
# I cannot tell what relations they may have...

In [None]:
# Next, reduce the scope: look for the relation between two attributes at a time.

plot2 = figure(plot_width=600, plot_height=600)

# The glyph is bound to the 'Fresh' (x) and 'Milk' (y) columns; the callback
# below rewrites those two columns in place, so they act as display buffers
# for whichever attributes are currently selected.
plot2.circle('Fresh', 'Milk', source=source1, size=5, color="blue", alpha=0.5)

x_select = Select(title="X Axis:", value="Fresh", options=attr[2:])
y_select = Select(title="Y Axis:", value="Milk", options=attr[2:])

filters2 = CheckboxGroup(
    labels=["Region: Lisbon", "Region: Oporto", "Region: Other", "Channel: Horeca", "Channel: Retail"],
    active=[0, 1, 2, 3, 4], width=150)

# One callback serves all three widgets. It reads the selected attributes
# directly from the Select widgets and the active filters from the checkbox
# group, then recomputes both display columns from the untouched '_<name>'
# backups. This replaces the earlier approach of guessing the selected
# attribute by comparing displayed values against every backup column, which
# broke when the displayed column was entirely NaN or when two attributes
# shared a value, and it keeps the filters applied after an axis change.
callback2 = CustomJS(
    args=dict(source=source1, x_select=x_select, y_select=y_select, filters=filters2),
    code="""
    var data = source.data;
    var xs = data['Fresh'];                   // display buffer for the x axis
    var ys = data['Milk'];                    // display buffer for the y axis
    var x_src = data['_' + x_select.value];   // backup of the selected x attribute
    var y_src = data['_' + y_select.value];   // backup of the selected y attribute

    // Checkbox index -> [column, code] of the rows that checkbox controls.
    var rules = [
        ['Region', 1],
        ['Region', 2],
        ['Region', 3],
        ['Channel', 1],
        ['Channel', 2]
    ];
    var active = filters.active;

    for (var i = 0; i < xs.length; i++) {
        // A row is hidden if any unchecked box matches it.
        var hidden = false;
        for (var j = 0; j < rules.length; j++) {
            if (active.indexOf(j) == -1 && data[rules[j][0]][i] == rules[j][1]) {
                hidden = true;
                break;
            }
        }
        xs[i] = hidden ? NaN : x_src[i];
        ys[i] = hidden ? NaN : y_src[i];
    }
    source.change.emit();
""")

# The original per-widget callback names are kept as aliases of the shared one.
x_callback = callback2
y_callback = callback2

x_select.js_on_change('value', x_callback)
y_select.js_on_change('value', y_callback)
filters2.js_on_change('active', callback2)

show(row(column(x_select, y_select, filters2), plot2))

In [210]:
# Hmm...
# It still looks hard to identify any clear relation between them...

In [204]:
# Alright, what about just finding the missing data points first, together
# with their true ("fact") values taken from the complete dataset?


def _code_label(labels, code):
    """Return the label for a 1-based category ``code``.

    Returns 'Unknown' when the code itself is NaN. The code is cast to int
    because a column that contains any NaN is read as floats, and a float
    cannot be used as a list index.
    """
    if math.isnan(code):
        return 'Unknown'
    return labels[int(code) - 1]


missing_points = {'type': [], 'index': [], 'region': [], 'channel': [], 'fact': []}
for name in attr:
    # Iterate over the data itself instead of a hard-coded row count.
    for j, value in enumerate(missing_source[name]):
        if math.isnan(value):
            missing_points['type'].append(name)
            # NOTE(review): this is the 0-based row position, whereas the
            # plots use the 1-based 'x' column -- confirm which is intended.
            missing_points['index'].append(j)
            missing_points['region'].append(_code_label(region, missing_source['Region'][j]))
            missing_points['channel'].append(_code_label(channel, missing_source['Channel'][j]))
            missing_points['fact'].append(origin_source[name][j])

source3 = ColumnDataSource(missing_points)

columns = [
    TableColumn(field="type", title="Type"),
    TableColumn(field="index", title="Index"),
    TableColumn(field="region", title="Region"),
    TableColumn(field="channel", title="Channel"),
    TableColumn(field="fact", title="Fact Value"),
]
data_table = DataTable(source=source3, columns=columns, height=200)

show(widgetbox(data_table))