# Bokeh Kernel Density

In [4]:
# Import data
import pandas as pd
from bokeh.io import output_notebook
# Load the cleaned business table; the path is relative to the notebook's working directory.
df_business = pd.read_csv("../data/cleaned_businessV2.csv")
# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [5]:
# Helper functions
from datetime import datetime, timedelta

def get_opening_float(time_interval):
    """Return the opening time of an "H:MM-H:MM" interval as fractional hours.

    Only the part before the first "-" is read, e.g. "7:30-21:00" -> 7.5.
    """
    start_part = time_interval.split("-")[0]
    hour_text, minute_text = start_part.split(":")
    return float(hour_text) + float(minute_text) / 60.0

def get_closing_float(time_interval):
    """Return the closing time of an "H:MM-H:MM" interval as fractional hours.

    Only the part after the first "-" is read, e.g. "7:30-21:45" -> 21.75.
    """
    end_part = time_interval.split("-")[1]
    hour_text, minute_text = end_part.split(":")
    return float(hour_text) + float(minute_text) / 60.0

def get_open_duration_float(time_interval):
    """Return how many hours an "H:MM-H:MM" interval spans, as a float.

    An interval whose closing time is not later than its opening time is
    treated as running past midnight, so "22:00-2:30" gives 4.5 and an
    interval with identical start and end (e.g. "5:00-5:00") gives 24.0.
    """
    opens_text, closes_text = time_interval.split('-')
    opens_at = datetime.strptime(opens_text, "%H:%M")
    closes_at = datetime.strptime(closes_text, "%H:%M")

    elapsed = closes_at - opens_at
    # Zero or negative span means the closing time belongs to the next day.
    if elapsed <= timedelta(0):
        elapsed += timedelta(days=1)

    return elapsed.total_seconds() / 3600

test_time_interval = "5:00-5:00"

# Run the same interval through all three helpers. Identical start and end
# times are expected to count as a full 24-hour day.
for label, helper in (
    ("test_opening:", get_opening_float),
    ("get_closing_float:", get_closing_float),
    ("test_duration:", get_open_duration_float),
):
    print(label, helper(test_time_interval))



test_opening: 5.0
get_closing_float: 5.0
test_duration: 24.0


In [6]:
# Define kinds of restaurants we are interested in. May need to delete this later
# to allow the user to define this with UI
categories_of_interest = ['Chinese', 'Japanese', 'Italian', 'Polish', 'Scandinavian']

# Let pandas infer the best NA-aware dtype for every column (e.g. object -> string)
df_business = df_business.convert_dtypes()
#print(f"df_business.dtypes: \n{df_business.dtypes}")

# Create new column containing a specific category of interest.
# If not in interest, label the column value "Other".
# A business matching several categories keeps the last match in the list.
df_business['category_of_interest'] = "Other"
for item in categories_of_interest:
    # regex=False: category names are literal text, not patterns.
    # na=False: a missing "categories" value counts as "no match" so the
    # boolean mask never contains NA.
    is_match = df_business['categories'].str.contains(item, regex=False, na=False)
    df_business.loc[is_match, 'category_of_interest'] = item

# Define the days of the weeks for iteration
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
rating_groups = ["Rating 1-2", "Rating 2-3", "Rating 3-4", "Rating 4-5"]

# pd.cut bins are right-closed by default, i.e. (1, 2], (2, 3], ... so a rating
# of exactly 1.0 would fall outside every bin and become NaN.
# include_lowest=True closes the first bin on the left so 1.0 lands in "Rating 1-2".
df_business["Rating_Group"] = pd.cut(df_business["stars"], bins=[1, 2, 3, 4, 5],
                                     labels=rating_groups, include_lowest=True)

# Drop the rows (businesses) that are closed on at least one weekday
for day in weekdays:
    df_business = df_business[df_business["hours_" + day] != "Closed"]

# Work on an explicit copy so that the column assignments below do not write
# into a view of the unfiltered frame (avoids SettingWithCopyWarning).
df_business = df_business.copy()

for day in weekdays:
    # Create columns for our x-values
    df_business[day + "_Hour_Of_Opening_Float"] = df_business["hours_" + day].apply(get_opening_float)
    # Create columns for our y-values
    df_business[day + "_Open_Duration_Float"] = df_business["hours_" + day].apply(get_open_duration_float)

df_business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Wednesday_Hour_Of_Opening_Float,Wednesday_Open_Duration_Float,Thursday_Hour_Of_Opening_Float,Thursday_Open_Duration_Float,Friday_Hour_Of_Opening_Float,Friday_Open_Duration_Float,Saturday_Hour_Of_Opening_Float,Saturday_Open_Duration_Float,Sunday_Hour_Of_Opening_Float,Sunday_Open_Duration_Float
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,7.0,13.0,7.0,13.0,7.0,14.0,7.0,14.0,7.0,14.0
3,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4.0,65,...,16.0,8.0,12.0,12.0,12.0,14.0,11.0,15.0,11.0,13.0
9,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,Philadelphia,PA,19124,40.012141,-75.115015,1.5,15,...,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0


## Kernel Density Plot Using Bokeh

In [7]:
# Helper function to concatenate all the "<day>_Hour_Of_Opening_Float" columns into 
# one column called "Concatenated_Hour_Of_Opening_Float" 
# and do the same for "<day>_Open_Duration_Float". Return a df with columns 
# "Concatenated_Hour_Of_Opening_Float", "Concatenated_Open_Duration_Float".

# The idea is that we only want to create a kernel density plot on the background
# of the points that we are seeing on the scatterplot. In other words: we do not wish
# to calculate a kernel density plot based on data points we have filtered away or
# turned off the visibility of.
def get_concatenated_x_and_y_from_days(df_source, days):
    """Stack the per-day opening-hour and open-duration columns of ``days``.

    For every day in ``days`` the columns "<day>_Hour_Of_Opening_Float" and
    "<day>_Open_Duration_Float" of ``df_source`` are read and appended one
    after another, in the order given by ``days``.

    Args:
        df_source: DataFrame containing the per-day columns named above.
        days: iterable of day names, e.g. ["Saturday", "Sunday"].

    Returns:
        A DataFrame with the columns "Concatenated_Hour_Of_Opening_Float" and
        "Concatenated_Open_Duration_Float". The original row labels are kept,
        so they repeat once per day. With no days an empty frame holding the
        same two (float) columns is returned.

    Raises:
        KeyError: if a requested per-day column is missing from ``df_source``.
    """
    x_name = 'Concatenated_Hour_Of_Opening_Float'
    y_name = 'Concatenated_Open_Duration_Float'

    days = list(days)
    if not days:
        # Nothing selected: return an empty result instead of failing on days[0].
        return pd.DataFrame({x_name: pd.Series(dtype="float64"),
                             y_name: pd.Series(dtype="float64")})

    # A single concat per axis instead of growing the series inside a loop.
    stacked_x = pd.concat([df_source[day + "_Hour_Of_Opening_Float"] for day in days])
    stacked_y = pd.concat([df_source[day + "_Open_Duration_Float"] for day in days])
    return pd.DataFrame({x_name: stacked_x, y_name: stacked_y})

test_days = ["Saturday", "Sunday"]
test_data = df_business.head(3)

# Show, for each kind of column, the two per-day inputs that the helper is
# expected to stack on top of each other.
first_day, second_day = test_days
for position, suffix in enumerate(("_Hour_Of_Opening_Float", "_Open_Duration_Float")):
    if position > 0:
        print("-------------------------------------------------------------")
    print(test_data[first_day + suffix])
    print("should be concatenated with")
    print(test_data[second_day + suffix])

print("Test result:")
test = get_concatenated_x_and_y_from_days(test_data, test_days)
print(test)

0     7.0
3    11.0
9    10.0
Name: Saturday_Hour_Of_Opening_Float, dtype: float64
should be concatenated with
0     7.0
3    11.0
9    10.0
Name: Sunday_Hour_Of_Opening_Float, dtype: float64
-------------------------------------------------------------
0    14.0
3    15.0
9    16.0
Name: Saturday_Open_Duration_Float, dtype: float64
should be concatenated with
0    14.0
3    13.0
9    16.0
Name: Sunday_Open_Duration_Float, dtype: float64
Test result:
   Concatenated_Hour_Of_Opening_Float  Concatenated_Open_Duration_Float
0                                 7.0                              14.0
3                                11.0                              15.0
9                                10.0                              16.0
0                                 7.0                              14.0
3                                11.0                              13.0
9                                10.0                              16.0


### Plot without Widgets

In [None]:
from bokeh.io import output_notebook

import numpy as np
from scipy.stats import gaussian_kde

from bokeh.palettes import Blues9
from bokeh.plotting import figure, show

def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate of (x, y) on a grid.

    The grid has N points per axis and spans from the minimum to the maximum
    of ``x`` and of ``y`` (both ends included).

    Returns:
        (X, Y, Z): the two grid coordinate arrays produced by ``np.mgrid`` and
        the estimated density at every grid point, all of shape ``X.shape``.
    """
    # A complex step tells np.mgrid to produce N inclusive points per axis.
    grid_steps = N * 1j
    X, Y = np.mgrid[x.min():x.max():grid_steps, y.min():y.max():grid_steps]

    grid_points = np.vstack([X.ravel(), Y.ravel()])
    density = gaussian_kde(np.vstack([x, y]))
    Z = density(grid_points).T.reshape(X.shape)

    return X, Y, Z

# Color palettes from colorbrewer: each palette is a one-element list holding
# a single colour.
cb_blues, cb_purples, cb_oranges, cb_greens = (
    ['#08306b'],
    ['#3f007d'],
    ['#7f2704'],
    ['#00441b'],
)

# Palettes in the order in which they are handed out to the plotted groups.
colors = [cb_blues, cb_purples, cb_oranges, cb_greens]

def kde_plot(fig, x, y, color_palette, n_levels=10, grid_size=100):
    """Draw kernel-density contour lines for the (x, y) sample on ``fig``.

    Args:
        fig: Bokeh figure to draw on; its ``contour`` method is called.
        x: x-coordinates of the sample, passed on to ``kde``.
        y: y-coordinates of the sample, passed on to ``kde``.
        color_palette: value passed as ``line_color`` of the contour lines.
        n_levels: number of evenly spaced density values between the minimum
            and maximum density. The lowest one is skipped, so
            ``n_levels - 1`` contour levels are requested. Defaults to 10.
        grid_size: number of grid points per axis used for the density
            estimate. Defaults to 100.

    Returns:
        The object returned by ``fig.contour``. Only contour lines are drawn;
        no fill colour is set.
    """
    grid_x, grid_y, density = kde(x, y, grid_size)
    levels = np.linspace(np.min(density), np.max(density), n_levels)
    # levels[0] is the density minimum itself, so it is left out.
    return fig.contour(grid_x, grid_y, density, levels[1:],
                       line_color=color_palette)

fig = figure(height=400, x_axis_label="Hour of Opening", y_axis_label="Duration",
           background_fill_color="white", tools="", toolbar_location=None,
           title="Opening Hours vs Hours Remained Open")

# TODO: Let the values of the list "test_days" be controlled by the widgets 
test_days = ["Monday"]
test_rating_groups = ["Rating 1-2", "Rating 4-5"]
#test_rating_groups = ["Rating 1-2", "Rating 2-3", "Rating 3-4", "Rating 4-5"]

# One set of contour lines per selected rating group. The loop variable has
# its own name so that "test_rating_groups" is still the list after the loop.
# Palettes are assigned by position in "test_rating_groups".
for i, rating_group in enumerate(test_rating_groups):
    df_filtered = df_business[df_business['Rating_Group'] == rating_group]
    df_concatenated_hours_and_durations = get_concatenated_x_and_y_from_days(df_filtered, test_days)
    contour = kde_plot(fig=fig, 
                      x = df_concatenated_hours_and_durations["Concatenated_Hour_Of_Opening_Float"], 
                      y = df_concatenated_hours_and_durations["Concatenated_Open_Duration_Float"], 
                      color_palette=colors[i])
    contour.name = "Contour_" + rating_group

# Draw a faint grid on top of the contours.
fig.grid.level = "overlay"
fig.grid.grid_line_color = "black"
fig.grid.grid_line_alpha = 0.05

# TODO: add a legend as a layout item to the right of the plot.

show(fig)

### Plot with Widgets

In [16]:
from bokeh.io import output_notebook

import numpy as np
from scipy.stats import gaussian_kde
from bokeh.models import ColumnDataSource, CustomJS, Button, Select, Legend, LegendItem
from bokeh.palettes import Blues9
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg import autompg as df
from bokeh.models import CheckboxGroup, CustomJS
from bokeh.models.filters import CustomJSFilter
from bokeh.models import CDSView
from bokeh.layouts import row
from bokeh.plotting import figure, save, output_file, curdoc
from bokeh.palettes import Colorblind  # For Colorblind palette

# Maps a rating-group label to a list holding the contour renderer drawn for
# that group; filled while the contours are plotted further down in this cell.
contours_dict = {}

# Define colors: four entries picked from the 8-colour Colorblind palette,
# one per rating group.
cb = Colorblind[8]
chosen_cb_colors = [cb[0], cb[1], cb[3], cb[6]]

def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate of (x, y) on a grid.

    The grid has N points per axis and spans from the minimum to the maximum
    of ``x`` and of ``y`` (both ends included).

    Returns:
        (X, Y, Z): the two grid coordinate arrays produced by ``np.mgrid`` and
        the estimated density at every grid point, all of shape ``X.shape``.
    """
    # A complex step tells np.mgrid to produce N inclusive points per axis.
    grid_steps = N * 1j
    X, Y = np.mgrid[x.min():x.max():grid_steps, y.min():y.max():grid_steps]

    grid_points = np.vstack([X.ravel(), Y.ravel()])
    density = gaussian_kde(np.vstack([x, y]))
    Z = density(grid_points).T.reshape(X.shape)

    return X, Y, Z

def kde_plot(fig, x, y, color_palette, n_levels=6, grid_size=100):
    """Draw kernel-density contour lines for the (x, y) sample on ``fig``.

    Args:
        fig: Bokeh figure to draw on; its ``contour`` method is called.
        x: x-coordinates of the sample, passed on to ``kde``.
        y: y-coordinates of the sample, passed on to ``kde``.
        color_palette: value passed as ``line_color`` of the contour lines.
        n_levels: number of evenly spaced density values between the minimum
            and maximum density. The lowest one is skipped, so
            ``n_levels - 1`` contour levels are requested. Defaults to 6.
        grid_size: number of grid points per axis used for the density
            estimate. Defaults to 100.

    Returns:
        The object returned by ``fig.contour``. Only contour lines are drawn;
        no fill colour is set.
    """
    grid_x, grid_y, density = kde(x, y, grid_size)
    levels = np.linspace(np.min(density), np.max(density), n_levels)
    # levels[0] is the density minimum itself, so it is left out.
    return fig.contour(grid_x, grid_y, density, levels[1:],
                       line_color=color_palette)

# Set up data sources
source = ColumnDataSource(df_business)

# Checkboxes for rating groups and weekdays. Only the first and the last
# rating group start checked; every weekday starts checked.
checkboxes_rating_groups = CheckboxGroup(labels=rating_groups, active=[0,3])
checkboxes_weekdays = CheckboxGroup(labels=weekdays, active=list(range(0, len(weekdays))))

# TODO: the weekday checkboxes do not recompute the contours yet. The callback
# only emits a change on "source", while the contours below are computed once
# in Python from "test_days".
checkboxes_weekdays.js_on_change('active', CustomJS(args=dict(source = source), code="source.change.emit();"))

fig = figure(height=400, x_axis_label="Hour of Opening", y_axis_label="Duration",
            x_range = (0,25), y_range = (0,25),
            background_fill_color="white",
            title="Opening Hours vs Opening Duration Density")

# TODO: Let the values of the list "test_days" be controlled by the widgets 
test_days = ["Monday"]
for i, rating_group in enumerate(rating_groups):
    df_filtered = df_business[df_business['Rating_Group'] == rating_group]
    df_concatenated_hours_and_durations = get_concatenated_x_and_y_from_days(df_filtered, test_days)
    contour = kde_plot(fig=fig, 
                      x = df_concatenated_hours_and_durations["Concatenated_Hour_Of_Opening_Float"], 
                      y = df_concatenated_hours_and_durations["Concatenated_Open_Duration_Float"], 
                      color_palette=chosen_cb_colors[i])
    contour.name = "Contour_"+ rating_group
    # Start with the visibility that matches the initial checkbox state, so
    # the plot and the checkboxes agree before the first click.
    contour.visible = i in checkboxes_rating_groups.active
    contours_dict.setdefault(rating_group, []).append(contour)

for key, value in contours_dict.items():
    print(key, value[0].name, value[0])

# Make rating group checkboxes toggle the visibility of the matching contour.
# The renderers are handed to JavaScript as a list in checkbox order, so that
# index i of "active" always refers to the i-th rating group regardless of how
# dictionary keys are ordered on the JavaScript side.
contours_in_checkbox_order = [contours_dict[rating_group][0] for rating_group in rating_groups]
checkboxes_rating_groups.js_on_change("active", CustomJS(args=dict(contours=contours_in_checkbox_order, fig=fig), code="""
    const active = cb_obj.active;

    for (let i = 0; i < contours.length; i++) {
        contours[i].visible = active.includes(i);
    }
    // This is what updates the view, such that when a checkbox is pressed, the figure is re-rendered.
    // NOTE(review): emitting "reset" presumably also resets pan/zoom - confirm.
    fig.reset.emit();
"""))

# Draw a faint grid on top of the contours.
fig.grid.level = "overlay"
fig.grid.grid_line_color = "black"
fig.grid.grid_line_alpha = 0.05

# TODO: add a legend as a layout item to the right of the plot.

output_file("kernel-density-plot-toggleable.html")
show(row(fig, checkboxes_rating_groups, checkboxes_weekdays))



Rating 1-2 Contour_Rating 1-2 ContourRenderer(id='p2886', ...)
Rating 2-3 Contour_Rating 2-3 ContourRenderer(id='p2904', ...)
Rating 3-4 Contour_Rating 3-4 ContourRenderer(id='p2922', ...)
Rating 4-5 Contour_Rating 4-5 ContourRenderer(id='p2940', ...)
