# Bokeh Kernel Density

In [95]:
# Import data
import pandas as pd
# Location of the cleaned business table, relative to this notebook.
BUSINESS_CSV_PATH = "../data/cleaned_businessV2.csv"
df_business = pd.read_csv(BUSINESS_CSV_PATH)

In [96]:
# Helper functions
from datetime import datetime, timedelta

def get_opening_float(time_interval):
    """Return the opening time of an "H:MM-H:MM" interval as fractional hours.

    Args:
        time_interval: String such as "7:30-21:00". Only the part before the
            first "-" is used, and it must contain exactly one ":".

    Returns:
        float: Hours plus minutes / 60, e.g. 7.5 for "7:30-21:00".
    """
    opening_part = time_interval.split("-")[0]
    hours_text, minutes_text = opening_part.split(":")
    return float(hours_text) + float(minutes_text) / 60.0

def get_closing_float(time_interval):
    """Return the closing time of an "H:MM-H:MM" interval as fractional hours.

    Args:
        time_interval: String such as "7:30-21:45". The part after the first
            "-" is used, and it must contain exactly one ":".

    Returns:
        float: Hours plus minutes / 60, e.g. 21.75 for "7:30-21:45".
    """
    closing_part = time_interval.split("-")[1]
    hours_text, minutes_text = closing_part.split(":")
    return float(hours_text) + float(minutes_text) / 60.0

def get_open_duration_float(time_interval):
    """Return how long a business stays open, in hours, for an "H:MM-H:MM" interval.

    A closing time that is not later than the opening time is taken to fall on
    the following day, so "22:00-2:00" gives 4.0 and "5:00-5:00" gives 24.0.

    Args:
        time_interval: String with exactly one "-" separating two "%H:%M" times.

    Returns:
        float: Length of the opening period in hours.
    """
    opens_text, closes_text = time_interval.split('-')
    opens_at = datetime.strptime(opens_text, "%H:%M")
    closes_at = datetime.strptime(closes_text, "%H:%M")

    # Roll the closing time over to the next day when it does not come after opening.
    if not closes_at > opens_at:
        closes_at = closes_at + timedelta(days=1)

    open_seconds = (closes_at - opens_at).total_seconds()
    return abs(open_seconds / 3600)

# Quick smoke test of the three helpers on an interval that spans a full day.
test_time_interval = "5:00-5:00"

for label, helper in (
    ("test_opening:", get_opening_float),
    ("get_closing_float:", get_closing_float),
    ("test_duration:", get_open_duration_float),
):
    print(label, helper(test_time_interval))



test_opening: 5.0
get_closing_float: 5.0
test_duration: 24.0


In [97]:
# Kinds of restaurants we are interested in. Hard-coded for now; this may later
# be replaced by a selection the user makes through the UI.
categories_of_interest = ['Chinese', 'Japanese', 'Italian', 'Polish', 'Scandinavian']

# Let pandas choose the best NA-aware dtype for every column (text becomes string dtype)
df_business = df_business.convert_dtypes()
#print(f"df_business.dtypes: \n{df_business.dtypes}")

# Create a new column holding the category of interest a business belongs to.
# Businesses matching none of them keep the label "Other"; when several match,
# the one listed last in categories_of_interest wins.
df_business['category_of_interest'] = "Other"
for item in categories_of_interest:
    # regex=False: the category names are literal text, not patterns.
    # na=False: a missing "categories" value never counts as a match.
    matches_item = df_business['categories'].str.contains(item, regex=False, na=False)
    df_business.loc[matches_item, 'category_of_interest'] = item

# Define the days of the week for iteration
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
rating_groups = ["Rating 1-2", "Rating 2-3", "Rating 3-4", "Rating 4-5"]

# include_lowest=True closes the first bin on the left, so a rating of exactly
# 1.0 lands in "Rating 1-2" instead of becoming NaN.
df_business["Rating_Group"] = pd.cut(
    df_business["stars"],
    bins=[1, 2, 3, 4, 5],
    labels=rating_groups,
    include_lowest=True,
)

# Drop the rows of businesses that are marked "Closed" on any weekday.
for day in weekdays:
    df_business = df_business[df_business["hours_" + day] != "Closed"]

# Work on an independent copy so the column assignments below do not write
# into a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df_business = df_business.copy()

for day in weekdays:
    # Create columns for our x-values
    df_business[day + "_Hour_Of_Opening_Float"] = df_business["hours_" + day].apply(get_opening_float)
    # Create columns for our y-values
    df_business[day + "_Open_Duration_Float"] = df_business["hours_" + day].apply(get_open_duration_float)

df_business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Wednesday_Hour_Of_Opening_Float,Wednesday_Open_Duration_Float,Thursday_Hour_Of_Opening_Float,Thursday_Open_Duration_Float,Friday_Hour_Of_Opening_Float,Friday_Open_Duration_Float,Saturday_Hour_Of_Opening_Float,Saturday_Open_Duration_Float,Sunday_Hour_Of_Opening_Float,Sunday_Open_Duration_Float
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,7.0,13.0,7.0,13.0,7.0,14.0,7.0,14.0,7.0,14.0
3,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4.0,65,...,16.0,8.0,12.0,12.0,12.0,14.0,11.0,15.0,11.0,13.0
9,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,Philadelphia,PA,19124,40.012141,-75.115015,1.5,15,...,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0


## Kernel Density Plot Using Bokeh

In [98]:
# Helper function to concatenate all the "<day>_Hour_Of_Opening_Float" columns into 
# one column called "Concatenated_Hour_Of_Opening_Float" 
# and do the same for "<day>_Open_Duration_Float". Return a df with columns 
# "Concatenated_Hour_Of_Opening_Float", "Concatenated_Open_Duration_Float".

# The idea is that the kernel density should be computed only from the points that
# are actually visible on the scatterplot. In other words: we do not wish
# to calculate a kernel density plot based on data points we have filtered away or
# turned off the visibility of.
def get_concatenated_x_and_y_from_days(df_source, days):
    """Stack the per-day opening-hour and open-duration columns of the given days.

    For every name in ``days`` the columns ``<day>_Hour_Of_Opening_Float`` and
    ``<day>_Open_Duration_Float`` are read from ``df_source`` and placed one
    after another, in the order given by ``days``.

    Args:
        df_source: DataFrame holding the per-day columns. It is only read,
            never modified.
        days: Sequence of day names, e.g. ["Saturday", "Sunday"].

    Returns:
        DataFrame with the columns "Concatenated_Hour_Of_Opening_Float" and
        "Concatenated_Open_Duration_Float". The row labels of ``df_source`` are
        kept, so each label appears once per requested day. An empty ``days``
        yields an empty frame with the same two columns.

    Raises:
        KeyError: If a requested day's columns are missing from ``df_source``.
    """
    x_column = 'Concatenated_Hour_Of_Opening_Float'
    y_column = 'Concatenated_Open_Duration_Float'

    days = list(days)
    if not days:
        # Nothing selected: hand back an empty frame rather than failing on days[0].
        return pd.DataFrame({x_column: pd.Series(dtype="float64"),
                             y_column: pd.Series(dtype="float64")})

    # One concat per axis builds the result in a single pass; pd.concat returns
    # new objects, so no copy of the whole source frame is needed.
    series_x = pd.concat([df_source[day + "_Hour_Of_Opening_Float"] for day in days])
    series_y = pd.concat([df_source[day + "_Open_Duration_Float"] for day in days])
    return pd.DataFrame({x_column: series_x, y_column: series_y})

# Show the two days' columns next to the stacked result for a three-row sample.
test_data = df_business.head(3)
test_days = ["Saturday", "Sunday"]
first_day, second_day = test_days

for position, suffix in enumerate(("_Hour_Of_Opening_Float", "_Open_Duration_Float")):
    if position > 0:
        print("-------------------------------------------------------------")
    print(test_data[first_day + suffix])
    print("should be concatenated with")
    print(test_data[second_day + suffix])

print("Test result:")
test = get_concatenated_x_and_y_from_days(test_data, test_days)
print(test)

0     7.0
3    11.0
9    10.0
Name: Saturday_Hour_Of_Opening_Float, dtype: float64
should be concatenated with
0     7.0
3    11.0
9    10.0
Name: Sunday_Hour_Of_Opening_Float, dtype: float64
-------------------------------------------------------------
0    14.0
3    15.0
9    16.0
Name: Saturday_Open_Duration_Float, dtype: float64
should be concatenated with
0    14.0
3    13.0
9    16.0
Name: Sunday_Open_Duration_Float, dtype: float64
Test result:
   Concatenated_Hour_Of_Opening_Float  Concatenated_Open_Duration_Float
0                                 7.0                              14.0
3                                11.0                              15.0
9                                10.0                              16.0
0                                 7.0                              14.0
3                                11.0                              13.0
9                                10.0                              16.0


In [None]:
from bokeh.io import output_notebook

import numpy as np
from scipy.stats import gaussian_kde

from bokeh.palettes import Blues9
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg import autompg as df

# Configure Bokeh to render plots inline in the notebook when show() is called.
output_notebook()

def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate of (x, y) on an N x N grid.

    Args:
        x: 1-D samples for the horizontal axis (must provide .min() / .max()).
        y: 1-D samples for the vertical axis, same length as ``x``.
        N: Number of grid points along each axis.

    Returns:
        Tuple ``(X, Y, Z)``: ``X`` and ``Y`` are the np.mgrid coordinate arrays
        spanning the data range (both ends included) and ``Z`` is the estimated
        density at every grid point, with the same shape as ``X``.
    """
    x_lo, x_hi = x.min(), x.max()
    y_lo, y_hi = y.min(), y.max()

    # A complex step makes mgrid return N points per axis, endpoints included.
    steps = N * 1j
    X, Y = np.mgrid[x_lo:x_hi:steps, y_lo:y_hi:steps]

    estimator = gaussian_kde(np.vstack([x, y]))
    grid_points = np.vstack([X.ravel(), Y.ravel()])
    Z = estimator(grid_points).reshape(X.shape)

    return X, Y, Z

# Nine-step sequential colour palettes taken from ColorBrewer, one per hue.
cb_greens = [
    '#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b', '#74c476',
    '#41ab5d', '#238b45', '#006d2c', '#00441b',
]
cb_blues = [
    '#f7fbff', '#deebf7', '#c6dbef', '#9ecae1', '#6baed6',
    '#4292c6', '#2171b5', '#08519c', '#08306b',
]
cb_purples = [
    '#fcfbfd', '#efedf5', '#dadaeb', '#bcbddc', '#9e9ac8',
    '#807dba', '#6a51a3', '#54278f', '#3f007d',
]
cb_oranges = [
    '#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b', '#fd8d3c',
    '#f16913', '#d94801', '#a63603', '#7f2704',
]

def kde_plot(fig, x, y, color_palette, grid_size=100, n_levels=10):
    """Draw a kernel-density contour plot of (x, y) onto ``fig``.

    Args:
        fig: Figure to draw on; its ``contour`` method is called.
        x: Samples for the horizontal axis, passed on to ``kde``.
        y: Samples for the vertical axis, passed on to ``kde``.
        color_palette: Sequence of colours used for both the fill and the
            contour lines.
        grid_size: Number of grid points per axis on which the density is
            evaluated. Defaults to 100, the value that used to be hard-coded.
        n_levels: Number of evenly spaced density levels between the minimum
            and maximum density. Defaults to 10, the previous hard-coded value.

    Returns:
        Whatever ``fig.contour`` returns for the drawn contours.
    """
    grid_x, grid_y, density = kde(x, y, grid_size)
    levels = np.linspace(np.min(density), np.max(density), n_levels)
    # The lowest level (the minimum density) is dropped, so contours start at
    # the second level.
    contour = fig.contour(grid_x, grid_y, density, levels[1:],
                          fill_color=color_palette,
                          line_color=color_palette)
    return contour

# Empty canvas for the density plot; no tools and no toolbar are shown.
fig = figure(
    title="Opening Hours vs Hours Remained Open",
    x_axis_label="Hour of Opening",
    y_axis_label="Duration",
    height=400,
    background_fill_color="white",
    tools="",
    toolbar_location=None,
)

# TODO: Let the values of the list "test_days" be controlled by the widgets
test_days = ["Monday", "Sunday"]
df_concatenated_hours_and_durations = get_concatenated_x_and_y_from_days(df_business, test_days)

# Opening hour goes on the x-axis, open duration on the y-axis.
opening_hours = df_concatenated_hours_and_durations["Concatenated_Hour_Of_Opening_Float"]
open_durations = df_concatenated_hours_and_durations["Concatenated_Open_Duration_Float"]
c_plot = kde_plot(fig=fig, x=opening_hours, y=open_durations, color_palette=cb_greens)

# Faint black grid lines, drawn on the overlay level.
fig.grid.grid_line_alpha = 0.05
fig.grid.grid_line_color = "black"
fig.grid.level = "overlay"

# Legend handling is not enabled yet. The lines below are kept for later and
# still refer to a figure called `p`; they would need to use `fig` instead.
# Make the legend interactive. Hide data of a certain legend item upon click.
#p.legend.click_policy = "hide"

# Add the legend as a layout item to the right of the plot.
#p.add_layout(p.legend, 'right')

show(fig)

In [100]:
import pandas as pd

# Toy frame: two numeric columns (A, B) and two text columns (C, D)
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': ["H", "I", "J"],
    'D': ["K", "L", "M"],
})

# Stack B underneath A, and D underneath C, renumbering the rows from 0
df_X = pd.concat([df[column] for column in ('A', 'B')], ignore_index=True)
df_Y = pd.concat([df[column] for column in ('C', 'D')], ignore_index=True)

# Pair the two stacked series up as the columns X and Y
df = pd.DataFrame({'X': df_X, 'Y': df_Y})

print(df)

   X  Y
0  1  H
1  2  I
2  3  J
3  4  K
4  5  L
5  6  M


In [101]:
import pandas as pd

# Sample DataFrame with one "<day>_Openings" column per day
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'Monday_Openings': ["H", "I", "J"],
    'Tuesday_Openings': ["K", "L", "M"],
})

wd = ["Monday", "Tuesday"]

# Stack the selected per-day columns with a single concat call. Seeding the
# accumulator with a dtype-less, empty pd.Series() and concatenating onto it
# in a loop triggers pandas deprecation warnings and re-copies the data on
# every iteration. The original row labels are kept (0, 1, 2, 0, 1, 2).
series_acc = pd.concat([df[day + "_Openings"] for day in wd])

print(series_acc)


0    H
1    I
2    J
0    K
1    L
2    M
dtype: object
