In [121]:
# Import data
import pandas as pd
df_business = pd.read_csv("../data/cleaned_businessV2.csv")

# Prepare

In [122]:
# Helper functions
from datetime import datetime, timedelta

def get_opening_float(time_interval):
    opening_time = time_interval.split("-")
    opening_hour, opening_minute = opening_time[0].split(":")
    opening_time_float = float(opening_hour) + float(opening_minute) / 60.0
    return opening_time_float

def get_closing_float(time_interval):
    opening_time = time_interval.split("-")
    closing_hour, closing_minute = opening_time[1].split(":")
    closing_time_float = float(closing_hour) + float(closing_minute) / 60.0
    return closing_time_float

def get_open_duration_float(time_interval):
    # Split the string into start and end times
    start_time_str, end_time_str = time_interval.split('-')

    # Convert the strings to datetime objects
    start_time = datetime.strptime(start_time_str, "%H:%M")
    end_time = datetime.strptime(end_time_str, "%H:%M")
    
    # Adjust for intervals that cross midnight
    if end_time <= start_time:
        end_time += timedelta(days=1)

    # Calculate the difference in hours and return as a float
    time_difference = end_time - start_time

    hours = time_difference.total_seconds() / 3600
    return abs(hours)

test_time_interval = "5:00-5:00"

print("test_opening:", get_opening_float(test_time_interval))
print("get_closing_float:", get_closing_float(test_time_interval))
print("test_duration:", get_open_duration_float(test_time_interval))



test_opening: 5.0
get_closing_float: 5.0
test_duration: 24.0


In [123]:
# Define kinds of restaurants we are interested in. May need to delete this later
# to allow the user to define this with UI
categories_of_interest = ['Chinese', 'Japanese', 'Italian', 'Polish', 'Scandinavian']

# Convert column types to string
df_business = df_business.convert_dtypes()
#print(f"df_business.dtypes: \n{df_business.dtypes}")

# Create new column containing a specific category of interest. 
# If not in interest, label the column value "Other"
df_business['category_of_interest'] = "Other"
for item in categories_of_interest:
    df_business.loc[df_business['categories'].str.contains(item), 'category_of_interest'] = item

# Define the days of the weeks for iteration
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
rating_groups = ["Rating 1-2", "Rating 2-3", "Rating 3-4", "Rating 4-5"]

df_business["Rating_Group"] = pd.cut(df_business["stars"], bins=[1,2,3,4,5], labels=rating_groups)

for day in weekdays:
    # Drop the columns where shops are closed
    df_business = df_business[df_business["hours_" + day] != "Closed"]
    # Create columns for our x-values
    df_business[day + "_Hour_Of_Opening_Float"] = df_business["hours_" + day ].apply(get_opening_float)
    # Create columns for our y-values
    df_business[day + "_Open_Duration_Float"] = df_business["hours_" + day].apply(get_open_duration_float)

df_business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours_Monday,hours_Tuesday,hours_Wednesday,hours_Thursday,hours_Friday,hours_Saturday,hours_Sunday,category_of_interest,Rating_Group,Monday_Hour_Of_Opening_Float,Monday_Open_Duration_Float,Tuesday_Hour_Of_Opening_Float,Tuesday_Open_Duration_Float,Wednesday_Hour_Of_Opening_Float,Wednesday_Open_Duration_Float,Thursday_Hour_Of_Opening_Float,Thursday_Open_Duration_Float,Friday_Hour_Of_Opening_Float,Friday_Open_Duration_Float,Saturday_Hour_Of_Opening_Float,Saturday_Open_Duration_Float,Sunday_Hour_Of_Opening_Float,Sunday_Open_Duration_Float
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",07:00-20:00,07:00-20:00,07:00-20:00,07:00-20:00,07:00-21:00,07:00-21:00,07:00-21:00,Other,Rating 3-4,7.0,13.0,7.0,13.0,7.0,13.0,7.0,13.0,7.0,14.0,7.0,14.0,7.0,14.0
3,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4.0,65,0,"Cocktail Bars, Bars, Italian, Nightlife, Resta...",16:00-00:00,16:00-00:00,16:00-00:00,12:00-00:00,12:00-02:00,11:00-02:00,11:00-00:00,Italian,Rating 3-4,16.0,8.0,16.0,8.0,16.0,8.0,12.0,12.0,12.0,14.0,11.0,15.0,11.0,13.0
9,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,Philadelphia,PA,19124,40.012141,-75.115015,1.5,15,1,"Burgers, Restaurants, Fast Food",10:00-02:00,10:00-02:00,10:00-02:00,10:00-02:00,10:00-02:00,10:00-02:00,10:00-02:00,Other,Rating 1-2,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0,10.0,16.0


In [124]:
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind 
from bokeh.models import Legend, HoverTool
import numpy as np


def show_scatter():
    # Display Bokeh plots in the notebook
    output_notebook()

    fig = figure(title="Scatterplot Detailing Opening Hours of Businesses",
            width = 800, 
            x_axis_label="Hours of Opening", 
            y_axis_label="Number of Hours Restaurant Remains Open")

    # Get the same amount of colors from the Colorblind palette as there are items in weekdays
    colors = Colorblind[len(rating_groups)]

    # Create a 2d array of size len(rating_groups)*len(weekdays)
    s = [[None for _ in range(len(weekdays))] for _ in range(len(rating_groups))]

    for i, rating_group in enumerate(rating_groups):
        df = df_business[df_business["Rating_Group"] == rating_group]
        for j, day in enumerate(weekdays):
            scatter = fig.scatter(df[day + '_Hour_Of_Opening_Float'], 
                df[day + "_Open_Duration_Float"], 
                size = 7,
                color = colors[i], 
                alpha = 0.3)
            scatter.name = rating_group + ", " + day # For sanity checking that the indices are correct. These can be printed.
            s[i][j] = scatter

    # Create separate legend to control visibility of all dots within a rating group
    list_of_touples_for_rating_groups_legend = []
    for i, rating_group in enumerate(rating_groups):
        touple_containing_name_and_associated_labels = (rating_group, s[i])
        list_of_touples_for_rating_groups_legend.append(touple_containing_name_and_associated_labels)
    legend_groups = Legend(items=list_of_touples_for_rating_groups_legend, orientation="vertical")

    # Wrote this out in order to make sure it was correct. The code can be a lot less
    scatterplots_monday_across_groups =      [s[i][0] for i in range(len((rating_groups)))]
    scatterplots_tuesday_across_groups =     [s[i][1] for i in range(len((rating_groups)))]
    scatterplots_wednesday_across_groups =   [s[i][2] for i in range(len((rating_groups)))]
    scatterplots_thursday_across_groups =    [s[i][3] for i in range(len((rating_groups)))]
    scatterplots_friday_across_groups =      [s[i][4] for i in range(len((rating_groups)))]
    scatterplots_saturday_across_groups =    [s[i][5] for i in range(len((rating_groups)))]
    scatterplots_sunday_across_groups =      [s[i][6] for i in range(len((rating_groups)))]

    days_across_groups = [scatterplots_monday_across_groups,
        scatterplots_tuesday_across_groups,
        scatterplots_wednesday_across_groups,
        scatterplots_thursday_across_groups,
        scatterplots_friday_across_groups,
        scatterplots_saturday_across_groups,
        scatterplots_sunday_across_groups]

    # Create separate legend to control visibility of all dots within a associated with a certain weekday
    list_of_touples_for_weekday_legend = []
    for j, day in enumerate(weekdays):
        touple_containing_name_and_associated_labels = (day, days_across_groups[j])
        list_of_touples_for_weekday_legend.append(touple_containing_name_and_associated_labels)
    legend_weekdays = Legend(items=list_of_touples_for_weekday_legend, orientation="vertical")

    # At the legends to layout
    fig.add_layout(legend_groups, 'right')
    fig.add_layout(legend_weekdays, 'right')

    # Make the legend interactive. Hide data of a certain legend item upon click.
    fig.legend.click_policy = "hide"

    # Show the plot
    show(fig)
show_scatter()

In [125]:
# Naive approach
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind 
from bokeh.models import Legend
import numpy as np

 
# Display Bokeh plots in the notebook
output_notebook()

fig = figure(title="Scatterplot Detailing Opening Hours of Businesses",
           width = 800, 
           x_axis_label="Hours of Opening", 
           y_axis_label="Number of Hours Restaurant Remains Open")

# Get the same amount of colors from the Colorblind palette as there are items in weekdays
colors = Colorblind[len(rating_groups)]

# Create a 2d array of size len(rating_groups)xlen(weekdays)
s = [[None for _ in range(len(weekdays))] for _ in range(len(rating_groups))]

for i, rating_group in enumerate(rating_groups):
    df = df_business[df_business["Rating_Group"] == rating_group]
    for j, day in enumerate(weekdays):
        scatter = fig.scatter(df[day + '_Hour_Of_Opening_Float'], 
            df[day + "_Open_Duration_Float"], 
            size = 7,
            color = colors[i], 
            alpha = 0.3,
            legend_label=rating_group + ", " + day)

# Make the legend interactive. Hide data of a certain legend item upon click.
fig.legend.click_policy = "hide"

# Show the plot
show(fig)

# Filter with Widget Experiment

In [126]:
# https://stackoverflow.com/questions/56526738/implementing-javascript-callback-for-checkboxes-in-bokeh

# https://docs.bokeh.org/en/2.4.0/docs/reference/plotting/figure.html?highlight=figure#bokeh.plotting.Figure.annular_wedge
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import Slider, CheckboxGroup, CustomJS, ColumnDataSource, CDSView
from bokeh.models.filters import CustomJSFilter
from bokeh.layouts import row
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10_10
output_notebook()
from bokeh.sampledata import iris
source = ColumnDataSource(data=iris.flowers)
species = iris.flowers.species.unique().tolist()
checkboxes = CheckboxGroup(labels=species, active=list(range(len(species))))
fig = figure()
filter =  CustomJSFilter(code="""
let selected = checkboxes.active.map(i=>checkboxes.labels[i]);
let indices = [];
let column = source.data.species;
for(let i=0; i<column.length; i++){
    if(selected.includes(column[i])){
        indices.push(i);
    }
}
return indices;
""", args=dict(checkboxes=checkboxes, source=source))

checkboxes.js_on_change("active", CustomJS(code="source.change.emit();", args=dict(source=source)))

fig.scatter("sepal_length", "sepal_width", 
            color=factor_cmap("species", Category10_10, species),
            source=source, view=CDSView(source=source, filters=[filter]))
show(row(checkboxes, fig))

ValueError: failed to validate CustomJSFilter(id='p11286', ...).args: Disallowed keys: {'source'}

# Filter with widget experiment in scatterplot with alien dataset

In [None]:
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, CustomJS, Select
from bokeh.plotting import figure, save, output_file
from pandas_datareader import wb

# Download data
df = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008)
df = df.reset_index()

# Set up data sources
source = ColumnDataSource(df)
original_source = ColumnDataSource(df)

# Define fixed axis ranges based on the full dataset
x_min, x_max = df['year'].min(), df['year'].max()
y_min, y_max = df['NY.GDP.PCAP.KD'].min(), df['NY.GDP.PCAP.KD'].max()

# Create the scatter plot figure with fixed axis ranges
fig = figure(
    title="GDP per Capita over Years",
    x_axis_label="Year",
    y_axis_label="GDP per Capita (NY.GDP.PCAP.KD)",
    x_range=(x_min, x_max),
    y_range=(y_min, y_max)
)
scatter = fig.scatter(x="year", y="NY.GDP.PCAP.KD", source=source, size=10, color="navy", alpha=0.5)

# Callback code for filtering
combined_callback_code = """
var data = source.data;
var original_data = original_source.data;
var country = country_select_obj.value;
console.log("country: " + country);
var year = year_select_obj.value;
console.log("year: " + year);
for (var key in original_data) {
    data[key] = [];
    for (var i = 0; i < original_data['country'].length; ++i) {
        if ((country === "ALL" || original_data['country'][i] === country) &&
            (year === "ALL" || original_data['year'][i] == year)) {
            data[key].push(original_data[key][i]);
        }
    }
}
source.change.emit();
"""

# Define filter widgets
country_list = ['ALL'] + df['country'].unique().tolist()
country_select = Select(title="Country:", value=country_list[0], options=country_list)
year_list = ['ALL'] + df['year'].unique().tolist()
year_select = Select(title="Year:", value=year_list[0], options=year_list)

# Define callback with the plot as the target object
generic_callback = CustomJS(
    args=dict(source=source, 
              original_source=original_source, 
              country_select_obj=country_select, 
              year_select_obj=year_select),
    code=combined_callback_code
)

# Connect callbacks to filter widgets
country_select.js_on_change('value', generic_callback)
year_select.js_on_change('value', generic_callback)

# Layout and output
layout = column(country_select, year_select, fig)
output_file('scatterplot_filter_fixed_axes.html')
save(layout)


  df = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008)


'c:\\Users\\andre\\Documents\\Computer Science Masters\\Data Visualization\\Semester-project\\GitHub\\datavis-group24\\andreas-experiments\\scatterplot_filter_fixed_axes.html'

# Filter by Widget Scatterplot with Our Dataset

In [None]:
# https://stackoverflow.com/questions/56526738/implementing-javascript-callback-for-checkboxes-in-bokeh
# https://docs.bokeh.org/en/2.4.0/docs/reference/plotting/figure.html?highlight=figure#bokeh.plotting.Figure.annular_wedge

from bokeh.io import show
from bokeh.models import CheckboxGroup, CustomJS

checkbox_group_rating_groups = CheckboxGroup(labels=rating_groups)
checkbox_group_rating_groups.js_on_change('active', CustomJS(code="""
    console.log('checkbox_group_rating_groups: active=' + this.active, this.toString())
"""))
show(checkbox_group_rating_groups)

checkbox_group_weekdays = CheckboxGroup(labels=weekdays)
checkbox_group_weekdays.js_on_change('active', CustomJS(code="""
    console.log('checkbox_group_weekdays: active=' + this.active, this.toString())
"""))
show(checkbox_group_weekdays)




In [None]:
df_business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours_Monday', 'hours_Tuesday', 'hours_Wednesday',
       'hours_Thursday', 'hours_Friday', 'hours_Saturday', 'hours_Sunday',
       'category_of_interest', 'Rating_Group', 'Monday_Hour_Of_Opening_Float',
       'Monday_Open_Duration_Float', 'Tuesday_Hour_Of_Opening_Float',
       'Tuesday_Open_Duration_Float', 'Wednesday_Hour_Of_Opening_Float',
       'Wednesday_Open_Duration_Float', 'Thursday_Hour_Of_Opening_Float',
       'Thursday_Open_Duration_Float', 'Friday_Hour_Of_Opening_Float',
       'Friday_Open_Duration_Float', 'Saturday_Hour_Of_Opening_Float',
       'Saturday_Open_Duration_Float', 'Sunday_Hour_Of_Opening_Float',
       'Sunday_Open_Duration_Float'],
      dtype='object')

In [None]:
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, CustomJS, Select
from bokeh.plotting import figure, save, output_file
from pandas_datareader import wb

# Download data
# df = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008)
# df = df.reset_index()

# Set up data sources
source = ColumnDataSource(df_business)
original_source = ColumnDataSource(df_business)

# Define fixed axis ranges based on the full dataset
x_min, x_max = 0, 24
y_min, y_max = 0, 16

# Define colors
colors = Colorblind[len(rating_groups)]

# Create the scatter plot figure with fixed axis ranges
fig = figure(
    title="GDP per Capita over Years",
    x_axis_label="Opening Hour",
    y_axis_label="Duration",
    x_range=(x_min, x_max),
    y_range=(y_min, y_max)
)

for i, rating_group in enumerate(rating_groups):
    for j, weekday in enumerate(weekdays):
        fig.scatter(x=weekday + "_Opening_Hours_Float", y=weekday + "_Open_Duration_Float", 
                    source=source, size=7, color=colors[i], alpha=0.5)


# Callback code for filtering
combined_callback_code = """
var data = source.data;
var original_data = original_source.data;
var country = country_select_obj.value;
console.log("country: " + country);
var year = year_select_obj.value;
console.log("year: " + year);
for (var key in original_data) {
    data[key] = [];
    for (var i = 0; i < original_data['country'].length; ++i) {
        if ((country === "ALL" || original_data['country'][i] === country) &&
            (year === "ALL" || original_data['year'][i] == year)) {
            data[key].push(original_data[key][i]);
        }
    }
}
source.change.emit();
"""

# Define filter widgets
country_list = ['ALL'] + df_business['country'].unique().tolist()
country_select = Select(title="Country:", value=country_list[0], options=country_list)
year_list = ['ALL'] + df_business['year'].unique().tolist()
year_select = Select(title="Year:", value=year_list[0], options=year_list)

# Define callback with the plot as the target object
generic_callback = CustomJS(
    args=dict(source=source, 
              original_source=original_source, 
              country_select_obj=country_select, 
              year_select_obj=year_select),
    code=combined_callback_code
)

# Connect callbacks to filter widgets
country_select.js_on_change('value', generic_callback)
year_select.js_on_change('value', generic_callback)

# Layout and output
layout = column(country_select, year_select, fig)
output_file('scatterplot_filter_fixed_axes.html')
save(layout)


  df = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008)


'c:\\Users\\andre\\Documents\\Computer Science Masters\\Data Visualization\\Semester-project\\GitHub\\datavis-group24\\andreas-experiments\\scatterplot_filter_fixed_axes.html'

In [None]:
from bokeh.io import output_file, save
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, CustomJS, CheckboxGroup
from bokeh.plotting import figure
import pandas as pd

# Sample data for testing
data = {
    "business_id": [1, 2, 3],
    "Rating_Group": ["Rating 1-2", "Rating 2-3", "Rating 4-5"],
    "Monday_Hour_Of_Opening_Float": [8.0, 9.0, 8.5],
    "Monday_Open_Duration_Float": [10.0, 11.0, 9.5],
    "Tuesday_Hour_Of_Opening_Float": [8.0, 9.5, 8.0],
    "Tuesday_Open_Duration_Float": [10.5, 11.0, 10.0]
    # Continue this pattern for all days
}
df = pd.DataFrame(df_business)

# Reshape data to have columns 'day', 'Hour_Of_Opening', 'Open_Duration'
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
reshaped_data = []
for day in days:
    reshaped_data.append(
        df[["business_id", "Rating_Group"]].assign(
            day=day,
            Hour_Of_Opening=df.get(f"{day}_Hour_Of_Opening_Float"),
            Open_Duration=df.get(f"{day}_Open_Duration_Float")
        )
    )
reshaped_df = pd.concat(reshaped_data, ignore_index=True)

# Initialize data sources
source = ColumnDataSource(reshaped_df)
original_source = ColumnDataSource(reshaped_df)

# Create figure
fig = figure(title="Opening Hour vs. Duration by Day of Week and Rating Group",
           x_axis_label="Hour of Opening", y_axis_label="Open Duration",
           x_range = (0,24), y_range = (0,24))
colors = ["red", "blue", "green", "orange", "purple", "brown", "gray"]
for i, day in enumerate(days):
    fig.circle(x="Hour_Of_Opening", y="Open_Duration", source=source,
             color=colors[i], legend_label=day, alpha=0.6, size=8)

# Create checkboxes for days and rating groups
day_checkboxes = CheckboxGroup(labels=days, active=list(range(len(days))))  # All days active initially
rating_groups = reshaped_df["Rating_Group"].unique().tolist()
rating_checkboxes = CheckboxGroup(labels=rating_groups, active=list(range(len(rating_groups))))

# JavaScript code for the callback
callback_code = """
var data = source.data;
var original_data = original_source.data;
var selected_days = day_checkboxes_obj.active.map(i => day_checkboxes_obj.labels[i]);
var selected_ratings = rating_checkboxes_obj.active.map(i => rating_checkboxes_obj.labels[i]);

// Reset the data arrays
for (var key in data) {
    data[key] = [];
}

// Filter data based on selected days and rating groups
for (var i = 0; i < original_data['day'].length; i++) {
    if (selected_days.includes(original_data['day'][i]) && selected_ratings.includes(original_data['Rating_Group'][i])) {
        for (var key in data) {
            data[key].push(original_data[key][i]);
        }
    }
}

source.change.emit();
"""

# Set up callback with the data sources and checkboxes
callback = CustomJS(args=dict(source=source, 
                              original_source=original_source,
                              day_checkboxes_obj=day_checkboxes, 
                              rating_checkboxes_obj=rating_checkboxes),
                    code=callback_code)

# Attach callback to checkboxes
day_checkboxes.js_on_change("active", callback)
rating_checkboxes.js_on_change("active", callback)

# Layout
layout = column(row(day_checkboxes, rating_checkboxes), fig)
#output_file("scatterplot_filter.html")
#save(layout)
output_notebook()




ValueError: failed to validate CheckboxGroup(id='p2830', ...).labels: expected an element of List(String), got seq with invalid items [nan]