In [9]:
# Import data
import pandas as pd
df_business = pd.read_csv("../data/cleaned_businessV2.csv")



# Prepare

In [10]:
# Helper functions
from datetime import datetime

def get_opening_float(time_interval):
    opening_time = time_interval.split("-")
    opening_hour, opening_minute = opening_time[0].split(":")
    opening_time_float = float(opening_hour) + float(opening_minute) / 60.0
    return opening_time_float

def get_open_duration_float(time_interval):
    # Split the string into start and end times
    start_time_str, end_time_str = time_interval.split('-')
    
    # Convert the strings to datetime objects
    start_time = datetime.strptime(start_time_str, "%H:%M")
    end_time = datetime.strptime(end_time_str, "%H:%M")
    
    # Calculate the difference in hours and return as a float
    time_difference = end_time - start_time
    hours = time_difference.total_seconds() / 3600
    
    return abs(hours)

test_time_interval = "18:30-17:45"
test_float = get_opening_float(test_time_interval)
test_duration = get_open_duration_float(test_time_interval)

print(test_float)
print(test_duration, type(test_duration))

# Handle NaN values
#print(f"df_business.isna().sum(): \n{df_business.isna().sum()}")
df_business = df_business.dropna(subset=["address"])
#print(f"df_business.isna().sum() after drop: \n{df_business.isna().sum()}")

18.5
0.75 <class 'float'>


In [11]:
# Define kinds of restaurants we are interested in. May need to delete this later
# to allow the user to define this with UI
categories_of_interest = ['Chinese', 'Japanese', 'Italian', 'Polish', 'Scandinavian']

# Convert column types to string
df_business = df_business.convert_dtypes()
#print(f"df_business.dtypes: \n{df_business.dtypes}")

# Create new column containing a specific category of interest. 
# If not in interest, label the column value "Other"
df_business['category_of_interest'] = "Other"
for item in categories_of_interest:
    df_business.loc[df_business['categories'].str.contains(item), 'category_of_interest'] = item

# Define the days of the weeks for iteration
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
rating_groups = ["Rating 1-2", "Rating 2-3", "Rating 3-4", "Rating 4-5"]

df_business["Rating_Group"] = pd.cut(df_business["stars"], bins=[1,2,3,4,5], labels=rating_groups)

for day in weekdays:
    # Drop the columns where shops are closed
    df_business = df_business[df_business["hours_" + day] != "Closed"]
    # Create columns for our x-values
    df_business[day + "_Hour_Of_Opening_Float"] = df_business["hours_" + day ].apply(get_opening_float)
    # Create columns for our y-values
    df_business[day + "_Open_Duration_Float"] = df_business["hours_" + day].apply(get_open_duration_float)

df_business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Wednesday_Hour_Of_Opening_Float,Wednesday_Open_Duration_Float,Thursday_Hour_Of_Opening_Float,Thursday_Open_Duration_Float,Friday_Hour_Of_Opening_Float,Friday_Open_Duration_Float,Saturday_Hour_Of_Opening_Float,Saturday_Open_Duration_Float,Sunday_Hour_Of_Opening_Float,Sunday_Open_Duration_Float
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,7.0,13.0,7.0,13.0,7.0,14.0,7.0,14.0,7.0,14.0
3,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4.0,65,...,16.0,16.0,12.0,12.0,12.0,10.0,11.0,9.0,11.0,11.0
9,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,Philadelphia,PA,19124,40.012141,-75.115015,1.5,15,...,10.0,8.0,10.0,8.0,10.0,8.0,10.0,8.0,10.0,8.0


In [92]:
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind 
from bokeh.models import Legend
import numpy as np

 
# Display Bokeh plots in the notebook
output_notebook()

fig = figure(title="Scatterplot Detailing Opening Hours of Businesses",
           width = 800, 
           x_axis_label="Hours of Opening", 
           y_axis_label="Number of Hours Restaurant Remains Open")

# Get the same amount of colors from the Colorblind palette as there are items in weekdays
colors = Colorblind[len(rating_groups)]

# Create a 2d array of size len(rating_groups)xlen(weekdays)
s = [[None for _ in range(len(weekdays))] for _ in range(len(rating_groups))]

for i, rating_group in enumerate(rating_groups):
    df = df_business[df_business["Rating_Group"] == rating_group]
    for j, day in enumerate(weekdays):
        scatter = fig.scatter(df[day + '_Hour_Of_Opening_Float'], 
            df[day + "_Open_Duration_Float"], 
            size = 7,
            color = colors[i], 
            alpha = 0.3)
        scatter.name = rating_group + ", " + day # For sanity checking that the indices are correct. These can be printed.
        s[i][j] = scatter

# Create separate legend to control visibility of all dots within a rating group
list_of_touples_for_rating_groups_legend = []
for i, rating_group in enumerate(rating_groups):
    touple_containing_name_and_associated_labels = (rating_group, s[i])
    list_of_touples_for_rating_groups_legend.append(touple_containing_name_and_associated_labels)
legend_groups = Legend(items=list_of_touples_for_rating_groups_legend, orientation="vertical")

# Wrote this out in order to make sure it was correct. The code can be a lot less
scatterplots_monday_across_groups =      [s[i][0] for i in range(len((rating_groups)))]
scatterplots_tuesday_across_groups =     [s[i][1] for i in range(len((rating_groups)))]
scatterplots_wednesday_across_groups =   [s[i][2] for i in range(len((rating_groups)))]
scatterplots_thursday_across_groups =    [s[i][3] for i in range(len((rating_groups)))]
scatterplots_friday_across_groups =      [s[i][4] for i in range(len((rating_groups)))]
scatterplots_saturday_across_groups =    [s[i][5] for i in range(len((rating_groups)))]
scatterplots_sunday_across_groups =      [s[i][6] for i in range(len((rating_groups)))]

days_across_groups = [scatterplots_monday_across_groups,
    scatterplots_tuesday_across_groups,
    scatterplots_wednesday_across_groups,
    scatterplots_thursday_across_groups,
    scatterplots_friday_across_groups,
    scatterplots_saturday_across_groups,
    scatterplots_sunday_across_groups]

# Create separate legend to control visibility of all dots within a associated with a certain weekday
list_of_touples_for_weekday_legend = []
for j, day in enumerate(weekdays):
    touple_containing_name_and_associated_labels = (day, days_across_groups[j])
    list_of_touples_for_weekday_legend.append(touple_containing_name_and_associated_labels)
legend_weekdays = Legend(items=list_of_touples_for_weekday_legend, orientation="vertical")

# At the legends to layout
fig.add_layout(legend_groups, 'right')
fig.add_layout(legend_weekdays, 'right')

# Make the legend interactive. Hide data of a certain legend item upon click.
fig.legend.click_policy = "hide"

# Show the plot
show(fig)

In [None]:
# Naive approach
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind 
from bokeh.models import Legend
import numpy as np

 
# Display Bokeh plots in the notebook
output_notebook()

fig = figure(title="Scatterplot Detailing Opening Hours of Businesses",
           width = 800, 
           x_axis_label="Hours of Opening", 
           y_axis_label="Number of Hours Restaurant Remains Open")

# Get the same amount of colors from the Colorblind palette as there are items in weekdays
colors = Colorblind[len(rating_groups)]

# Create a 2d array of size len(rating_groups)xlen(weekdays)
s = [[None for _ in range(len(weekdays))] for _ in range(len(rating_groups))]

for i, rating_group in enumerate(rating_groups):
    df = df_business[df_business["Rating_Group"] == rating_group]
    for j, day in enumerate(weekdays):
        scatter = fig.scatter(df[day + '_Hour_Of_Opening_Float'], 
            df[day + "_Open_Duration_Float"], 
            size = 7,
            color = colors[i], 
            alpha = 0.3,
            legend_label=rating_group + ", " + day)

# Make the legend interactive. Hide data of a certain legend item upon click.
fig.legend.click_policy = "hide"

# Show the plot
show(fig)