# Bokeh Line Chart

In [None]:
# Import data
import pandas as pd
# Load both tables and rename their "stars" columns up front, so that after
# the merge it is clear which table each rating column came from.
df_business = (
    pd.read_csv("../data/cleaned_businessV2.csv")
    .rename(columns={'stars' : 'business_stars'})
)
df_review = (
    pd.read_csv("../data/crosslisted_reviews.csv")
    .rename(columns={'stars' : 'review_stars'})
)

# Inner-join reviews onto businesses, let pandas pick the best supported dtype
# for every column (convert_dtypes), force review ratings to plain float, and
# drop the rows that have no address.
# It turned out that the rows without postal codes also did not have
# addresses, so filtering on "address" alone removes both kinds of gaps.
df_joined = (
    pd.merge(df_business, df_review, on="business_id", how="inner")
    .convert_dtypes()
    .assign(review_stars=lambda frame: frame["review_stars"].astype(float))
    .dropna(subset="address")
)

# Check for NaN: show the remaining missing-value count per column
print(df_joined.isna().sum())


business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
business_stars     0
review_count       0
is_open            0
categories         0
hours_Monday       0
hours_Tuesday      0
hours_Wednesday    0
hours_Thursday     0
hours_Friday       0
hours_Saturday     0
hours_Sunday       0
review_id          0
review_stars       0
date               0
dtype: int64


In [130]:
# Define kinds of restaurants we are interested in. May need to delete this later
# to allow the user to define this with UI
categories_of_interest = ['Chinese', 'Japanese', 'Italian', 'Polish', 'Scandinavian']

# Create new column containing a specific category of interest.
# If not in interest, label the column value "Other".
# NOTE: the labels are assigned one category at a time, so a business whose
# "categories" text mentions several categories of interest ends up labelled
# with the LAST matching entry of categories_of_interest.
df_joined['category_of_interest'] = "Other"
for item in categories_of_interest:
    # regex=False: match the category name literally (names may later come
    # from user input and contain regex metacharacters).
    # na=False: a missing "categories" value counts as "no match" instead of
    # producing an NA entry in the boolean mask, which .loc cannot index with.
    is_match = df_joined['categories'].str.contains(item, regex=False, na=False)
    df_joined.loc[is_match, 'category_of_interest'] = item

# Extract the year from the date time string: everything before the first "-".
# The result stays a string; cast it where a numeric year is needed.
df_joined["Year"] = df_joined["date"].str.split("-", n=1).str[0]

# Mean review rating per (category, year); a Series with a two-level index
df_grouped = df_joined.groupby(['category_of_interest', "Year"])["review_stars"].mean()
df_grouped.head(20)


category_of_interest  Year
Chinese               2005    4.000000
                      2006    4.058824
                      2007    3.923387
                      2008    3.830046
                      2009    3.745217
                      2010    3.776471
                      2011    3.733413
                      2012    3.800978
                      2013    3.781028
                      2014    3.876858
                      2015    3.848823
                      2016    3.817176
                      2017    3.864823
                      2018    3.893726
                      2019    3.870541
                      2020    3.902753
                      2021    4.017069
                      2022    3.981250
Italian               2005    3.944444
                      2006    4.024390
Name: review_stars, dtype: float64

In [None]:
from bokeh.plotting import figure, show
from bokeh.palettes import Colorblind

# Define the amount of "air" added below/above the lowest/highest plotted rating
margin = 0.5

# Define the borders for our plot.
# x: first and last year found in the data. "Year" is cast to int here so the
# bounds are numeric.
years_as_int = df_joined["Year"].astype(int)
min_x_val = years_as_int.min()
max_x_val = years_as_int.max()
# y: min and max yearly average over the categories of interest, plus/minus
# the margin
ratings_of_interest = df_grouped[categories_of_interest].astype(float)
min_y_val = ratings_of_interest.min() - margin
max_y_val = ratings_of_interest.max() + margin

# create a new plot with a title and axis labels
p = figure(title="Average Rating Throughout the Years by Category",
           x_axis_label='Year',
           y_axis_label='Average Rating',
           x_range=(min_x_val, max_x_val),
           y_range=(min_y_val, max_y_val))

# Get one color-blind friendly color per category.
# The Colorblind palette dict only has entries for sizes 3 to 8, so clamp the
# requested size to that range and cycle through the palette if there are
# more categories than colors.
n_categories = len(categories_of_interest)
palette = Colorblind[min(max(n_categories, 3), 8)]
colors = [palette[i % len(palette)] for i in range(n_categories)]

# Make a line for each category
for category, color in zip(categories_of_interest, colors):
    ratings_by_year = df_grouped[category]
    # The group index holds the years; convert them to int so the x
    # coordinates are numeric like x_range above.
    x = [int(year) for year in ratings_by_year.index]
    y = ratings_by_year.tolist()
    p.line(x, y, legend_label=category, line_width=2, color=color)

# Move legend outside the plot
p.add_layout(p.legend[0], 'right')

# Hide line when clicked on its legend item
p.legend.click_policy = "hide"

# show the results
show(p)