# Bokeh Line Chart

In [None]:
from bokeh.io import output_notebook

# Configure Bokeh for notebook output so that the show() calls in the cells
# below render their figures inline
output_notebook()

In [None]:
# Import data
import pandas as pd

df_business = pd.read_csv("../data/cleaned_businessV2.csv")
df_review = pd.read_csv("../data/crosslisted_reviews.csv")

# Both tables carry a "stars" column. Rename each one so that, after the join,
# it is clear which table every rating column comes from
df_business = df_business.rename(columns={"stars": "business_stars"})
df_review = df_review.rename(columns={"stars": "review_stars"})

# Join the two tables; the inner join keeps only rows whose business_id
# appears in both of them
df_joined = pd.merge(df_business, df_review, on="business_id", how="inner")

# Let pandas pick the best-fitting (nullable) dtype for every column, then
# force the review rating to plain float so it can be averaged later
df_joined = df_joined.convert_dtypes()
df_joined["review_stars"] = df_joined["review_stars"].astype(float)

# Drop the rows (not columns) that have no address.
# It turned out that the rows without postal codes also did not have addresses,
# so filtering on "address" alone is enough.
# The subset is passed as a list because a bare string is only accepted by
# newer pandas releases.
df_joined = df_joined.dropna(subset=["address"])

# Report how many missing values remain per column
print(df_joined.isna().sum())

In [None]:
# Define kinds of restaurants we are interested in. May need to delete this later
# to allow the user to define this with UI
categories_of_interest = ["Chinese", "Japanese", "Italian", "Polish", "Scandinavian"]

# Create a new column containing the category of interest found in the
# "categories" string of each row. Rows matching none keep the label "Other".
# When a row matches several categories, the one listed last wins because
# every pass of the loop overwrites the labels set by earlier passes.
df_joined["category_of_interest"] = "Other"
for item in categories_of_interest:
    # na=False counts a missing "categories" value as "no match" instead of
    # propagating a missing value into the boolean mask
    is_match = df_joined["categories"].str.contains(item, na=False)
    df_joined.loc[is_match, "category_of_interest"] = item

# Extract the year, i.e. everything before the first "-" of the date string.
# The vectorised .str accessor replaces a per-row Python lambda.
df_joined["Year"] = df_joined["date"].str.split("-", n=1).str[0]

# Average review rating per (category, year)
df_grouped = df_joined.groupby(["category_of_interest", "Year"])["review_stars"].mean()
df_grouped.head(20)

In [None]:
from bokeh.plotting import figure, show
from bokeh.palettes import Colorblind

# Define the amount of "air" around the borders of our plot
margin = 0.5

# Define the borders for our plot as the min and max values of the data under
# analysis; the y borders are additionally widened by the margin
min_x_val = df_joined["Year"].astype(int).min()
max_x_val = df_joined["Year"].astype(int).max()
min_y_val = df_grouped[categories_of_interest].astype(float).min() - margin
max_y_val = df_grouped[categories_of_interest].astype(float).max() + margin

# Create a new plot with a title and axis labels
p = figure(
    title="Average Rating Throughout the Years by Category",
    x_axis_label="Year",
    y_axis_label="Average Rating",
    x_range=(min_x_val, max_x_val),
    y_range=(min_y_val, max_y_val),
)

# Get a list of color-blind friendly colors of length = len(categories_of_interest)
colors = Colorblind[len(categories_of_interest)]

# Make a line for each category
for i, category in enumerate(categories_of_interest):
    category_means = df_grouped[category]
    # The "Year" index level holds strings, while x_range above is numeric.
    # Cast the years to int so the points land on the numeric axis.
    x = category_means.index.astype(int).tolist()
    y = category_means.tolist()
    p.line(x, y, legend_label=category, line_width=2, color=colors[i])

# Move legend outside the plot
p.add_layout(p.legend[0], "right")

# Hide line when clicked on its legend item
p.legend.click_policy = "hide"

# Show the results
show(p)

In [None]:
from bokeh.plotting import figure, show
from bokeh.palettes import Colorblind

# Define the amount of "air" around the borders of our plot
margin = 0.5

# Define the x borders for our plot as the first and last year in the data
min_x_val = df_joined["Year"].astype(int).min()
max_x_val = df_joined["Year"].astype(int).max()

# Calculate the moving average for each category
window_size = 3  # You can adjust the window size as needed

# Step 1: average rating per (category, year)
yearly_mean = df_joined.groupby(["category_of_interest", "Year"])["review_stars"].mean()

# Step 2: smooth every category separately with a rolling mean over the years.
# transform() keeps the original (category, year) index. Chaining
# groupby().rolling() instead would prepend the group key to an index that
# already contains it, and reset_index() would then collide on the
# duplicated "category_of_interest" level.
# The first window_size - 1 years of every category are NaN because the
# window is not yet full.
df_moving_avg = (
    yearly_mean.groupby(level="category_of_interest")
    .transform(lambda ratings: ratings.rolling(window=window_size).mean())
    .reset_index()
)

# Create a new plot with a title and axis labels
p = figure(
    title="Average Rating Throughout the Years by Category (Moving Average)",
    x_axis_label="Year",
    y_axis_label="Average Rating",
    x_range=(min_x_val, max_x_val),
)

# Get a list of color-blind friendly colors of length = len(categories_of_interest)
colors = Colorblind[len(categories_of_interest)]

# Make a line for each category
for i, category in enumerate(categories_of_interest):
    category_data = df_moving_avg[df_moving_avg["category_of_interest"] == category]
    # Years are stored as strings; cast them to int to match the numeric x axis
    x = category_data["Year"].astype(int).tolist()
    y = category_data["review_stars"].tolist()
    p.line(x, y, legend_label=category, line_width=2, color=colors[i])

# Move legend outside the plot
p.add_layout(p.legend[0], "right")

# Hide line when clicked on its legend item
p.legend.click_policy = "hide"

# Show the results
show(p)

In [None]:
import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
from scipy.signal import windows

categories = ["Burger", "Chinese", "Mexican", "Italian", "Thai"]

# Import data
df_business = pd.read_csv("../data/cleaned_businessV2.csv")
df_review = pd.read_csv("../data/crosslisted_reviews.csv")
df_review = df_review.rename(columns={"stars": "review_stars"})
df_joined = pd.merge(df_business, df_review, on="business_id", how="inner")

# Label every row with the category found in its "categories" string; rows
# matching none keep "Other". Later entries of `categories` overwrite earlier
# ones when a row matches more than one.
df_joined["category_of_interest"] = "Other"
for item in categories:
    df_joined.loc[
        df_joined["categories"].str.contains(item, na=False), "category_of_interest"
    ] = item

# Keep only the columns needed for the time series. The explicit copy makes
# the frame independent of the merged one, so the assignment to "date" below
# does not write into a slice.
df_joined = df_joined[["review_stars", "date", "category_of_interest"]].copy()
df_joined["date"] = pd.to_datetime(df_joined["date"])

# Work on a reproducible random sample of at most 10000 reviews; the size is
# capped at the number of available rows so that small data sets do not fail
sample_size = min(10000, len(df_joined))
df_sample = df_joined.sample(sample_size, random_state=42).sort_values("date")

# One column per category, one row per review timestamp
df_pivot = df_sample.pivot_table(
    index="date", columns="category_of_interest", values="review_stars", aggfunc="mean"
)

# Average per calendar day, then smooth with a 365-day triangular window.
# min_periods=1 yields a value as soon as one observation is in the window.
# NOTE(review): pandas documents win_type windows as requiring SciPy - confirm
# it is installed in the target environment.
df_resampled = df_pivot.resample("D").mean()
df_rolling = df_resampled.rolling(365, min_periods=1, win_type="triang").mean()

df_rolling.tail()

In [None]:
# Wrap the smoothed ratings in a Bokeh data source
source = ColumnDataSource(df_rolling)

# Figure with a datetime x axis, since the rolling frame is indexed by date
p = figure(
    x_axis_type="datetime",
    title="Average Rating Throughout the Years by Category",
    x_axis_label="Year",
    y_axis_label="Average Rating",
)

# One fixed color per entry of `categories`, matched by position
colors = ["blue", "green", "red", "orange", "purple"]

for idx, label in enumerate(categories):
    # Skip categories that have no column in the rolling frame
    if label not in df_rolling.columns:
        continue
    line_style = {"legend_label": label, "color": colors[idx]}
    p.line(df_rolling.index, df_rolling[label], **line_style)

# Print the column dtypes of the joined frame
print(df_joined.dtypes)

# Render the figure
show(p)