# Project B


This notebook performs an exploratory data analysis of the chosen dataset.

## Packages Needed

In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import zipfile

from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, FactorRange, Legend, HoverTool
from bokeh.palettes import Category20c, Pastel1
from bokeh.layouts import column, row, widgetbox, gridplot

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

## Load Business Dataset + Prep

In [105]:
# Read only the columns this notebook works with; the CSV is large, so the
# remaining columns are skipped at load time.
columns_to_load = [
    'name', 'address', 'city', 'state', 'latitude', 'longitude', 'stars',
    'attributes', 'categories', 'hours', 'text', 'date', 'cat_kitchen', 'cat_type',
]
df = pd.read_csv('yelp_reviews_RV_categories.csv', usecols=columns_to_load)
print(df.shape)

(5177322, 14)


In [106]:
# Convert the review timestamps to datetime64 so the `.dt` accessor can be used below.
df['date'] = df['date'].pipe(pd.to_datetime)

In [107]:
# Number of review rows per focus category, largest first.
df.cat_type.value_counts()

Nightlife             997131
Bars                  974350
Food                  684773
Breakfast & Brunch    497283
Sandwiches            266517
Burgers               244596
Sushi Bars            216243
Pizza                 203438
Salad                 180963
Wine Bars             151418
Vegetarian            149865
Steakhouses           137014
Coffee & Tea          117560
Diners                106299
Cafes                 105476
Fast Food             100563
Buffets                43833
Name: cat_type, dtype: int64

## Plots for the Overview page

### General plots - Michelle

In [108]:
# Keep the first review row seen for every distinct restaurant name.
# NOTE(review): de-duplicating on `name` alone collapses separate locations that
# share a name (e.g. chains) into a single row; confirm that is intended.
restaurants = df[~df.duplicated(subset='name', keep='first')]
restaurants.shape

(8688, 14)

In [109]:
# Restaurants per kitchen type, most common first. `Type` is a string copy of
# the kitchen label used as the x-coordinate of the bar chart.
group_state_ = (
    restaurants
    .groupby('cat_kitchen')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
    .assign(Type=lambda counts: counts['cat_kitchen'].astype(str))
)
df_dict_ = group_state_.to_dict(orient='list')

In [110]:
# Restaurants per focus category, most common first. `Type` is a string copy of
# the category label used as the x-coordinate of the bar chart.
group_state__ = (
    restaurants
    .groupby('cat_type')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
    .assign(Type=lambda counts: counts['cat_type'].astype(str))
)
df_dict__ = group_state__.to_dict(orient='list')

In [111]:
# Restaurants per state, most common first. The state label is cast to string
# in place so it can serve as a categorical factor on the x-axis.
group_state = (
    restaurants
    .groupby('state')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
    .assign(state=lambda counts: counts['state'].astype(str))
)
df_dict = group_state.to_dict(orient='list')

In [112]:
# Bar chart of the per-state counts held in `group_state` / `df_dict`.
title = 'Restaurants by state'
xlabel = 'State'
range_x = group_state['state'].unique().tolist()

plot1 = figure(
    title=title,
    x_axis_label=xlabel,
    x_range=FactorRange(factors=range_x),
    plot_width=600,
    plot_height=300,
    toolbar_location=None,
)
plot1.vbar(x='state', top='Counts', bottom=0, width=0.7, source=df_dict)

# Show the bar's count on hover.
# NOTE(review): the sibling plots use "@Counts" without a format spec; confirm
# that the "{1}" format here is intended.
plot1.add_tools(HoverTool(tooltips=[('Count', "@Counts{1}")]))

# Hide major and minor tick marks on both axes.
for _axis in (plot1.xaxis, plot1.yaxis):
    _axis.major_tick_line_color = None
    _axis.minor_tick_line_color = None

# Title styling.
plot1.title.text_font_size = '13pt'
plot1.title.align = 'center'

show(plot1)

In [113]:
# Bar chart of the per-kitchen counts held in `group_state_` / `df_dict_`.
title = 'Restaurants by kitchen'
xlabel = 'Kitchen type'
range_x = group_state_['cat_kitchen'].unique().tolist()

plot2 = figure(
    title=title,
    x_axis_label=xlabel,
    x_range=FactorRange(factors=range_x),
    plot_width=600,
    plot_height=500,
    toolbar_location=None,
)
plot2.vbar(x='Type', top='Counts', bottom=0, width=0.7, source=df_dict_)

# Show the bar's count on hover.
plot2.add_tools(HoverTool(tooltips=[('Count', "@Counts")]))

# Hide major and minor tick marks on both axes.
for _axis in (plot2.xaxis, plot2.yaxis):
    _axis.major_tick_line_color = None
    _axis.minor_tick_line_color = None

# Rotate the category labels so they do not overlap; then style the title.
plot2.xaxis.major_label_orientation = "vertical"
plot2.title.text_font_size = '13pt'
plot2.title.align = 'center'

show(plot2)

In [114]:
# Bar chart of the per-category counts held in `group_state__` / `df_dict__`.
title = 'Restaurants by Categories'
xlabel = 'Categories'
range_x = group_state__['cat_type'].unique().tolist()

plot3 = figure(
    title=title,
    x_axis_label=xlabel,
    x_range=FactorRange(factors=range_x),
    plot_width=600,
    plot_height=500,
    toolbar_location=None,
)
plot3.vbar(x='Type', top='Counts', bottom=0, width=0.7, source=df_dict__)

# Show the bar's count on hover.
plot3.add_tools(HoverTool(tooltips=[('Count', "@Counts")]))

# Hide major and minor tick marks on both axes.
for _axis in (plot3.xaxis, plot3.yaxis):
    _axis.major_tick_line_color = None
    _axis.minor_tick_line_color = None

# Rotate the category labels so they do not overlap; then style the title.
plot3.xaxis.major_label_orientation = "vertical"
plot3.title.text_font_size = '13pt'
plot3.title.align = 'center'

show(plot3)

### Score plots - Michelle

### Time plots - Julius

**For each hour, each day**

In [125]:
# Weekday order used for the legend / columns.
sorter = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']

# Count reviews per (weekday, hour of day), put the weekdays in calendar order,
# then pivot so that every weekday becomes its own column indexed by hour.
day_hour = (
    df.groupby([df['date'].dt.day_name(), df['date'].dt.hour])['stars']
    .count()
    .reindex(sorter, level=0)
    .rename_axis(["Weekday", "Hour of the day"])
    .unstack("Weekday")
)

# One line per weekday; a unified hover box compares all weekdays at one hour.
fig = (
    px.line(day_hour, x=day_hour.index, y=day_hour.columns, width=1000, height=600)
    .update_traces(hovertemplate=None)
    .update_layout(
        title_text="Total number of reviews during the day",
        yaxis_title="Number of reviews",
        hovermode="x unified",
    )
)
fig.show()

**Each hour, each category**

In [126]:
# Count reviews per (focus category, hour of day), then pivot so that every
# category becomes its own column indexed by hour.
cat_hour = (
    df.groupby(['cat_type', df['date'].dt.hour])['stars']
    .count()
    .rename_axis(["Category", "Hour of the day"])
    .unstack("Category")
)

# One line per category; a unified hover box compares all categories at one hour.
fig = (
    px.line(cat_hour, x=cat_hour.index, y=cat_hour.columns, width=1000, height=600)
    .update_traces(hovertemplate=None)
    .update_layout(
        title_text="Review times for each focus category",
        yaxis_title="Number of reviews",
        hovermode="x unified",
    )
)
fig.show()

**Each year - bar chart**

In [94]:
# Number of reviews per calendar year, ordered chronologically.
#
# The axis and the count column are named explicitly rather than renamed from
# the labels that value_counts()/reset_index() happen to produce: pandas 2.0
# changed those defaults (the counts are now called "count" and the index
# carries the source column name), which would leave this frame without a
# "Year" column. sort_index() is applied while the year is still the index so
# the rows really are ordered by year.
years = (
    df['date'].dt.year
    .value_counts()
    .sort_index()
    .rename_axis("Year")
    .reset_index(name="Review count")
)

fig = px.bar(years, x="Year", y="Review count")
fig.update_traces(hovertemplate=None)
fig.update_layout(title_text="Number of reviews each year", 
                  xaxis_title="Year", 
                  yaxis_title="Number of reviews",
                  hovermode="x unified")
fig.show()

**Each month, each year** (see 2020 :o)

In [102]:
# Calendar order for the month axis.
sorter = ['January', 'February', 'March', 'April', 'May', 'June','July',
          'August','September','October','November','December']

# Reviews per (year, month) with 2021 excluded.
# NOTE(review): 2021 is presumably dropped because the data only covers part of
# that year; confirm and consider moving the cutoff to a named constant.
#
# Only the two needed columns are selected before filtering, so the review text
# is not copied, and the group keys are derived from the filtered rows
# themselves rather than from the full frame (which relied on implicit index
# alignment between the filtered frame and the unfiltered keys).
#
# The name `day_hour` is kept for compatibility even though the table holds
# month-by-year counts; it overwrites the weekday/hour table built earlier.
day_hour = (
    df.loc[df['date'].dt.year != 2021, ['date', 'stars']]
    .pipe(lambda reviews: reviews.groupby(
        [reviews['date'].dt.year, reviews['date'].dt.month_name()])['stars'].count())
    .reindex(sorter, level=1)
    .rename_axis(["Year", "Month"])
    .unstack("Year")
)

# One line per year; a unified hover box compares all years for a given month.
fig = px.line(day_hour, x=day_hour.index, y=day_hour.columns)
fig.update_traces(hovertemplate=None)
fig.update_layout(title_text="Total number of reviews each month in the different years", 
                  yaxis_title="Number of reviews",
                  hovermode="x unified")
fig.show()