In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json

# Load the SF city government's restaurant health-inspection export into a dataframe
input_data_file = "Restaurant_Scores_-_LIVES_Standard.csv"
health_df = pd.read_csv(filepath_or_buffer=input_data_file)

In [2]:
# Keep only the columns used downstream: identity, address, score and risk category
health_sm = health_df.loc[
    :,
    [
        "business_id",
        "business_name",
        "business_address",
        "inspection_score",
        "risk_category",
    ],
]
health_sm.head()

Unnamed: 0,business_id,business_name,business_address,inspection_score,risk_category
0,67448,Soo Fong Restaurant,3801 03rd St 370,92.0,Low Risk
1,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,86.0,Moderate Risk
2,93022,Wise Sons Delicatessen,537 Octavia St,92.0,Moderate Risk
3,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,84.0,Low Risk
4,79782,Deli 23,2449 23rd St,92.0,Moderate Risk


In [3]:
# Rows with no inspection score are of no use to us, so keep only the scored ones
health_sm = health_sm[health_sm["inspection_score"].notna()]
health_sm.count()


business_id         39808
business_name       39808
business_address    39808
inspection_score    39808
risk_category       37804
dtype: int64

In [4]:
# One group per restaurant, keyed on its unique "business_id"
health_group = health_sm.groupby(by="business_id")

In [6]:
# Fill out the dataframe to be written to file for the Yelp API:
# We will use the latest inspection score, plus average and min and max score just in case we want them later.
#
# Take each business's actual last row with tail(1) rather than GroupBy.last():
# last() returns the last *non-null* value of every column independently, so a
# business whose final row has no risk_category would get a recent_risk taken
# from an earlier row than its recent_score.
# NOTE(review): "latest" here means last in file order -- health_sm carries no
# date column to sort on, so this assumes the CSV lists each business's
# inspections oldest-to-newest. TODO confirm.
resto_df = (
    health_group.tail(1)
    .set_index("business_id")
    .sort_index()  # same business_id ordering the grouped result had
    .rename(columns={"inspection_score": "recent_score", "risk_category": "recent_risk"})
)

# Per-business summary statistics over every scored row; assignment aligns on business_id.
score_stats = health_group["inspection_score"].agg(["mean", "min", "max"])
resto_df["average_score"] = score_stats["mean"]
resto_df["minimum_score"] = score_stats["min"]
resto_df["maximum_score"] = score_stats["max"]
resto_df.head()

Unnamed: 0_level_0,business_name,business_address,recent_score,recent_risk,average_score,minimum_score,maximum_score
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19,Nrgize Lifestyle Cafe,"1200 Van Ness Ave, 3rd Floor",96.0,Moderate Risk,94.333333,94.0,96.0
24,OMNI S.F. Hotel - 2nd Floor Pantry,"500 California St, 2nd Floor",98.0,Low Risk,98.0,98.0,98.0
31,Norman's Ice Cream and Freezes,2801 Leavenworth St,96.0,Low Risk,96.0,96.0,96.0
45,CHARLIE'S DELI CAFE,3202 FOLSOM St,88.0,Low Risk,86.315789,84.0,88.0
48,ART'S CAFE,747 IRVING St,94.0,Low Risk,90.5,87.0,94.0


In [7]:
# Write the per-restaurant table to CSV with a header row; the business_id index
# is intentionally left out because it is no longer needed downstream.
resto_df.to_csv(path_or_buf="restaurants.csv", header=True, index=False)