## **Michelin Star Restaurants**

## **Imports**

In [1]:
#Numpy
import numpy as np

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly

#Geospatial
import folium

## **Objective**

The objective of this part of the project is to create a subset of the Michelin Star Restaurants (1, 2, and 3 stars together) that appear in the inspections data, and to find out what the most common violations are for each star rating. A further goal is to find out which tier of stars has better ratings overall, and which borough had the highest scores for Michelin Star Restaurants.

### **Loading in Datasets**

In [2]:
# Read the three Michelin guide CSVs, one file per star tier (1, 2 and 3 stars)
m1, m2, m3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'one-star-michelin-restaurants.csv',
        'two-stars-michelin-restaurants.csv',
        'three-stars-michelin-restaurants.csv',
    )
)

In [3]:
# Summarise the one-star dataset: row count, columns, non-null counts and dtypes
m1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       549 non-null    object 
 1   year       549 non-null    int64  
 2   latitude   549 non-null    float64
 3   longitude  549 non-null    float64
 4   city       547 non-null    object 
 5   region     549 non-null    object 
 6   zipCode    400 non-null    object 
 7   cuisine    549 non-null    object 
 8   price      398 non-null    object 
 9   url        549 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 43.0+ KB


In [4]:
# Summarise the two-star dataset: row count, columns, non-null counts and dtypes
m2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       110 non-null    object 
 1   year       110 non-null    int64  
 2   latitude   110 non-null    float64
 3   longitude  110 non-null    float64
 4   city       110 non-null    object 
 5   region     110 non-null    object 
 6   zipCode    77 non-null     object 
 7   cuisine    110 non-null    object 
 8   price      90 non-null     object 
 9   url        110 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 8.7+ KB


In [5]:
# Count how many two-star restaurants are listed under each city value
m2['city'].value_counts()

New York                  14
Hong Kong                 12
San Francisco              7
Los Angeles                6
Seoul                      5
Taipei                     5
Mayfair                    5
Macau                      5
Singapore                  5
Bangkok                    4
Wien                       4
Chicago                    3
København                  3
Stockholm                  2
Salzburg                   2
North Kensington           2
Washington, D.C.           2
Great Milton               1
Port Isaac                 1
Nottingham                 1
Cambridge                  1
Cartmel                    1
Marlow                     1
Aughton                    1
Summerhouse                1
Bloomsbury                 1
Hyde Park                  1
Järpen                     1
Auchterarder               1
Skåne-Tranås               1
Malmö                      1
São Paulo - 01411          1
São Paulo - 05416          1
Rio de Janeiro - 22441     1
City Centre   

In [6]:
# Summarise the three-star dataset: row count, columns, non-null counts and dtypes
m3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       36 non-null     object 
 1   year       36 non-null     int64  
 2   latitude   36 non-null     float64
 3   longitude  36 non-null     float64
 4   city       36 non-null     object 
 5   region     36 non-null     object 
 6   zipCode    24 non-null     object 
 7   cuisine    36 non-null     object 
 8   price      31 non-null     object 
 9   url        36 non-null     object 
dtypes: float64(2), int64(1), object(7)
memory usage: 2.9+ KB


In [7]:
# Count how many three-star restaurants are listed under each city value
m3['city'].value_counts()

Hong Kong              7
San Francisco          6
New York               5
Macau                  3
Seoul                  2
Bray                   2
Mayfair                2
Wien                   1
South San Francisco    1
Chicago                1
København              1
Oslo                   1
Stockholm              1
Taipei                 1
Washington, D.C.       1
Chelsea                1
Name: city, dtype: int64

## **Combining the Michelin Star Datasets**

In [8]:
# Tag each tier's frame with its Michelin star count so the tier survives
# once the three frames are combined
m1['stars'], m2['stars'], m3['stars'] = 1, 2, 3

In [9]:
# Stack the one-, two- and three-star frames vertically into a single frame,
# discarding the per-file indexes in favour of one fresh running index
all_stars_df = pd.concat(
    objs=[m1, m2, m3],
    axis=0,
    ignore_index=True,
)

In [10]:
# Check the combined frame's schema; it should now include the 'stars' column
all_stars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       695 non-null    object 
 1   year       695 non-null    int64  
 2   latitude   695 non-null    float64
 3   longitude  695 non-null    float64
 4   city       693 non-null    object 
 5   region     695 non-null    object 
 6   zipCode    501 non-null    object 
 7   cuisine    695 non-null    object 
 8   price      519 non-null    object 
 9   url        695 non-null    object 
 10  stars      695 non-null    int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 59.9+ KB


In [11]:
# Preview the first rows of the combined dataset
all_stars_df.head()

Unnamed: 0,name,year,latitude,longitude,city,region,zipCode,cuisine,price,url,stars
0,Kilian Stuba,2019,47.34858,10.17114,Kleinwalsertal,Austria,87568,Creative,$$$$$,https://guide.michelin.com/at/en/vorarlberg/kl...,1
1,Pfefferschiff,2019,47.83787,13.07917,Hallwang,Austria,5300,Classic cuisine,$$$$$,https://guide.michelin.com/at/en/salzburg-regi...,1
2,Esszimmer,2019,47.80685,13.03409,Salzburg,Austria,5020,Creative,$$$$$,https://guide.michelin.com/at/en/salzburg-regi...,1
3,Carpe Diem,2019,47.80001,13.04006,Salzburg,Austria,5020,Market cuisine,$$$$$,https://guide.michelin.com/at/en/salzburg-regi...,1
4,Edvard,2019,48.216503,16.36852,Wien,Austria,1010,Modern cuisine,$$$$,https://guide.michelin.com/at/en/vienna/wien/r...,1


In [12]:
# Count missing values per column of the combined dataset
all_stars_df.isna().sum()

name           0
year           0
latitude       0
longitude      0
city           2
region         0
zipCode      194
cuisine        0
price        176
url            0
stars          0
dtype: int64

## **Function dropping missing data**

In [13]:
# Function to drop columns with too much missing data to be relevant
def drop_columns(all_stars_df):
    """
    Drop the sparse 'zipCode' and 'price' columns from a Michelin star dataset.

    The input frame is never modified. Callers must keep the returned frame
    (e.g. ``trimmed = drop_columns(all_stars_df)``) for the drop to take effect.

    Args:
    - all_stars_df (pd.DataFrame): The dataset to remove the columns from.

    Returns:
    - pd.DataFrame: A new frame without 'zipCode' and 'price' when both columns
      are present; otherwise the input frame itself, unchanged.
    """
    # Only drop when both columns are present, so a repeated call on an
    # already-trimmed frame reports "not found" instead of raising
    if 'zipCode' in all_stars_df.columns and 'price' in all_stars_df.columns:
        # DataFrame.drop returns a new frame; hand it back to the caller
        # instead of discarding it in a local variable
        trimmed = all_stars_df.drop(columns=['zipCode', 'price'])
        print("Columns 'zipCode' and 'price' dropped successfully.")
        return trimmed

    print("Columns 'zipCode' and 'price' not found.")
    return all_stars_df

# Run the column drop against the combined Michelin dataset.
# NOTE(review): the result of this call is not assigned to anything, so
# all_stars_df itself still contains 'zipCode' and 'price' afterwards (both
# columns are still visible in the NYC printouts further down).
drop_columns(all_stars_df)

Columns 'zipCode' and 'price' dropped successfully.


## **Filtering the restaurants in NYC out**

In [14]:
# Keep only the rows whose city is exactly 'New York', then print the result
nyc_restaurants = all_stars_df.loc[all_stars_df['city'].eq('New York')]
print(nyc_restaurants)

                              name  year   latitude  longitude      city  \
204                      Del Posto  2019  40.743270  -74.00770  New York   
205      Le Grill de Joël Robuchon  2019  40.742897  -74.00770  New York   
206                       L'Appart  2019  40.711903  -74.01544  New York   
207                          Okuda  2019  40.743793  -74.00633  New York   
208                         Wallsé  2019  40.735380  -74.00814  New York   
..                             ...   ...        ...        ...       ...   
680                           Masa  2019  40.768550  -73.98335  New York   
681                         Per Se  2019  40.768280  -73.98292  New York   
682                   Le Bernardin  2019  40.761770  -73.98223  New York   
683            Eleven Madison Park  2019  40.741700  -73.98712  New York   
684  Chef's Table at Brooklyn Fare  2019  40.688720  -73.98581  New York   

            region zipCode       cuisine price  \
204  New York City   10011       Ital

In [15]:
# Select the Michelin star restaurants whose city value is 'New York'
msny = all_stars_df[all_stars_df['city'].isin(['New York'])]

In [16]:
# Print the filtered frame to check that only New York rows were kept
print(msny)

                              name  year   latitude  longitude      city  \
204                      Del Posto  2019  40.743270  -74.00770  New York   
205      Le Grill de Joël Robuchon  2019  40.742897  -74.00770  New York   
206                       L'Appart  2019  40.711903  -74.01544  New York   
207                          Okuda  2019  40.743793  -74.00633  New York   
208                         Wallsé  2019  40.735380  -74.00814  New York   
..                             ...   ...        ...        ...       ...   
680                           Masa  2019  40.768550  -73.98335  New York   
681                         Per Se  2019  40.768280  -73.98292  New York   
682                   Le Bernardin  2019  40.761770  -73.98223  New York   
683            Eleven Madison Park  2019  40.741700  -73.98712  New York   
684  Chef's Table at Brooklyn Fare  2019  40.688720  -73.98581  New York   

            region zipCode       cuisine price  \
204  New York City   10011       Ital

In [17]:
# Rebind nyc_restaurants to the msny frame; both names now refer to the same
# DataFrame object (plain assignment makes no copy)
nyc_restaurants = msny

In [18]:
# Save the filtered dataset to disk in CSV format, without the index column.
# NOTE(review): the target file name has no ".csv" extension — confirm that
# downstream readers expect exactly "nyc_michelin_star_restaurants".
nyc_restaurants.to_csv("nyc_michelin_star_restaurants", index=False)

## **Filtering out the names of Michelin Star Restaurants in the Inspections Data** 

In [19]:
# Function to retrieve the Restaurant names from the Michelin Star dataset to filter out of Inspections data
def compile_unique_restaurant_names(nyc_michelin_star_restaurants, name):
    """
    Compiles the unique restaurant names found in a given dataset.

    Args:
    - nyc_michelin_star_restaurants (pd.DataFrame): The dataset containing restaurant names.
    - name (str): The name of the column in the dataset that contains restaurant names.

    Returns:
    - Array of unique restaurant names, in order of first appearance
      (the result of ``Series.unique()``).
    """
    # Read from the frame that was passed in rather than from a notebook
    # global, so the function works for whichever dataset the caller supplies
    unique_names = nyc_michelin_star_restaurants[name].unique()

    return unique_names

In [20]:
# Collect the distinct restaurant names from the 'name' column of the NYC Michelin frame
names_list = compile_unique_restaurant_names(nyc_restaurants, 'name')

In [21]:
# Show the names of the Michelin star restaurants in NYC
print(names_list)

['Del Posto' 'Le Grill de Joël Robuchon' "L'Appart" 'Okuda' 'Wallsé'
 'Jeju Noodle Bar' 'Sushi Nakazawa' 'Kosaka' 'Bâtard' 'Hirohisa'
 'Blue Hill' "ZZ's Clam Bar" 'Babbo' 'Carbone' 'Le Coucou' 'Aldea' 'Cote'
 'Bouley at Home' 'Gotham Bar and Grill' 'Nix' 'Junoon' 'NoMad'
 'The Musket Room' 'Uncle Boons' 'Noda' 'Gramercy Tavern' 'The Clocktower'
 'Ai Fiori' 'Casa Mono' 'Jewel Bako' 'The River Café' 'Bar Uchū'
 'Café China' 'Kanoyama' 'Contra' 'Atomix' 'Kyo Ya' 'Kajitsu' 'Agern'
 'Caviar Russe' 'Tuome' 'Tempura Matsui' 'Sushi Yasuda' 'Sushi Amane'
 'Café Boulud' 'Claro' 'Satsuki' 'Sushi Noz' 'Sushi Inoue' 'Meadowsweet'
 'Casa Enríque' 'Peter Luger' 'Oxomoco' 'The Finch' 'Faro'
 "L'Atelier de Joël Robuchon" 'Jungsik' 'Atera' 'Jean-Georges' 'Marea'
 'Gabriel Kreuther' 'Ichimura at Uchū' 'Sushi Ginza Onodera' 'Ko'
 'The Modern' 'Aquavit' 'Daniel' 'Aska' 'Blanca' 'Masa' 'Per Se'
 'Le Bernardin' 'Eleven Madison Park' "Chef's Table at Brooklyn Fare"]


In [22]:
# Hard-coded list of the NYC Michelin star restaurant names printed above.
# Each name must be separated by a comma: Python joins adjacent string
# literals into one string, which would leave NL with a single giant element.
NL = ['Del Posto', 'Le Grill de Joël Robuchon', "L'Appart", 'Okuda', 'Wallsé',
      'Jeju Noodle Bar', 'Sushi Nakazawa', 'Kosaka', 'Bâtard', 'Hirohisa',
      'Blue Hill', "ZZ's Clam Bar", 'Babbo', 'Carbone', 'Le Coucou', 'Aldea', 'Cote',
      'Bouley at Home', 'Gotham Bar and Grill', 'Nix', 'Junoon', 'NoMad',
      'The Musket Room', 'Uncle Boons', 'Noda', 'Gramercy Tavern', 'The Clocktower',
      'Ai Fiori', 'Casa Mono', 'Jewel Bako', 'The River Café', 'Bar Uchū',
      'Café China', 'Kanoyama', 'Contra', 'Atomix', 'Kyo Ya', 'Kajitsu', 'Agern',
      'Caviar Russe', 'Tuome', 'Tempura Matsui', 'Sushi Yasuda', 'Sushi Amane',
      'Café Boulud', 'Claro', 'Satsuki', 'Sushi Noz', 'Sushi Inoue', 'Meadowsweet',
      'Casa Enríque', 'Peter Luger', 'Oxomoco', 'The Finch', 'Faro',
      "L'Atelier de Joël Robuchon", 'Jungsik', 'Atera', 'Jean-Georges', 'Marea',
      'Gabriel Kreuther', 'Ichimura at Uchū', 'Sushi Ginza Onodera', 'Ko',
      'The Modern', 'Aquavit', 'Daniel', 'Aska', 'Blanca', 'Masa', 'Per Se',
      'Le Bernardin', 'Eleven Madison Park', "Chef's Table at Brooklyn Fare"]

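The problem moving forward is that the restaurants appearing in the Michelin Star data do not appear in the inspections data, at least not under an exactly matching name: the inspection `DBA` values previewed below are written in upper case (e.g. `NAYA EXPRESS`), unlike the mixed-case Michelin names.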

## **Loading in Inspections Data**

In [23]:
# Load the NYC restaurant inspections data from the file named 'rdf'
# NOTE(review): the path has no extension — presumably a CSV export; confirm the source
rdf = pd.read_csv('rdf')

In [24]:
# Summarise the inspections frame: row count, columns, non-null counts and dtypes
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208342 entries, 0 to 208341
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  208342 non-null  int64  
 1   DBA                    207813 non-null  object 
 2   BORO                   208342 non-null  object 
 3   BUILDING               207986 non-null  object 
 4   STREET                 208334 non-null  object 
 5   ZIPCODE                205666 non-null  float64
 6   CUISINE DESCRIPTION    206020 non-null  object 
 7   INSPECTION DATE        208342 non-null  object 
 8   ACTION                 206020 non-null  object 
 9   VIOLATION CODE         204869 non-null  object 
 10  VIOLATION DESCRIPTION  204869 non-null  object 
 11  CRITICAL FLAG          208342 non-null  object 
 12  SCORE                  198599 non-null  float64
 13  GRADE                  102313 non-null  object 
 14  GRADE DATE             93639 non-nul

In [25]:
# Preview the first rows of the inspections data
rdf.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,INSPECTION TYPE,Latitude,Longitude,Council District
0,50116556,WENDY'S,Queens,1,MAIN TERMINAL,,,01/01/1900,,,,Not Applicable,,,,,0.0,0.0,
1,50138702,SMOKEHOUSE,Manhattan,50,HUDSON YARDS,10001.0,,01/01/1900,,,,Not Applicable,,,,,40.754243,-73.99939,3.0
2,50135964,,Queens,20806,CROSS ISLAND PKWY,11360.0,,01/01/1900,,,,Not Applicable,,,,,40.788802,-73.785417,19.0
3,50110045,SEAFOOD KITCHEN,Queens,9517,57TH AVE,11368.0,,01/01/1900,,,,Not Applicable,,,,,40.73671,-73.865596,25.0
4,50043973,NAYA EXPRESS,Manhattan,1,NEW YORK PLAZA,10004.0,Mediterranean,01/30/2020,Violations were cited in the following area(s).,10J,Hand wash sign not posted,Not Critical,4.0,A,01/30/2020,Cycle Inspection / Initial Inspection,40.702533,-74.012562,1.0


## **Combining Inspections data with Michelin Star data**

In [26]:
#fldf = rdf[rdf['DBA'] == 'Kajitsu']

#if not fldf.empty:
#    print(f"{'Kajitsu'} found in the {'DBA'} column.")
#else:
#    print(f"{'Kajitsu'} not found in the {'DBA'} column.")

In [27]:
#print(filtered_dataset_df)

In [29]:
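# Attempt to match the two datasets on coordinates (note: pd.merge joins on exact equality, so this is
# not a fuzzy match, and the inspections columns are named 'Latitude'/'Longitude', not lower-case)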
#merged_df = pd.merge(nyc_restaurants, rdf, how="outer", indicator=True, left_on=['longitude', 'latitude'], right_on=['longitude', 'latitude'])

# Filter for rows with a match in both datasets
#merged_df = merged_df[merged_df['_merge'] == 'both']

In [31]:
#merged_df.info()

In [None]:
# Optionally, save the merged dataset to a new CSV file
#merged_df.to_csv("merged_michelin_inspections.csv", index=False)

In [32]:
#merged_df.info()

## **Geospatial Visualizations**

In [33]:
# Centre the base map on the average coordinate of every restaurant in the frame.
# NOTE(review): all_stars_df covers restaurants in many countries, so at zoom
# level 12 the average point may not be near any marker — confirm this is intended.
mean_lat = all_stars_df['latitude'].mean()
mean_lon = all_stars_df['longitude'].mean()
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=12)

# Place one pin per restaurant; the popup carries the restaurant's name
for index, row in all_stars_df.iterrows():
    pin = folium.Marker(location=[row['latitude'], row['longitude']], popup=row['name'])
    pin.add_to(m)

# Keep the map as the cell's final expression so the notebook renders it
m