##### Library Import

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Point the working directory at the project folder.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
PROJECT_DIR = 'C:/Users/Jeric/OneDrive/Desktop/Personal Projects/NYC_restaurants'
os.chdir(PROJECT_DIR)

In [3]:
# Load the food-order data set into a DataFrame.
# The working directory has already been switched to the project folder by the
# os.chdir call in the preceding cell, so a relative filename resolves to the
# same file without repeating the machine-specific absolute path.
nyc = pd.read_csv("food_order.csv")

# Preliminary look at data

In [4]:
# Preview the first five rows (5 is also the default for head()).
nyc.head(5)

Unnamed: 0,order_id,customer_id,restaurant_name,cuisine_type,cost_of_the_order,day_of_the_week,rating,food_preparation_time,delivery_time
0,1477147,337525,Hangawi,Korean,30.75,Weekend,Not given,25,20
1,1477685,358141,Blue Ribbon Sushi Izakaya,Japanese,12.08,Weekend,Not given,25,23
2,1477070,66393,Cafe Habana,Mexican,12.23,Weekday,5,23,28
3,1477334,106968,Blue Ribbon Fried Chicken,American,29.2,Weekend,3,25,15
4,1478249,76942,Dirty Bird to Go,American,11.59,Weekday,4,25,24


In [5]:
# Data dimensions: DataFrame.shape is (number of rows, number of columns).
row, column = nyc.shape
print("Number of columns: ",column,",and number of rows: ",row)


Number of columns:  9 ,and number of rows:  1898


In [6]:
# Inspect the dtype pandas inferred for each column.
nyc.dtypes

order_id                   int64
customer_id                int64
restaurant_name           object
cuisine_type              object
cost_of_the_order        float64
day_of_the_week           object
rating                    object
food_preparation_time      int64
delivery_time              int64
dtype: object

Categorical variables: restaurant name, cuisine type, day of the week, rating (ordinal)

Numerical variables: cost of the order, food preparation time, delivery time

In [18]:
# Count missing (NaN) entries in each column.
# Note: unrated orders are stored as the string "Not given" in `rating`,
# so they are not counted as missing by this check.
nyc.isna().sum()
# For the share of missing values per column instead, use:
# nyc.isnull().sum().sort_values(ascending = False)/len(nyc)

order_id                 0.0
customer_id              0.0
restaurant_name          0.0
cuisine_type             0.0
cost_of_the_order        0.0
day_of_the_week          0.0
rating                   0.0
food_preparation_time    0.0
delivery_time            0.0
dtype: float64

In [15]:
# Number of orders for each rating value, most frequent first.
rating_distribution = (
    nyc.groupby("rating")
    .size()
    .sort_values(ascending=False)
)
print(rating_distribution)

rating
Not given    736
5            588
4            386
3            188
dtype: int64


There are 736 orders without a rating, which is approximately 38.78% of the data set. That is too large a share to ignore or simply delete.

### Why the empty ratings?
A quote from Himanshu Khanna, CEO of Sparklin, states that "Personality, mood, environment, urgency of the requirement, and eventual gratification all weigh in on how a user rates something". Strong feelings are usually what spur customers to give feedback; a customer who had a standard experience may not necessarily find leaving a review worth the effort.

### Solution?
One possible assumption is that the missing ratings are missing at random (MAR), meaning that whether a customer leaves a rating may be related to other variables within the data. A possible solution would then be either to analyze only the orders that have ratings or to impute the missing ratings. In this notebook, the missing ratings will be imputed, and the imputed data will then be compared with the unmodified data through visualization.

### Imputation

In [8]:
# Summary statistics, rounded to two decimals, with the identifier columns
# (order_id, customer_id) left out.
print(
    nyc.drop(columns=['order_id','customer_id'], errors="ignore")
    .describe()
    .round(2)
)

       cost_of_the_order  food_preparation_time  delivery_time
count            1898.00                1898.00        1898.00
mean               16.50                  27.37          24.16
std                 7.48                   4.63           4.97
min                 4.47                  20.00          15.00
25%                12.08                  23.00          20.00
50%                14.14                  27.00          25.00
75%                22.30                  31.00          28.00
max                35.41                  35.00          33.00


Table of cuisine types

In [9]:
# Cap the number of rows pandas displays at 20 for readability.
pd.set_option("display.max_rows", 20)

In [13]:
# Order counts for each (cuisine, restaurant) pair, largest first;
# only the ten largest are printed.
restaurant_orders = (
    nyc.groupby(["cuisine_type","restaurant_name"])
    .size()
    .sort_values(ascending=False)
)
print(restaurant_orders.head(10))

cuisine_type  restaurant_name              
American      Shake Shack                      219
Japanese      Blue Ribbon Sushi                119
Italian       The Meatball Shop                112
American      Blue Ribbon Fried Chicken         96
Italian       Parm                              68
Chinese       RedFarm Broadway                  59
              RedFarm Hudson                    55
Japanese      TAO                               49
Chinese       Han Dynasty                       46
Japanese      Blue Ribbon Sushi Bar & Grill     44
dtype: int64


In [11]:
# Mean order cost per cuisine type, highest first.
avgprice_cuisine = (
    nyc.groupby("cuisine_type")["cost_of_the_order"]
    .mean()
    .sort_values(ascending=False)
)
display(avgprice_cuisine)

cuisine_type
French            19.793889
Southern          19.300588
Thai              19.207895
Spanish           18.994167
Middle Eastern    18.820612
Mexican           16.933117
Indian            16.919726
Italian           16.418691
American          16.319829
Chinese           16.305209
Japanese          16.304532
Mediterranean     15.474783
Korean            14.001538
Vietnamese        12.882857
Name: cost_of_the_order, dtype: float64

In [12]:
# Mean food preparation time per cuisine type, longest first.
avgprep_cuisine = (
    nyc.groupby("cuisine_type")["food_preparation_time"]
    .mean()
    .sort_values(ascending=False)
)
display(avgprep_cuisine)

cuisine_type
Southern          27.588235
Chinese           27.511628
Japanese          27.510638
Italian           27.483221
American          27.440068
Thai              27.315789
Indian            27.109589
Mediterranean     27.000000
Spanish           26.916667
French            26.888889
Mexican           26.727273
Middle Eastern    26.673469
Vietnamese        25.714286
Korean            25.461538
Name: food_preparation_time, dtype: float64