In [1]:
import numpy as np
import pandas as pd

In [2]:
# Open the workbook once; its sheet names are listed and one sheet is parsed
# from this handle in the following cells.
xl = pd.ExcelFile('./data/shake_shack.xlsx')

In [3]:
print(xl.sheet_names)

['Shake Shack', 'In-N-Out', 'States']


In [5]:
# Load the 'Shake Shack' sheet into the DataFrame used by the rest of the notebook.
df = xl.parse('Shake Shack')

In [6]:
df.head()

Unnamed: 0,Address,City_State,Zip_Code,Reviews,Rating_Mean,First_Review_Date,Present_Date,Months_Open,Years_Open,Reviews_Per_Month
0,"6201 Hollywood Blvd, Ste 104","Hollywood, CA",90028,431,3.5,2016-10-12,2018-05-10,19.166667,1.597222,22.486957
1,8520 Santa Monica Blvd,"West Hollywood, CA",90069,1564,3.5,2016-03-15,2018-05-10,26.2,2.183333,59.694656
2,400 W 8th St,"Los Angeles, CA",90014,81,3.5,2018-03-07,2018-05-10,2.133333,0.177778,37.96875
3,10250 Santa Monica Blvd,"Los Angeles, CA",90067,320,3.5,2017-01-21,2018-05-10,15.8,1.316667,20.253165
4,252 S Brand Blvd,"Glendale, CA",91204,974,3.5,2016-09-22,2018-05-10,19.833333,1.652778,49.109244


In [7]:
#Note: Website says 106 locations, but this sheet only has 105 rows (see df.shape output)
df.shape

(105, 10)

In [8]:
df.columns

Index(['Address', 'City_State', 'Zip_Code', 'Reviews', 'Rating_Mean',
       'First_Review_Date', 'Present_Date', 'Months_Open', 'Years_Open',
       'Reviews_Per_Month'],
      dtype='object')

### - Extract City and State

In [9]:
def get_city(x):
    """Return the portion of ``x`` that precedes its first comma.

    For a 'City, ST' string this is the city name. If ``x`` contains no
    comma, the whole string is returned unchanged.
    """
    city, _sep, _rest = x.partition(',')
    return city

# Derive a 'City' column from the combined 'City_State' column, row by row.
df['City'] = df['City_State'].apply(get_city)

In [10]:
def get_state(x):
    """Return the state part of a 'City, ST' string.

    The state is the text after the *last* comma, with surrounding
    whitespace removed. Using the last field (rather than the second one)
    keeps the result correct when the city part itself contains a comma,
    e.g. 'Washington, D.C., DC' -> 'DC'.

    Parameters
    ----------
    x : str
        Combined city/state text.

    Returns
    -------
    str
        The trimmed text after the last comma; if ``x`` has no comma, the
        whole trimmed string is returned instead of raising ``IndexError``.
    """
    return x.split(',')[-1].strip()

# Populate 'State' with the trimmed state part of each 'City_State' value.
df['State'] = df['City_State'].apply(
    lambda city_state: get_state(city_state).strip()
)

### - Create Years_Open

In [11]:
# Recompute 'Years_Open' from 'Months_Open' (12 months per year); this
# overwrites the 'Years_Open' column that was loaded from the sheet.
df['Years_Open'] = df['Months_Open'].div(12)

In [12]:
df.head()

Unnamed: 0,Address,City_State,Zip_Code,Reviews,Rating_Mean,First_Review_Date,Present_Date,Months_Open,Years_Open,Reviews_Per_Month,City,State
0,"6201 Hollywood Blvd, Ste 104","Hollywood, CA",90028,431,3.5,2016-10-12,2018-05-10,19.166667,1.597222,22.486957,Hollywood,CA
1,8520 Santa Monica Blvd,"West Hollywood, CA",90069,1564,3.5,2016-03-15,2018-05-10,26.2,2.183333,59.694656,West Hollywood,CA
2,400 W 8th St,"Los Angeles, CA",90014,81,3.5,2018-03-07,2018-05-10,2.133333,0.177778,37.96875,Los Angeles,CA
3,10250 Santa Monica Blvd,"Los Angeles, CA",90067,320,3.5,2017-01-21,2018-05-10,15.8,1.316667,20.253165,Los Angeles,CA
4,252 S Brand Blvd,"Glendale, CA",91204,974,3.5,2016-09-22,2018-05-10,19.833333,1.652778,49.109244,Glendale,CA


In [13]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [13]:
#plt.hist(df['Years_Open'], bins = 30, facecolor = 'g')

## a. Number of Years Restaurant Was Open

In [14]:
#plt.figure(figsize=(12,6))
#cm = plt.cm.get_cmap('RdYlBu_r')

# Get the histogram
#Y,X = np.histogram(df['Years_Open'], 20, normed=1)
#x_span = X.max()-X.min()
#C = [cm(((x-X.min())/x_span)) for x in X]

#plt.bar(X[:-1],Y,color=C,width=X[1]-X[0])
#plt.show()

In [30]:
import plotly

plotly.offline.init_notebook_mode(connected=True)

In [None]:
from plotly.graph_objs import Figure, Histogram, Layout

# Range of the data; the bin width is one fifteenth of that range.
min_ = df['Years_Open'].min()
max_ = df['Years_Open'].max()
bin_width = (max_ - min_) / 15

years_open_trace = Histogram(
    x=df['Years_Open'],
    xbins={'start': min_, 'end': max_, 'size': bin_width},
)

# `fig` is rendered by the next cell.
data = [years_open_trace]
layout = Layout(title="Histogram of Years Open", bargap=0.2)
fig = Figure(data=data, layout=layout)

In [31]:
plotly.offline.iplot(fig, show_link=False, image_width=600, image_height=400)

#### - This graph shows that there are many relatively new restaurants, as well as some that have been open for 5 or more years.

## b. Table for Sorting

In [25]:
# Ten rows with the largest 'Years_Open', in descending order.
(
    df[['Address', 'City_State', 'Reviews', 'Rating_Mean', 'Years_Open']]
    .sort_values(by=['Years_Open'], ascending=False)
    .head(10)
)

Unnamed: 0,Address,City_State,Reviews,Rating_Mean,Years_Open
80,E 23rd St & Madison Ave,"New York, NY",5360,4.0,12.988889
86,366 Columbus Ave,"New York, NY",2228,4.0,9.686111
91,12301 Roosevelt Ave,"Queens, NY",154,3.5,9.230556
87,154 E 86th St,"New York, NY",1260,4.0,8.452778
45,1111 Lincoln Rd,"Miami Beach, FL",820,4.0,8.225
85,691 8th Ave,"New York, NY",2835,4.0,7.947222
97,267 Union Ave,"Saratoga Springs, NY",18,4.0,7.875
50,1216 18th St NW,"Washington, DC",1289,3.5,7.083333
77,215 Murray St,"New York, NY",848,4.0,7.041667
55,1500 S Capitol St SE,"Washington, DC",82,3.0,6.997222


In [16]:
def get_state(x):
    """Return the text after the last comma in ``x``, stripped of whitespace.

    If ``x`` contains no comma, the whole string (stripped) is returned.
    """
    _head, _sep, tail = x.rpartition(',')
    return tail.strip()

# Rebuild the 'State' column from 'City_State' using get_state.
df['State'] = df['City_State'].apply(get_state)

In [18]:
#df_NY = df[df['State'] == 'NY']
#df_NY[['Address', 'City_State', 'Reviews','Rating_Mean', 'Reviews_Per_Month',\
#    'Years_Open', 'Months_Open', 'State']].sort_values(['Reviews'], ascending=False)

In [19]:
#df_NY.count()

In [18]:
# Group the two relevant columns by state; counted in the next cell.
group = df.loc[:, ['State', 'City_State']].groupby(by=['State'])

In [19]:
# Row count per state, largest first; show the top 15.
(
    group.count()
    .sort_values(by='City_State', ascending=False)
    .head(15)
)

Unnamed: 0_level_0,City_State
State,Unnamed: 1_level_1
NY,26
CA,9
TX,9
FL,8
DC,7
PA,5
NV,5
NJ,5
MD,4
IL,4


## c. Average Rating vs Years Open

In [None]:
from plotly.graph_objs import Scatter, Layout, Figure

# One marker per row: years open on the x-axis, mean rating on the y-axis.
rating_trace = Scatter(
    x=df['Years_Open'],
    y=df['Rating_Mean'],
    mode='markers',
)

# `fig` is rendered by the next cell.
data = [rating_trace]
layout = Layout(title="Average Rating vs Years Open")
fig = Figure(data=data, layout=layout)

In [32]:
plotly.offline.iplot(fig, show_link=False)

#### - This graph shows that some ratings have not improved even after the restaurant has been open for years.

In [19]:
#df.plot(kind='scatter',x='Months_Open', y='Reviews_Per_Month')

#from plotly.graph_objs import Scatter, Layout, Figure

#data = [Scatter(x=df['Years_Open'], y=df['Reviews_Per_Month'], mode = 'markers')]#, text=df['movie_title'])]
#layout = Layout(title="Reviews Per Month vs Years Open")

#fig = Figure(data=data, layout=layout)

#plotly.offline.iplot(fig, show_link=False)

In [27]:
# NOTE(review): `py` is not used in the visible cells, and plotly 4+ moved
# this module to `chart_studio.plotly` — confirm whether the import is needed.
import plotly.plotly as py
import pandas as pd

# Build one marker trace per rating value so each rating gets its own
# legend entry; the rows for a rating are selected once and reused for x and y.
traces = []
for rating in [3, 3.5, 4, 4.5]:
    rated = df[df['Rating_Mean'] == rating]
    traces.append({
        'x': rated['Years_Open'],
        'y': rated['Reviews_Per_Month'],
        'name': rating,
        'mode': 'markers',
    })

fig = Figure({
    'data': traces,
    'layout': {
        'xaxis': {'title': 'Years Open', 'type': 'linear'},
        'yaxis': {'title': "Reviews Per Month"},
    },
})

plotly.offline.iplot(fig, show_link=False)

#### - This graph shows that the rate of reviews per month is high for some points early on. Most restaurants have fewer than 20-30 reviews per month.

In [21]:
df['Rating_Mean'].unique()

array([3.5, 3. , 4. , 4.5])

In [22]:
# Natural log of the review count, used as the y-axis of the next figure.
# np.log is applied to the whole column at once instead of calling it per
# element through apply(); the resulting values are the same.
df['log_Reviews'] = np.log(df['Reviews'])

In [None]:
# NOTE(review): `py` is not used in the visible cells, and plotly 4+ moved
# this module to `chart_studio.plotly` — confirm whether the import is needed.
import plotly.plotly as py
import pandas as pd

# One marker trace per rating value, plotting log review count against
# years open; `fig` is rendered by the next cell.
log_traces = []
for rating in [3, 3.5, 4, 4.5]:
    rows_for_rating = df[df['Rating_Mean'] == rating]
    log_traces.append({
        'x': rows_for_rating['Years_Open'],
        'y': rows_for_rating['log_Reviews'],
        'name': rating,
        'mode': 'markers',
    })

fig = Figure({
    'data': log_traces,
    'layout': {
        'xaxis': {'title': 'Years Open', 'type': 'linear'},
        'yaxis': {'title': "log Number of Reviews"},
    },
})


In [33]:
plotly.offline.iplot(fig, show_link=False)