### In this notebook, we process the data variables for the Texas outlets of both Shake Shack and In-N-Out

### This notebook creates the dataset `Texas`

In [1]:
import numpy as np
import pandas as pd
import plotly

# Put plotly into offline notebook mode so figures can render inline; per the plotly
# documentation, connected=True loads plotly.js from a CDN instead of embedding it.
plotly.offline.init_notebook_mode(connected=True)

# I. Collect Data Frames

### retrieve Shake Shack data and create two new columns

In [2]:
# Retrieve Shake Shack data: one CSV per outlet, read in order.
# header=None because the files are read as having no header row.
df_ss_plano, df_ss_dallas_north, df_ss_dallas_uptown = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        './data/ss_plano.csv',
        './data/ss_dallas_north.csv',
        './data/ss_dallas_uptown.csv',
    )
)

In [3]:
# Tag every row of the Plano Shake Shack frame; a scalar is broadcast to all rows.
df_ss_plano['restaurant'] = 'Shake Shack'
df_ss_plano['outlet'] = 'Plano'

In [4]:
# Tag every row of the Dallas North Shake Shack frame; a scalar is broadcast to all rows.
df_ss_dallas_north['restaurant'] = 'Shake Shack'
df_ss_dallas_north['outlet'] = 'Dallas North'

In [5]:
# Tag every row of the Dallas Uptown Shake Shack frame; a scalar is broadcast to all rows.
df_ss_dallas_uptown['restaurant'] = 'Shake Shack'
df_ss_dallas_uptown['outlet'] = 'Dallas Uptown'

### retrieve In-N-Out Data and create two new columns

In [6]:
# Retrieve In-N-Out data: one CSV per outlet, read in order.
# header=None because the files are read as having no header row.
df_ino_plano, df_ino_dallas_north, df_ino_dallas_ug = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        './data/ino_plano.csv',
        './data/ino_dallas_north.csv',
        './data/ino_dallas_ug.csv',
    )
)

In [7]:
# Tag every row of the Plano In-N-Out frame; a scalar is broadcast to all rows.
df_ino_plano['restaurant'] = 'In-N-Out'
df_ino_plano['outlet'] = 'Plano'

In [8]:
# Tag every row of the Dallas North In-N-Out frame; a scalar is broadcast to all rows.
df_ino_dallas_north['restaurant'] = 'In-N-Out'
df_ino_dallas_north['outlet'] = 'Dallas North'

In [9]:
# Tag every row of the Dallas UG In-N-Out frame; a scalar is broadcast to all rows.
df_ino_dallas_ug['restaurant'] = 'In-N-Out'
df_ino_dallas_ug['outlet'] = 'Dallas UG'

### collect all data frames into a list and create one data frame

In [10]:
# Stack the six per-outlet frames (Shake Shack first, then In-N-Out) into one frame.
# pd.concat is called with its defaults, so each frame's own row labels are carried over.
frames = [
    df_ss_plano,
    df_ss_dallas_north,
    df_ss_dallas_uptown,
    df_ino_plano,
    df_ino_dallas_north,
    df_ino_dallas_ug,
]
df_orig = pd.concat(frames)

In [11]:
# Name the twelve CSV fields followed by the two label columns added to each frame.
df_orig.columns = [
    'name', 'location', 'friends', 'reviews', 'photos', 'elite',
    'review_date', 'rating', 'check_in', 'content', 'useful', 'date_reply',
    'restaurant', 'outlet',
]

### select relevant columns for now

In [12]:
# Keep only the fields processed below; .copy() gives df its own data, separate from df_orig.
relevant_columns = [
    'name', 'location', 'review_date', 'rating',
    'content', 'date_reply', 'restaurant', 'outlet',
]
df = df_orig[relevant_columns].copy()

In [13]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,name,location,review_date,rating,content,date_reply,restaurant,outlet
0,Nancy T.,"Garland, TX",4/21/2018,4.0,It's a good quality burger and that's why it's...,,Shake Shack,Plano
1,De Amber P.,"Allen, TX",4/16/2018,5.0,Kind of upset it took me this long to find thi...,,Shake Shack,Plano
2,Jodi F.,"Katy, TX",4/15/2018,4.0,Had the smokehouse stack (with red peppers)- i...,,Shake Shack,Plano
3,Shalise G.,"Addison, Dallas, TX",1/21/2018,4.0,I've heard a lot of great things about this pl...,,Shake Shack,Plano
4,LG N.,"Prosper, TX",1/7/2018,4.0,1-18Tried this place out over the weekend whil...,,Shake Shack,Plano


In [14]:
# (row count, column count) of the combined frame.
df.shape

(1416, 8)

# II. Process Variables

### get reviewer's name

In [15]:
def get_name(x):
    """Drop the trailing token (e.g. the last-name initial 'T.') from a reviewer name.

    The value is split on single spaces, the final piece is discarded and the
    remaining pieces are concatenated with no separator, so 'De Amber P.'
    becomes 'DeAmber' and a single-token name becomes ''.

    Parameters
    ----------
    x : str or any
        Reviewer name. Values that cannot be split as a string (e.g. NaN or
        None for a missing name) are returned unchanged.

    Returns
    -------
    str or the original non-string value.
    """
    try:
        parts = x.split(' ')
    except (AttributeError, TypeError):
        # Only the "not a string" case is tolerated; a bare except here would
        # also hide unrelated errors such as KeyboardInterrupt.
        return x
    return ''.join(parts[:-1])

# Overwrite `name` with the cleaned value, applied element-wise.
df['name'] = df['name'].apply(get_name)

In [16]:
# Check that the trailing token has been removed from `name`.
df.head()

Unnamed: 0,name,location,review_date,rating,content,date_reply,restaurant,outlet
0,Nancy,"Garland, TX",4/21/2018,4.0,It's a good quality burger and that's why it's...,,Shake Shack,Plano
1,DeAmber,"Allen, TX",4/16/2018,5.0,Kind of upset it took me this long to find thi...,,Shake Shack,Plano
2,Jodi,"Katy, TX",4/15/2018,4.0,Had the smokehouse stack (with red peppers)- i...,,Shake Shack,Plano
3,Shalise,"Addison, Dallas, TX",1/21/2018,4.0,I've heard a lot of great things about this pl...,,Shake Shack,Plano
4,LG,"Prosper, TX",1/7/2018,4.0,1-18Tried this place out over the weekend whil...,,Shake Shack,Plano


### get state

In [17]:
# Location
def get_origin(x):
    """Return the text after the last comma in a location string, stripped of whitespace.

    When the string contains no comma, the whole string (stripped) is returned.
    """
    _, _, tail = x.rpartition(',')
    return tail.strip()

# New column `state`: the last comma-separated part of each location.
df['state'] = df['location'].apply(get_origin)

In [18]:
# List the distinct values extracted into `state`.
df['state'].unique()

array(['TX', 'CA', 'OK', 'UT', 'VA', 'NC', 'DC', 'FL', 'NY', 'AZ', 'OR',
       'MA', 'AR', 'TN', 'KY', 'SC', 'NE', 'MO', 'WA', 'OH', 'NV', 'AL',
       'GA', 'WI', 'ME', 'United Kingdom', 'HI', 'MS', 'PA', 'NH', 'NJ',
       'IN', 'KS', 'MI', 'IL', 'CO', 'LA', 'Japan', 'Italy',
       'Republic of Ireland', 'IA', 'MD', 'NM', 'CT', 'Canada'],
      dtype=object)

### create locality variable

In [19]:
def get_locality(x):
    """Return 'Local' when the state value is exactly 'TX', otherwise 'Non-local'."""
    return 'Local' if x == 'TX' else 'Non-local'

# New column `locality`: 'Local' for TX reviewers, 'Non-local' for everyone else.
df['locality'] = df['state'].apply(get_locality)

In [20]:
# Spot-check the locality label against the raw location text.
df[['location', 'locality']].head()

Unnamed: 0,location,locality
0,"Garland, TX",Local
1,"Allen, TX",Local
2,"Katy, TX",Local
3,"Addison, Dallas, TX",Local
4,"Prosper, TX",Local


In [21]:
# Number of rows with a non-null location in each locality group.
df.groupby('locality')[['location']].count()

Unnamed: 0_level_0,location
locality,Unnamed: 1_level_1
Local,1120
Non-local,296


In [22]:
# Same listing as the earlier `state` check; no cell in between modifies `state`.
df['state'].unique()

array(['TX', 'CA', 'OK', 'UT', 'VA', 'NC', 'DC', 'FL', 'NY', 'AZ', 'OR',
       'MA', 'AR', 'TN', 'KY', 'SC', 'NE', 'MO', 'WA', 'OH', 'NV', 'AL',
       'GA', 'WI', 'ME', 'United Kingdom', 'HI', 'MS', 'PA', 'NH', 'NJ',
       'IN', 'KS', 'MI', 'IL', 'CO', 'LA', 'Japan', 'Italy',
       'Republic of Ireland', 'IA', 'MD', 'NM', 'CT', 'Canada'],
      dtype=object)

### create new column date as review date in date format

In [23]:
def transform(x):
    """Rewrite an 'M/D/YYYY' date string as 'YYYY-MM-DD'.

    Only the text before the first newline in `x` is used. Single-character
    day and month parts are left-padded with '0'.
    """
    first_line = x.split('\n')[0]
    # Reversed slash-separated parts: index 0 = year, 1 = day, 2 = month.
    parts = first_line.split('/')[::-1]
    for idx in (1, 2):
        if len(parts[idx]) == 1:
            parts[idx] = '0' + parts[idx]
    return '-'.join((parts[0], parts[2], parts[1]))

# New column `date`: the review date rewritten as a 'YYYY-MM-DD' string.
df['date'] = df['review_date'].apply(transform)

In [24]:
# Parse the 'YYYY-MM-DD' strings into pandas datetimes and store them back in `date`.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [25]:
# Confirm the new `state`, `locality` and `date` columns.
df.head()

Unnamed: 0,name,location,review_date,rating,content,date_reply,restaurant,outlet,state,locality,date
0,Nancy,"Garland, TX",4/21/2018,4.0,It's a good quality burger and that's why it's...,,Shake Shack,Plano,TX,Local,2018-04-21
1,DeAmber,"Allen, TX",4/16/2018,5.0,Kind of upset it took me this long to find thi...,,Shake Shack,Plano,TX,Local,2018-04-16
2,Jodi,"Katy, TX",4/15/2018,4.0,Had the smokehouse stack (with red peppers)- i...,,Shake Shack,Plano,TX,Local,2018-04-15
3,Shalise,"Addison, Dallas, TX",1/21/2018,4.0,I've heard a lot of great things about this pl...,,Shake Shack,Plano,TX,Local,2018-01-21
4,LG,"Prosper, TX",1/7/2018,4.0,1-18Tried this place out over the weekend whil...,,Shake Shack,Plano,TX,Local,2018-01-07


### extract year, month, and day as new columns (helps in sorting!)

In [26]:
# Year
def get_year(x):
    """Return the `year` attribute of a date-like value as a plain int."""
    year = x.year
    return int(year)

# New column `year`, extracted from each parsed date.
df['year'] = df['date'].apply(get_year)

In [27]:
# Spot-check `year` against `date`.
df[['date','year']].head()

Unnamed: 0,date,year
0,2018-04-21,2018
1,2018-04-16,2018
2,2018-04-15,2018
3,2018-01-21,2018
4,2018-01-07,2018


In [28]:
# Month
def get_month(x):
    """Return the `month` attribute of a date-like value as a plain int."""
    month = x.month
    return int(month)

# New column `month`, extracted from each parsed date.
df['month'] = df['date'].apply(get_month)

In [29]:
# Spot-check `month` against `date`.
df[['date','month']].head()

Unnamed: 0,date,month
0,2018-04-21,4
1,2018-04-16,4
2,2018-04-15,4
3,2018-01-21,1
4,2018-01-07,1


In [30]:
# Day
def get_day(x):
    """Return the `day` attribute of a date-like value as a plain int."""
    day = x.day
    return int(day)

# New column `day` (day of month), extracted from each parsed date.
df['day'] = df['date'].apply(get_day)

In [31]:
# Spot-check `day` against `date`.
df[['date','day']].head()

Unnamed: 0,date,day
0,2018-04-21,21
1,2018-04-16,16
2,2018-04-15,15
3,2018-01-21,21
4,2018-01-07,7


# III. Save Dataset

In [32]:
# Persist the processed frame as a pickle file named 'Texas'.
# The directory is spelled './data' (lower-case) to match the paths the CSVs are read
# from; './Data' resolves to the same folder only on case-insensitive filesystems.
# NOTE(review): any notebook that loads './Data/Texas' should use the same spelling.
df.to_pickle('./data/Texas')