In [1]:
def fetch_restaurant(restaurant_name, location_name):
    """
    Search Yelp for a restaurant by name and location, pick the closest fuzzy
    match on the first result page, and scrape its attributes and reviews.

    Parameters:
    -----------

    TEMPLATE
    (restaurant_name = str, location_name = str) -> attributes = pd.Series, reviewer = pd.DataFrame, fetched_restaurant = str, similarity_index = int, error_code = int / str

    INPUT
    restaurant_name: search name; multiple words are separated by spaces
    location_name: in the order Building, Street, Borough, separated by spaces

    OUTPUT
    attributes: attributes of the restaurant (mostly yes/no style values) plus
                price_range, health_score, overall_rating and overall_reviews;
                entries that could not be scraped are NaN
    reviewer: one row per English review: rating, text, date and reviewer info
    fetched_restaurant: name of the fetched restaurant, possibly different from the search name
    similarity_index: similarity between fetched name and search name (fuzz.ratio, 0-100)
    error_code: '' if normal / int if no data is returned / non-empty str if some reviews were skipped
    ''  normal
    -1  failed to connect to the search page
    -2  no candidate restaurants found after all search attempts
    -3  failed to connect to the restaurant page
    str one '<offset>/' token per review that could not be parsed, where <offset>
        is the start offset of the page it was on. '0/0/20/' means 2 erroneous
        reviews in the first batch of 20 reviews and 1 in the second.

    When error_code is an int the other four elements of the tuple are None.
    """

    from urllib.parse import urlencode
    import time

    from fuzzywuzzy import fuzz
    import numpy as np
    import pandas as pd

    # Normalise once (loop-invariant). This also fails fast, before any network
    # traffic, if the name is not a string (e.g. NaN from a DataFrame).
    target_name = restaurant_name.strip().lower()

    # urlencode escapes characters such as '&' and '/' that would otherwise
    # corrupt the query string (e.g. "COCO & CRU/SWEETWATER SOCIAL").
    search_link = _YELP_BASE_URL + '/search?' + urlencode({'find_desc': restaurant_name,
                                                           'find_loc': location_name,
                                                           'ns': 1})

    # Search page: retry while the result list comes back empty.
    fetched_restaurants = []
    for _ in range(_MAX_SEARCH_ATTEMPTS):
        time.sleep(np.random.uniform(1, 3))  # polite random delay between requests
        try:
            search_content = _fetch_soup(search_link)
        except Exception:
            return (None, None, None, None, -1)
        fetched_restaurants = search_content.find_all('a', {'class': _SEARCH_LINK_CLASS})
        if not fetched_restaurants:
            # Fall back to the markup of the older page layout.
            fetched_restaurants = search_content.find_all('a', {'class': _SEARCH_LINK_CLASS_LEGACY})
        if fetched_restaurants:
            break
    else:
        # Every attempt returned an empty candidate list.
        return (None, None, None, None, -2)

    # Choose the best-matching restaurant on the first search page.
    scores = [fuzz.ratio(target_name, link.get_text().strip().lower()) for link in fetched_restaurants]
    best = int(np.argmax(scores))  # first maximum wins on ties
    best_link = fetched_restaurants[best]
    fetched_restaurant = best_link.get_text().strip()
    similarity_index = int(scores[best])

    # Restaurant page (first batch of reviews).
    n_reviews = 0
    try:
        restaurant_url = _YELP_BASE_URL + best_link.attrs['href'].split('?')[0]
        restaurant_content = _fetch_soup(restaurant_url + '?start=' + str(n_reviews))
    except Exception:
        return (None, None, None, None, -3)

    attributes = _parse_attributes(restaurant_content)

    # Reviews: walk the pages until one comes back without reviews.
    records = []
    error_code = ''
    while n_reviews < _MAX_REVIEWS:
        reviews = restaurant_content.find_all('div', {'class': 'review review--with-sidebar'})
        if not reviews:
            break
        for review in reviews:
            # Errors are handled per review so that one malformed review is
            # recorded and skipped; the page offset always advances afterwards,
            # which guarantees the loop terminates.
            try:
                record = _parse_review(review)
            except Exception:
                error_code += str(n_reviews) + '/'
                continue
            if record is not None:
                records.append(record)
        n_reviews += _REVIEWS_PER_PAGE
        time.sleep(np.random.uniform(1, 3))
        try:
            restaurant_content = _fetch_soup(restaurant_url + '?start=' + str(n_reviews))
        except Exception:
            # Documented contract: an int error code means no data is returned.
            return (None, None, None, None, -3)

    # Build the frame once instead of appending row by row (DataFrame.append
    # was removed in pandas 2.0 and was quadratic anyway).
    reviewer = pd.DataFrame(records, columns=_REVIEW_COLUMNS)

    return (attributes, reviewer, fetched_restaurant, similarity_index, error_code)


_YELP_BASE_URL = 'https://www.yelp.com'
_REQUEST_TIMEOUT_SECONDS = 30
_MAX_SEARCH_ATTEMPTS = 3
_MAX_REVIEWS = 20000
_REVIEWS_PER_PAGE = 20

_SEARCH_LINK_CLASS = "lemon--a__373c0__1_OnJ link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5"
_SEARCH_LINK_CLASS_LEGACY = "biz-name js-analytics-click"

# Attribute labels looked up in the restaurant page's 'short-def-list' block.
_ATTRIBUTE_NAMES = ('Accepts Credit Cards',
                    'Accepts Bitcoin',
                    'Accepts Insurance',
                    'Alcohol',
                    'Appointment Only',
                    'Caters',
                    'Coat Check',
                    'Delivers',
                    'Dogs Allowed',
                    'Hair Types Specialized In',
                    'Happy Hour',
                    'TV',
                    'Outdoor Seating',
                    'Parking',
                    'Smoking Allowed',
                    'Take-out',
                    'Takes Reservations',
                    'Waiter Service',
                    'Wheelchair Accessible',
                    'Wi-Fi',
                    'Opened 24hrs',
                    'Gender Neutral Bathrooms',
                    'Ambience',
                    'Attire',
                    'Best Nights',
                    'Good For Dancing',
                    'Good For Groups',
                    'Good For Kids',
                    'Good For Meals Served',
                    'Music',
                    'Noise Level',
                    'Price Range')

# Column order of the reviews frame. 'er_freinds' (sic) is kept as-is so that
# files written by earlier runs keep the same schema.
_REVIEW_COLUMNS = ['er_name',
                   'er_location',
                   'er_freinds',
                   'er_reviews',
                   'er_photos',
                   'er_rating',
                   'er_date',
                   'er_text',
                   'er_useful',
                   'er_funny',
                   'er_cool']


def _fetch_soup(url):
    """Download url and return it parsed as a BeautifulSoup document."""
    from bs4 import BeautifulSoup
    import requests

    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(response.text, 'html.parser')


def _parse_attributes(page):
    """Extract the attribute Series (NaN where missing) from a restaurant page."""
    import numpy as np
    import pandas as pd

    def text_or_nan(extract):
        # Any element that is absent from the page yields NaN instead of an error.
        try:
            return extract()
        except Exception:
            return np.nan

    overall_rating = text_or_nan(lambda: page.find('div', {'class': 'biz-rating biz-rating-very-large clearfix'}).find_next().attrs['title'].strip().split()[0].strip())
    overall_reviews = text_or_nan(lambda: page.find('span', {'class': 'review-count rating-qualifier'}).get_text().strip().split(' ')[0].strip())
    price_range = text_or_nan(lambda: page.find('dd', {'class': 'nowrap price-description'}).get_text().strip())
    health_score = text_or_nan(lambda: page.find('dd', {'class': 'nowrap health-score-description'}).get_text().strip())

    # object dtype: the values filled in below are strings.
    attributes = pd.Series(np.nan, index=list(_ATTRIBUTE_NAMES), dtype=object)

    # Match labels case-insensitively against the known attribute names.
    canonical_names = {name.lower(): name for name in _ATTRIBUTE_NAMES}
    head = page.find('div', {'class': 'short-def-list'})
    if head is not None:
        for name, value in zip(head.find_all('dt'), head.find_all('dd')):
            key = canonical_names.get(name.get_text().strip().lower())
            if key is not None:
                attributes[key] = value.get_text().strip()

    summary = pd.Series({'price_range': price_range,
                         'health_score': health_score,
                         'overall_rating': overall_rating,
                         'overall_reviews': overall_reviews}, dtype=object)
    return pd.concat([attributes, summary])


def _sidebar_count(review, css_class):
    """Return the leading count of a reviewer sidebar item, or 0 if it is absent."""
    try:
        return review.find('li', {'class': css_class}).get_text().strip().split(' ')[0].strip()
    except Exception:
        return 0


def _parse_review(review):
    """Turn one review element into a record dict; return None for non-English reviews.

    Raises if a mandatory element (text, name, location, rating, date, vote
    counts) is missing, so the caller can record the review as erroneous.
    """
    text_tag = review.find('p')
    if text_tag.attrs['lang'].strip() != 'en':
        return None
    statistics = review.find_all('span', {'class': 'count'})
    return {'er_name': review.find('a', {'class': 'user-display-name js-analytics-click'}).get_text().strip(),
            'er_location': review.find('li', {'class': 'user-location responsive-hidden-small'}).get_text().strip(),
            'er_freinds': _sidebar_count(review, 'friend-count responsive-small-display-inline-block'),
            'er_reviews': _sidebar_count(review, 'review-count responsive-small-display-inline-block'),
            'er_photos': _sidebar_count(review, 'photo-count responsive-small-display-inline-block'),
            'er_rating': review.find('div', {'class': 'biz-rating biz-rating-large clearfix'}).find_next().find_next().attrs['title'].strip().split(' ')[0].strip(),
            'er_date': review.find('span', {'class': 'rating-qualifier'}).get_text().strip(),
            'er_text': text_tag.get_text().strip(),
            'er_useful': statistics[0].get_text().strip(),
            'er_funny': statistics[1].get_text().strip(),
            'er_cool': statistics[2].get_text().strip()}

In [2]:
def fetch_restaurants(data, directory='../Yelp'):
    """
    Fetch restaurants data from yelp and write them into three types of files
    inside `directory`:
    <CAMIS>.csv contains reviews (overwritten on every run)
    attributes.csv contains restaurant attributes (appended to)
    info.csv contains technical info during scraping (appended to)

    Parameters:
    -----------

    TEMPLATE
    (data = pd.DataFrame, directory = str) -> None

    INPUT
    data: DataFrame that contains columns of CAMIS, DBA, BUILDING, STREET and BORO
    directory: output folder, created if missing (default '../Yelp')

    NOTE
    Every row of data triggers one scrape; rows sharing a CAMIS are scraped
    again. A row whose processing raises unexpectedly is skipped with a warning.
    """

    from datetime import datetime
    import os
    import warnings

    import numpy as np
    import pandas as pd

    os.makedirs(directory, exist_ok=True)

    # Fixed column set so that every appended chunk of info.csv lines up,
    # whether or not the batch contains a successful scrape.
    info_columns = ['CAMIS', 'DBA', 'fetched_restaurant', 'similarity_index', 'error_code', 'record_time']

    attributes_ls = []
    info_rows = []
    for _, row in data.iterrows():
        try:
            restaurant_name = row['DBA']
            # Skip missing address parts instead of failing on NaN + str.
            location_name = ' '.join(str(row[column]).strip()
                                     for column in ('BUILDING', 'STREET', 'BORO')
                                     if pd.notna(row[column]))
            (attributes, reviewer, fetched_restaurant, similarity_index, error_code) = fetch_restaurant(restaurant_name, location_name)
            if isinstance(error_code, int): # error code is int. no data is returned.
                info_rows.append({'CAMIS': row['CAMIS'],
                                  'DBA': row['DBA'],
                                  'fetched_restaurant': np.nan,
                                  'similarity_index': np.nan,
                                  'error_code': error_code,
                                  'record_time': datetime.now()})
                continue
            # Name the Series after CAMIS so it becomes the row label in attributes.csv.
            attributes_ls.append(attributes.rename(row['CAMIS']))
            # Save review information of each restaurant to separate files,
            # indicated by CAMIS. to_csv overwrites an existing file.
            reviewer.to_csv(os.path.join(directory, str(row['CAMIS']) + '.csv'))
            info_rows.append({'CAMIS': row['CAMIS'],
                              'DBA': row['DBA'],
                              'fetched_restaurant': fetched_restaurant,
                              'similarity_index': similarity_index,
                              'error_code': error_code,
                              'record_time': datetime.now()})
        except Exception as error:
            # Best effort: keep going, but do not hide the failure.
            warnings.warn('Skipping CAMIS %r (%r): %r' % (row.get('CAMIS'), row.get('DBA'), error))
            continue

    attributes_path = os.path.join(directory, 'attributes.csv')
    info_path = os.path.join(directory, 'info.csv')
    # Write the header only when the file is being created.
    if attributes_ls:
        # An empty frame would append a stray blank header line, so skip it.
        pd.DataFrame(attributes_ls).to_csv(attributes_path, mode = 'a', header = not os.path.isfile(attributes_path)) # save restaurant attributes
    pd.DataFrame(info_rows, columns = info_columns).to_csv(info_path, mode = 'a', header = not os.path.isfile(info_path)) # save scraping info

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the NYC restaurant inspection records and parse the three date columns
# (stored as month/day/year text).
raw_data = pd.read_csv('../Data/NYC_Rest_Inspect.csv')
for date_column in ('INSPECTION DATE', 'GRADE DATE', 'RECORD DATE'):
    raw_data[date_column] = pd.to_datetime(raw_data[date_column], format = '%m/%d/%Y')

# Keep only American-cuisine restaurants located in Manhattan.
in_manhattan = raw_data["BORO"].eq("MANHATTAN")
is_american = raw_data["CUISINE DESCRIPTION"].eq("American")
data = raw_data[in_manhattan & is_american]

In [6]:
# (rows, columns) of the filtered inspection records.
data.shape

(43184, 18)

In [17]:
# Fix the seed so the sampled row positions are reproducible.
np.random.seed(0)
# Draw 1000 row positions (with replacement). The upper bound of
# np.random.randint is exclusive, so `data.shape[0]` already covers the last
# valid position; `+ 1` could produce an out-of-bounds position for `iloc`.
index = np.random.randint(0, data.shape[0], size = 1000)

In [20]:
# Preview the rows at the first ten sampled positions.
data.iloc[index[0:10]]

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
24050,50005183,COCO & CRU/SWEETWATER SOCIAL,MANHATTAN,643,BROADWAY,10012.0,2122530477,American,2015-11-23,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,35.0,,NaT,2018-11-12,Cycle Inspection / Initial Inspection
377867,41073985,HEARTLAND BREWERY,MANHATTAN,350,5 AVENUE,10118.0,2125633433,American,2018-04-13,Violations were cited in the following area(s).,06C,Food not protected from potential source of co...,Critical,13.0,A,2018-04-13,2018-11-12,Cycle Inspection / Initial Inspection
187794,40526406,CLUB MACANUDO (CIGAR BAR),MANHATTAN,26,EAST 63 STREET,10065.0,2127528200,American,2017-07-20,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,31.0,,NaT,2018-11-12,Cycle Inspection / Initial Inspection
269480,40671007,PIG & WHISTLE ON 3RD,MANHATTAN,922,3 AVENUE,10022.0,2126884646,American,2017-09-08,Violations were cited in the following area(s).,10B,Plumbing not properly installed or maintained;...,Not Critical,4.0,A,2017-09-08,2018-11-12,Cycle Inspection / Re-inspection
284415,41479731,STILLWATER BAR & GRILL,MANHATTAN,7880,EAST 4 STREET,,2122532237,American,2017-10-17,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rins...",Critical,12.0,A,2017-10-17,2018-11-12,Cycle Inspection / Re-inspection
372034,41068920,THE SKINNY,MANHATTAN,174,ORCHARD STREET,10002.0,2122283668,American,2018-02-22,Establishment Closed by DOHMH. Violations wer...,04L,Evidence of mice or live mice present in facil...,Critical,33.0,,NaT,2018-11-12,Cycle Inspection / Re-inspection
183453,50012158,MEZZROW,MANHATTAN,163,W 10TH ST,10014.0,6464764346,American,2016-02-10,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,10.0,A,2016-02-10,2018-11-12,Cycle Inspection / Initial Inspection
132646,41520610,BAREBURGER,MANHATTAN,535,LAGUARDIA PLACE,10012.0,2124778125,American,2017-06-02,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rins...",Critical,14.0,,NaT,2018-11-12,Cycle Inspection / Initial Inspection
137029,40567533,THE THIRSTY SCHOLAR,MANHATTAN,155,2 AVENUE,10003.0,2127776514,American,2016-08-04,Violations were cited in the following area(s).,06F,Wiping cloths soiled or not stored in sanitizi...,Critical,12.0,A,2016-08-04,2018-11-12,Cycle Inspection / Initial Inspection
349498,41467165,CHELSEA PAPAYA,MANHATTAN,171,WEST 23 STREET,10011.0,2123529060,American,2015-11-27,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or...,Critical,19.0,,NaT,2018-11-12,Cycle Inspection / Initial Inspection


In [None]:
# Scrape Yelp for every row of `batch1` and write the CSV outputs.
# NOTE(review): `batch1` is not defined in any visible cell of this notebook —
# presumably a slice of `data` (e.g. selected via `index`); confirm and define
# it above so the notebook survives Restart & Run All.
fetch_restaurants(batch1)

In [21]:
import os

from proxycrawl.proxycrawl_api import ProxyCrawlAPI

# Fetch a test page through the ProxyCrawl API.
url = 'https://it.yahoo.com/?p=us'
# The API token is a credential and must not be committed with the notebook;
# export PROXYCRAWL_TOKEN in the environment before starting the kernel.
api = ProxyCrawlAPI({ 'token': os.environ['PROXYCRAWL_TOKEN'] })
response = api.get(url, {'user_agent': 'Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/30.0'})

In [25]:
# NOTE(review): the original line was a garbled merge of two statements
# ("from bs4 import Beautifu…" + "response['body']") and did not parse.
# Reconstructed as the import followed by displaying the response body —
# confirm this matches the intended cell.
from bs4 import BeautifulSoup
response['body']

