In [1]:
import json
import sys
import pandas as pd
from keys import client_id, api_key
import requests



In [2]:
def generate_yelp_data():
    """Fetch a single page (up to 50 businesses) of Yelp search results as a smoke test.

    Runs one fixed search (term 'Indian', location 'Washington, D.C.', category
    'restaurants') through ``yelp_call`` and ``parse_results``. Review fetching
    is currently switched off, so the second return value is always an empty dict.

    Returns:
        tuple: ``(parsed_results, parsed_reviews)`` where ``parsed_results`` is
        the DataFrame built by ``parse_results`` and ``parsed_reviews`` is ``{}``.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"

    # Pass the raw strings: yelp_call hands this dict to requests as `params`,
    # and requests URL-encodes the values itself. Pre-replacing spaces with '+'
    # would be encoded again and reach Yelp as a literal plus sign (%2B).
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "limit": 50,
    }

    results = yelp_call(url, url_params, api_key)
    # The total match count is not needed for a single-page test.
    parsed_results, _ = parse_results(results)

    # Review fetching is disabled; to enable it, replace the empty dict with
    # generate_all_reviews(parsed_results.id, {}).
    parsed_reviews = {}
    return parsed_results, parsed_reviews
    
def generate_all_yelp_data():
    """Page through the Yelp search results for a fixed query and collect them.

    Searches with term 'Indian', location 'New York' and category 'restaurants',
    requesting 50 businesses per call and advancing ``offset`` until either every
    match reported by the API has been requested or 1000 results have been
    requested. Each parsed page is also appended to ``all_ny_4star.csv`` via
    ``df_save`` as it arrives, so re-running this function adds rows to that file.

    Returns:
        pandas.DataFrame: all parsed pages concatenated in request order. Row
        labels restart at 0 for every page (no index reset is performed).
    """
    term = 'Indian'
    location = 'New York'
    categories = "restaurants"
    page_size = 50
    # NOTE(review): presumably mirrors Yelp's cap on offset + limit - confirm against the API docs.
    max_results = 1000

    url = "https://api.yelp.com/v3/businesses/search"
    # Pass the raw strings: requests URL-encodes `params` values itself, so
    # pre-replacing spaces with '+' would reach Yelp as a literal plus sign (%2B).
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "limit": page_size,
        # Optional price filter, currently disabled:
        # "price": 4,
    }

    pages = []
    num = 1  # placeholder so the loop runs once; replaced by the total the API reports
    cur = 0

    while cur < num and cur < max_results:
        url_params['offset'] = cur
        results = yelp_call(url, url_params, api_key)
        parsed_results, num = parse_results(results)
        df_save('all_ny_4star.csv', parsed_results)
        pages.append(parsed_results)
        cur += page_size

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every iteration; collect the pages and concatenate them once instead.
    return pd.concat(pages) if pages else pd.DataFrame()

def generate_all_reviews(businesses_list, reviews_dictionary):
    """Collect the reviews of every listed business and tabulate them.

    Each id in ``businesses_list`` is looked up with ``get_yelp_reviews`` and the
    result is stored in ``reviews_dictionary`` under the id's string form. The
    dictionary is mutated in place; entries it already holds are kept and also
    appear in the returned table.

    Args:
        businesses_list: iterable of business ids.
        reviews_dictionary: dict that receives one ``id -> reviews`` entry per business.

    Returns:
        pandas.DataFrame: one row per dictionary entry, with the columns
        ``business_id`` and ``reviews_list``.
    """
    for business_id in businesses_list:
        fetched = get_yelp_reviews(business_id)
        reviews_dictionary[format(business_id)] = fetched

    table_rows = list(reviews_dictionary.items())
    return pd.DataFrame(table_rows, columns=['business_id', 'reviews_list'])
    
def get_yelp_reviews(business_id):
    """Request the reviews of one business from the Yelp API.

    Sends a GET to ``https://api.yelp.com/v3/businesses/<business_id>/reviews``
    with the module-level ``api_key`` as a bearer token, decodes the JSON body
    and keeps four fields of each returned review.

    Args:
        business_id: Yelp business identifier, interpolated into the request URL.

    Returns:
        list[dict]: one dict per entry of the response's ``reviews`` array, with
        the keys ``id``, ``text``, ``rating`` and ``time_created``.

    Raises:
        KeyError: if the decoded body has no ``reviews`` key, or a review lacks
            one of the kept fields.
    """
    kept_fields = ('id', 'text', 'rating', 'time_created')
    auth_headers = {'Authorization': 'Bearer {}'.format(api_key)}
    endpoint = "https://api.yelp.com/v3/businesses/{}/reviews".format(business_id)

    reply = requests.get(endpoint, headers=auth_headers)
    payload = json.loads(reply.text)

    return [
        {field: entry[field] for field in kept_fields}
        for entry in payload['reviews']
    ]

def yelp_call(url, url_params, api_key, timeout=30):
    """Send an authenticated GET request to the Yelp API and return the response.

    Args:
        url: endpoint to request.
        url_params: dict of query parameters; requests URL-encodes the values.
        api_key: Yelp API key, sent as a bearer token in the Authorization header.
        timeout: seconds to wait for the server before giving up (default 30).
            Without a timeout, requests can wait indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        requests.Response: the response, guaranteed not to carry a 4xx/5xx status.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: if no response arrives within ``timeout``.
    """
    headers = {'Authorization': 'Bearer {}'.format(api_key)}

    response = requests.get(url, headers=headers, params=url_params, timeout=timeout)

    # Fail here, with the status code in the message, instead of letting an
    # error body surface later as a KeyError when the caller reads the payload.
    response.raise_for_status()

    return response

def parse_data(list_of_data):
    """Flatten Yelp business records into a list of 10-field tuples.

    Args:
        list_of_data: iterable of business dicts. Each must provide ``id``,
            ``name``, ``categories``, ``review_count``, ``rating`` and a
            ``location`` dict with ``address1``, ``city`` and ``state``;
            ``price`` is optional.

    Returns:
        list[tuple]: one tuple per business, in input order, laid out as
        ``(id, name, category aliases, number of categories, price level,
        review_count, rating, address1, city, state)``. The price level is the
        length of the ``price`` value, or 0 when the business has no ``price`` key.
    """
    def _to_row(entry):
        # Category aliases come from the shared helper; their count is stored alongside.
        aliases = parse_categories(entry['categories'])
        price_level = len(entry['price']) if 'price' in entry else 0
        return (
            entry['id'],
            entry['name'],
            aliases,
            len(aliases),
            price_level,
            entry['review_count'],
            entry['rating'],
            entry['location']['address1'],
            entry['location']['city'],
            entry['location']['state'],
        )

    return [_to_row(entry) for entry in list_of_data]

def parse_results(results):
    """Turn a Yelp search response into a DataFrame plus the reported match count.

    Args:
        results: response object exposing ``.json()``; the decoded body must
            contain the keys ``total`` and ``businesses``.

    Returns:
        tuple: ``(df, num)`` where ``df`` holds one row per business as produced
        by ``parse_data`` and ``num`` is the body's ``total`` value.
    """
    column_names = [
        'id', 'name', 'categories', 'num_categories', 'price',
        'review_count', 'rating', 'street_address', 'city', 'state',
    ]

    # Decode the body through the response object itself.
    payload = results.json()
    total_matches = payload['total']

    business_rows = parse_data(payload['businesses'])
    return pd.DataFrame(business_rows, columns=column_names), total_matches

def parse_categories(categories_list):
    """Extract the ``alias`` of every category entry, preserving order.

    Args:
        categories_list: list of category dicts, each with an ``alias`` key.

    Returns:
        list: the ``alias`` values in the same order as the input.
    """
    return [category['alias'] for category in categories_list]

def df_save(csv_filepath, parsed_results):
    """Append a DataFrame's rows to a CSV file.

    The file is opened in append mode as UTF-8 and is created if it does not
    exist. Rows are written with their index as the first column and without a
    header row, so repeated calls keep extending the same headerless file.

    Args:
        csv_filepath: path of the CSV file to append to.
        parsed_results: pandas DataFrame whose rows are written.
    """
    # newline='' leaves line endings to the CSV writer, as pandas asks for when
    # given a file object; otherwise Windows ends up with '\r\r\n' row endings.
    # The context manager closes the handle even if to_csv raises.
    with open(csv_filepath, 'a', encoding="utf-8", newline='') as csv_file:
        parsed_results.to_csv(csv_file, header=False)


In [8]:
# Run the full paginated Yelp search defined in generate_all_yelp_data() and keep the combined DataFrame.
# Side effect: every fetched page is also appended to 'all_ny_4star.csv' (opened in append mode),
# so re-running this cell adds duplicate rows to that file.
all_results = generate_all_yelp_data()

In [4]:
# Load a previously saved CSV snapshot from the working directory instead of re-querying the API.
# The file is read as headerless (header=None), so the column names are supplied here;
# presumably it was produced by df_save, which writes without a header row - confirm.
# NOTE(review): this rebinds `all_results`, replacing whatever the generate_all_yelp_data() cell produced,
# and reads 'indian_bk.csv' rather than the 'all_ny_4star.csv' file that cell writes.
all_results = pd.read_csv('indian_bk.csv', header=None, names=['id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
all_results

Unnamed: 0,id,name,categories,num_categories,price,review_count,rating,street_address,city,state
0,lnI9toC-uAhBIbtHDMxvcA,Indika House,"['indpak', 'asianfusion']",2,2,74,5.0,943 Broadway,Brooklyn,NY
1,ZqOxAKVZEZTa4XS4zthp3Q,Bombay Grill,['indpak'],1,2,206,4.5,1176 Bedford Ave,Brooklyn,NY
2,dGE1Imqoz1vdHa2cjfvO1w,Gandhi Fine Indian Cuisine,['indpak'],1,2,365,4.0,2032 Bedford Ave,Brooklyn,NY
3,YDtIHtfvWd4W36wq6UAjoA,Kitchen Grill Indian Restaurant,['indpak'],1,2,272,4.5,914A Fulton St,Brooklyn,NY
4,HY7Hx-z5lq1vAacDwYBfoA,Diwan Grill Indian Cuisine,"['indpak', 'desserts', 'pakistani']",3,2,98,4.5,678 Franklin Ave,Brooklyn,NY
...,...,...,...,...,...,...,...,...,...,...
45,Cq2IVq7VOlEzQ0YfdPqc-w,Amin's Chinese Halal Restaurant,"['chinese', 'halal']",2,2,47,3.5,215 Tonnele Ave,Jersey City,NJ
46,FoR70YoGSb4_QavCe1WZEw,Beyond Sushi,"['vegetarian', 'vegan', 'gluten_free']",3,2,926,4.5,229 E 14th St,New York,NY
47,_nScWpFX3M1K7s30hFEsRw,Inday,"['indpak', 'salad', 'gluten_free']",3,2,349,4.0,1133 Broadway,New York,NY
48,ZC6md0z2Uy6qJLZTaMFiKA,Pizza Party Time,"['halal', 'pizza', 'burgers']",3,2,45,2.5,599 Broad Ave,Ridgefield,NJ


In [None]:
# Smoke test: fetch and parse a single page of up to 50 results via generate_yelp_data().
# new_reviews is always an empty dict for now, because review fetching is commented out in that function.
new_data, new_reviews = generate_yelp_data()
new_reviews

In [5]:
# Count, mean and median of review_count and rating for each price value.
# The columns are selected with a list ([[...]]): selecting several groupby columns
# with a bare tuple (['review_count','rating']) was deprecated and raises in pandas 2.x.
all_results.groupby(['price'])[['review_count', 'rating']].agg(['count', 'mean', 'median'])

Unnamed: 0_level_0,review_count,review_count,review_count,rating,rating,rating
Unnamed: 0_level_1,count,mean,median,count,mean,median
price,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,229,19.0,8,229,3.906114,4.0
1,245,159.040816,70,245,3.761224,4.0
2,504,258.077381,167,504,3.771825,4.0
3,20,445.4,417,20,3.825,4.0
4,2,420.0,420,2,4.25,4.25
