In [1]:
import ast
import json
import sys

import pandas as pd
import requests
from matplotlib import pyplot as plt

from keys import client_id, api_key

In [16]:
def generate_yelp_data():
    """Fetch ONE page (limit of 50) of Yelp search results plus the reviews of each hit.

    Testing ground for the full pipeline: it issues a single search request with
    the hard-coded criteria below and does not page through further results.

    Returns:
        tuple: (parsed_results, parsed_reviews) - the DataFrame produced by
        parse_results() and the DataFrame produced by generate_all_reviews().
    """
    url = "https://api.yelp.com/v3/businesses/search"
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 1000

    # The raw strings are passed on purpose: requests URL-encodes query
    # parameters itself, so replacing spaces with '+' beforehand would send a
    # literal plus sign as part of the search term / location.
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": 50,
    }

    results = yelp_call(url, url_params, api_key)
    # The total hit count is not needed when only one page is fetched.
    parsed_results, _total = parse_results(results)
    # parse_results() names the identifier column 'business_id' (there is no 'id' column).
    parsed_reviews = generate_all_reviews(parsed_results.business_id)
    return parsed_results, parsed_reviews


________________________________________________________________


def generate_all_yelp_data():
    """Main driver: page through the Yelp search results and persist every page.

    Sets the search parameters, calls the search endpoint 50 results at a time,
    parses each page, fetches the reviews for the businesses on that page and
    appends both to CSV files under data/. Paging stops once the offset reaches
    the total reported by the API or 1000, whichever comes first.

    Returns:
        pandas.DataFrame: all parsed business pages concatenated, with a fresh
        0..n-1 index so that row labels are unique across pages.
    """
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 1000
    page_size = 50
    max_offset = 1000

    url = "https://api.yelp.com/v3/businesses/search"
    # Raw strings are passed: requests performs the URL encoding, so replacing
    # spaces with '+' first would send a literal plus sign.
    # A "price" filter was experimented with previously and is currently disabled.
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": page_size,
    }

    pages = []
    total = 1   # placeholder so the loop runs once; replaced by the API's 'total'
    offset = 0

    while offset < total and offset < max_offset:
        url_params['offset'] = offset
        results = yelp_call(url, url_params, api_key)
        parsed_results, total = parse_results(results)
        parsed_reviews = generate_all_reviews(parsed_results.business_id)
        # Persist page by page so a failure part-way keeps what was fetched so far.
        df_save('data/test_data.csv', parsed_results)
        df_save('data/test_reviews.csv', parsed_reviews)
        pages.append(parsed_results)
        offset += page_size

    # DataFrame.append was removed in pandas 2.0; collecting the pages and
    # concatenating once is the supported (and linear-time) replacement.
    return pd.concat(pages, ignore_index=True)


________________________________________________________________



def generate_all_reviews(businesses_list):
    """Collect the reviews of every business id in `businesses_list` into one DataFrame.

    Each id is passed to get_yelp_reviews(); the rows it returns are gathered
    in order and given the column names listed below.
    """
    review_columns = ['business_id', 'review_id', 'text', 'rating', 'time_created']

    collected_rows = []
    for business_id in businesses_list:
        collected_rows.extend(get_yelp_reviews(business_id))

    return pd.DataFrame(collected_rows, columns=review_columns)


________________________________________________________________


    
def get_yelp_reviews(business_id):
    """Call the Yelp reviews endpoint for one business and return its reviews as tuples.

    Args:
        business_id: Yelp business identifier inserted into the request URL.

    Returns:
        list of (business_id, review id, text, rating, time_created) tuples.
        An empty list is returned (and a message printed) when the response
        carries no 'reviews' entry, e.g. an error payload, so that one failed
        lookup does not abort a long multi-business pull.

    NOTE(review): the endpoint is understood to return only a small number of
    review excerpts per business, not every review - confirm against the Yelp docs.
    """
    headers = {'Authorization': 'Bearer {}'.format(api_key)}
    url = "https://api.yelp.com/v3/businesses/{}/reviews".format(business_id)

    review_response = requests.get(url, headers=headers)
    review_data = json.loads(review_response.text)

    reviews = review_data.get('reviews')
    if reviews is None:
        print("No reviews returned for {} (HTTP {})".format(
            business_id, review_response.status_code))
        return []

    return [
        (business_id, review['id'], review['text'], review['rating'], review['time_created'])
        for review in reviews
    ]



________________________________________________________________



def yelp_call(url, url_params, api_key):
    """Send an authenticated GET request to Yelp and hand back the raw response.

    The API key is sent as a Bearer token in the Authorization header and
    `url_params` is passed as the query string. The response is returned
    unmodified; callers inspect its status and body themselves.
    """
    auth_header = {'Authorization': 'Bearer {}'.format(api_key)}
    return requests.get(url, headers=auth_header, params=url_params)


________________________________________________________________



def parse_data(list_of_data):
    """Flatten Yelp business records into tuples ready to become DataFrame rows.

    Args:
        list_of_data: iterable of business dicts (the 'businesses' list of a
            search response).

    Returns:
        list of 11-field tuples, in this order: id, name, list of category
        aliases, number of categories, price level, review_count, rating,
        address1, city, state, zip_code. The field count matches the eleven
        column names that parse_results() assigns.
    """
    businesses = []

    for business in list_of_data:
        category_aliases = parse_categories(business['categories'])
        # The price level is the length of the 'price' value (presumably a
        # string such as '$$'); a missing or empty price counts as 0.
        price_val = len(business.get('price') or '')
        location = business['location']

        businesses.append((
            business['id'],
            business['name'],
            category_aliases,
            len(category_aliases),
            price_val,
            business['review_count'],
            business['rating'],
            location['address1'],
            location['city'],
            location['state'],
            # Previously omitted, which left rows one field short of the
            # declared columns and made DataFrame construction fail.
            location['zip_code'],
        ))

    return businesses


________________________________________________________________



def parse_results(results):
    """Turn a Yelp search response into a DataFrame plus the reported hit count.

    Args:
        results: response object exposing .json() (as returned by yelp_call()).

    Returns:
        tuple: (DataFrame of parsed businesses, value of the payload's 'total').

    Note: eleven column names are assigned, so every tuple coming back from
    parse_data() has to carry eleven fields.
    """
    payload = results.json()   # json.loads(results.text) would work equally well
    total = payload['total']

    column_names = [
        'business_id', 'name', 'categories', 'num_categories', 'price',
        'review_count', 'rating', 'street_address', 'city', 'state', 'zip_code',
    ]
    business_rows = parse_data(payload['businesses'])
    frame = pd.DataFrame(business_rows, columns=column_names)
    return frame, total


________________________________________________________________



def parse_categories(categories_dict):
    """Return the 'alias' of every category entry, in positional order.

    Despite its name, `categories_dict` is indexed by position 0..len-1, and
    each element is expected to provide an 'alias' key.
    """
    return [
        categories_dict[position]['alias']
        for position in range(len(categories_dict))
    ]


________________________________________________________________



def df_save(csv_filepath, parsed_results):
    """Append a DataFrame to a CSV file (index column included, no header row).

    Args:
        csv_filepath: path of the CSV file; it is created if it does not exist
            and appended to otherwise.
        parsed_results: the DataFrame to write.
    """
    # `with` guarantees the handle is closed even if to_csv raises.
    # newline='' is what pandas asks for when it is handed an open file
    # object, so line endings are not translated a second time.
    with open(csv_filepath, 'a', encoding="utf-8", newline='') as f:
        parsed_results.to_csv(f, header=False)


________________________________________________________________
    
    
    
def _share_of_businesses(results, column):
    """Fraction of businesses (non-null business_id count) per distinct value of `column`."""
    counts = results.groupby(column)['business_id'].count()
    return counts / counts.sum()


def _top_reviewed_id_with_rating(results, rating):
    """business_id of the most-reviewed business among those having exactly `rating`."""
    matching = results.loc[results['rating'] == rating]
    return matching.sort_values(by='review_count', ascending=False)['business_id'].iloc[0]


def _latest_review_text(reviews, business_id):
    """Text (one-row Series) of the most recently created review for `business_id`."""
    own_reviews = reviews.loc[reviews['business_id'] == business_id]
    return own_reviews.sort_values(by='time_created', ascending=False).head(1).text


def part_4_quesions_answers(results_dataframe, reviews_dataframe):
    """Print summary answers about the businesses and their reviews.

    Args:
        results_dataframe: businesses, with at least the columns business_id,
            name, review_count, rating and price.
        reviews_dataframe: reviews, with at least the columns business_id,
            text and time_created.

    Returns:
        None; every answer is printed.
    """
    if results_dataframe.empty:
        print("No businesses to summarise.")
        return

    # Work on a copy with unique row labels: frames assembled from several
    # pages can repeat index values, which makes idxmax/.loc lookups ambiguous.
    results = results_dataframe.reset_index(drop=True)

    most_reviewed_businesses = results.sort_values(
        'review_count', ascending=False).head(5)[['business_id', 'name', 'review_count']]

    top_rating = results['rating'].max()
    low_rating = results['rating'].min()
    high_rated_bus_count = len(results.loc[results['rating'] == top_rating])

    bus_percent_by_rating = _share_of_businesses(results, 'rating')
    bus_percent_by_price = _share_of_businesses(results, 'price')

    most_reviewed_id = results.loc[results['review_count'].idxmax(), 'business_id']
    most_reviewed_text = reviews_dataframe.loc[
        reviews_dataframe['business_id'] == most_reviewed_id].text

    # Select by column name rather than by position so the lookup does not
    # depend on business_id being the first column.
    max_rated_reviewed_biz = _top_reviewed_id_with_rating(results, top_rating)
    max_rated_review_text = _latest_review_text(reviews_dataframe, max_rated_reviewed_biz)
    min_rated_reviewed_biz = _top_reviewed_id_with_rating(results, low_rating)
    min_rated_review_text = _latest_review_text(reviews_dataframe, min_rated_reviewed_biz)

    # NOTE(review): the first label is singular although the five most-reviewed
    # businesses are printed, and the last label says "lowest reviewed" although
    # the MOST reviewed of the lowest-rated businesses is selected - confirm intent.
    print("Most reviewed business is: {}".format(most_reviewed_businesses))
    print("Num businesses with highest rating: {}".format(high_rated_bus_count))
    print("Percent of businesses by Rating: {}".format(bus_percent_by_rating))
    print("Percent of businesses by Price: {}".format(bus_percent_by_price))
    print("Sample review of most reviewed business: {}".format(most_reviewed_text))
    print("Latest review of highest rated, highest reviewed business: {}".format(max_rated_review_text))
    print("Latest review of lowest rated, lowest reviewed business: {}".format(min_rated_review_text))
    

In [None]:
# VISUALIZATIONS

def rest_prices(city_frames=None):
    """Draw one bar chart per city showing how many restaurants fall in each price value.

    Args:
        city_frames: optional mapping of panel title -> DataFrame; each frame
            must have 'price' and 'business_id' columns. When omitted, the six
            module-level city frames listed below are used.

    Returns:
        tuple: an empty tuple (the figure is produced as a side effect).
    """
    title = "Indian Restaurant Prices"
    value = "Price Value in Dollars"
    volume = "Amount of Restaurants"

    if city_frames is None:
        # NOTE(review): the earlier version took the New York / San Jose bars
        # from indian_bk / indian_sf but their ticks from indian_ny / indian_sj.
        # One frame per city is used here; confirm these are the names that exist.
        city_frames = {
            "Miami": indian_mia,
            "New York": indian_ny,
            "San Jose": indian_sj,
            "Chicago": indian_chi,
            "Washington, D.C.": indian_dc,
            "Los Angeles": indian_la,
        }

    ncols = 3
    nrows = max(1, -(-len(city_frames) // ncols))   # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(figsize=(20, 15), ncols=ncols, nrows=nrows, squeeze=False)
    fig.suptitle(title, size=35)

    panels = list(axes.flat)
    for ax, (label, frame) in zip(panels, city_frames.items()):
        per_price = frame.groupby('price').count()['business_id']
        ax.bar(list(per_price.index), list(per_price.values))
        ax.set_title(label, size=20)
        ax.set_xlabel(value, size=15)
        ax.set_ylabel(volume, size=15)
        ax.set_xticks(frame['price'].sort_values().unique())

    # Hide any grid cells that have no city assigned.
    for ax in panels[len(city_frames):]:
        ax.set_visible(False)

    return ()

________________________________________________________________





In [17]:
# Run the full pipeline: generate_all_yelp_data() pages through the Yelp search results for the
# criteria hard-coded inside it, appends each page to its CSV files, and returns the combined frame.
all_results = generate_all_yelp_data()

In [19]:
# Load a previously saved, header-less business CSV back into a DataFrame.
# The 'categories' column holds the text form of a Python list; ast.literal_eval
# rebuilds the list but, unlike eval, cannot execute code embedded in the file.
# The first column is named 'business_id' to match the rest of the notebook.
# NOTE(review): 'filepath' is a placeholder - replace it with the CSV to load.
all_results = pd.read_csv(
    'filepath',
    converters={'categories': ast.literal_eval},
    header=None,
    names=['business_id', 'name', 'categories', 'num_categories', 'price',
           'review_count', 'rating', 'street_address', 'city', 'state', 'zip_code'],
)


In [31]:
# Count, mean and median of review_count and num_categories for each rating value.
# The columns are selected with a list (double brackets); selecting several
# columns with a bare tuple is rejected by current pandas versions.
all_results.groupby(['rating'])[['review_count', 'num_categories']].agg(['count', 'mean', 'median'])

Unnamed: 0_level_0,review_count,review_count,review_count,num_categories,num_categories,num_categories
Unnamed: 0_level_1,count,mean,median,count,mean,median
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1.0,32,1.53125,1.0,32,1.96875,2
1.5,3,4.0,3.0,3,3.0,3
2.0,10,3.9,3.5,10,2.5,3
2.5,7,4.142857,3.0,7,2.285714,3
3.0,26,4.269231,2.0,26,2.346154,3
3.5,26,5.807692,5.0,26,2.076923,2
4.0,12,6.916667,5.5,12,2.25,3
4.5,20,17.7,11.5,20,2.15,3
5.0,107,3.579439,2.0,107,2.35514,3


In [52]:
# Display every business ordered from the most to the least reviewed.
all_results.sort_values(by='review_count', ascending=False)

Unnamed: 0,id,name,categories,num_categories,price,review_count,rating,street_address,city,state
0,ObeexiHvTJ9OomAnMTIY4g,Lebedin Kofman,"[duilawyers, criminaldefense, divorce]",3,0,88,4.5,26 Broadway,New York,NY
1,PBi4Ro3954ZDZvfYrY4btA,Spodek Law Group,"[divorce, criminaldefense]",2,0,56,5.0,85 Broad St,New York,NY
3,dJt8s35t8eTiPVXze1NCeg,"Law Offices of Deborah G Fiss, Esq.",[divorce],1,0,42,4.5,37-06 82nd St,Jackson Heights,NY
2,bmNdMOltxH52KJ8VxSdM_g,"Law Offices of Mindin & Mindin, P C","[divorce, general_litigation, legalservices]",3,0,37,5.0,61 Broadway,New York,NY
14,7a9KjUg0cmxJDQDjoXfI3w,Bhatt Law Group,"[divorce, criminaldefense, personal_injury]",3,0,35,4.5,378 Summit Ave,Jersey City,NJ
...,...,...,...,...,...,...,...,...,...,...
20,gGGUpudvgwMvtNB2gRkrqA,J.J Borer ESQ,[divorce],1,0,1,5.0,26 Court St,Brooklyn,NY
18,Gfb9p1-JCzbQt5DoivaH_w,Law Office of Rong Kohtz,"[divorce, immigrationlawyers]",2,0,1,5.0,30 Wall St,New York,NY
17,MGBfQA4mR6GNGlCc30Jnhw,Clover Barrett & Associates,"[bankruptcy, divorce, estateplanning]",3,0,1,5.0,338 Atlantic Ave,Brooklyn,NY
16,msfPq-lBjGbfUK12Yp7WWg,Maryam Jahedi Law Firm,"[criminaldefense, duilawyers, divorce]",3,0,1,5.0,65 Broadway,New York,NY


In [9]:
# Smoke test: fetch a single page (limit of 50) of businesses together with their reviews,
# without paging through the remaining results.
new_data, new_reviews = generate_yelp_data()

In [9]:
# Append the single-page BUSINESS data (new_data, not the reviews) to test_data.csv.
# df_save() performs the same append-without-header write that was spelled out here by hand.
df_save('test_data.csv', new_data)