In [1]:
import ast
import json
import sys

import pandas as pd
import requests

from keys import client_id, api_key

In [60]:
def generate_yelp_data():
    """Fetch and parse a single page (at most 50 businesses) of Yelp search results.

    Testing ground for the scraper: runs one search for Indian restaurants
    within 1000 m of Washington, D.C. Reviews are not fetched here.

    Returns:
        The DataFrame produced by ``parse_results`` when Yelp answers with
        HTTP 200; ``None`` otherwise (the status code is printed).
    """
    # Search settings passed to the Yelp API.
    url = "https://api.yelp.com/v3/businesses/search"
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 1000

    # Values are passed unmodified: requests URL-encodes query parameters
    # itself, so replacing spaces with '+' beforehand would send a literal
    # plus sign (%2B) instead of a space.
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": 50,
    }

    results = yelp_call(url, url_params, api_key)
    if results.status_code != 200:
        # Bail out early so the caller never sees an unbound result.
        print("bad yelp call: code - {}".format(results.status_code))
        return None

    # parse_results also returns the total match count, which is not needed here.
    parsed_results, _ = parse_results(results)
    return parsed_results
 
    
________________________________________________________________



def generate_all_yelp_data():
    """Main driver: page through Yelp search results and append them to CSV files.

    Searches for Indian restaurants within 40000 m of Los Angeles, CA, 50
    businesses per request. Each successfully fetched page is parsed, the
    reviews for its businesses are collected, and both DataFrames are
    appended to 'data/indian_la_data_FINAL.csv' and
    'data/indian_la_reviews_FINAL.csv'.

    Paging stops when the offset reaches the total reported by Yelp, when it
    reaches 1000, or on the first non-200 response (the status code is
    printed). Nothing is returned.
    """
    # Search settings passed to the Yelp API.
    term = 'Indian'
    location = 'Los Angeles, CA'
    categories = "restaurants"
    radius = 40000

    url = "https://api.yelp.com/v3/businesses/search"
    # Values are passed unmodified: requests URL-encodes query parameters
    # itself, so replacing spaces with '+' beforehand would send a literal
    # plus sign (%2B) instead of a space.
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": 50,
    }

    num = 1  # total number of matches; replaced by Yelp's figure after the first page
    cur = 0  # offset of the next page to request

    while cur < num and cur < 1000:
        url_params['offset'] = cur
        results = yelp_call(url, url_params, api_key)
        if results.status_code != 200:
            print("Error in Yelp Call: <status_code> = {}".format(results.status_code))
            break

        parsed_results, num = parse_results(results)
        parsed_reviews = generate_all_reviews(parsed_results.business_id)
        df_save('data/indian_la_data_FINAL.csv', parsed_results)
        df_save('data/indian_la_reviews_FINAL.csv', parsed_reviews)
        cur += 50  # one page holds up to 50 businesses ("limit" above)
            
            
________________________________________________________________



def yelp_call(url, url_params, api_key):
    """Issue a GET request to the Yelp API and hand back the raw response.

    Args:
        url: Endpoint to call.
        url_params: Query parameters sent with the request.
        api_key: Key placed in the ``Authorization: Bearer`` header.

    Returns:
        The response object returned by ``requests.get``; the status code is
        not inspected here.
    """
    bearer_token = 'Bearer {}'.format(api_key)
    return requests.get(url, headers={'Authorization': bearer_token}, params=url_params)


________________________________________________________________



def parse_results(results):
    """Turn a Yelp search response into a DataFrame plus the reported total.

    Args:
        results: Response object whose ``.json()`` payload contains the keys
            'total' and 'businesses'.

    Returns:
        A ``(DataFrame, total)`` tuple: one row per business as produced by
        ``parse_data``, and the value of the payload's 'total' field.
    """
    payload = results.json()
    total_matches = payload['total']

    column_names = [
        'business_id', 'name', 'categories', 'num_categories', 'price',
        'review_count', 'rating', 'street_address', 'city', 'state',
    ]
    rows = parse_data(payload['businesses'])
    return pd.DataFrame(rows, columns=column_names), total_matches


________________________________________________________________



def parse_data(list_of_data):
    """Build one tuple of business information per business in the Yelp data.

    Args:
        list_of_data: Iterable of business dicts as returned by Yelp.

    Returns:
        A list of tuples ``(id, name, category aliases, number of categories,
        price level, review_count, rating, address1, city, state)``.
    """
    rows = []

    for entry in list_of_data:
        aliases = parse_categories(entry['categories'])
        # Price level is the length of the price string; 0 when Yelp gives none.
        price_level = len(entry['price']) if 'price' in entry else 0
        rows.append((
            entry['id'],
            entry['name'],
            aliases,
            len(aliases),
            price_level,
            entry['review_count'],
            entry['rating'],
            entry['location']['address1'],
            entry['location']['city'],
            entry['location']['state'],
        ))

    return rows


________________________________________________________________



def parse_categories(categories_dict):
    """Collect the 'alias' of every category entry of a business.

    Args:
        categories_dict: Sequence of category dicts, each holding an 'alias' key.

    Returns:
        A list with the aliases in their original order.
    """
    return [category['alias'] for category in categories_dict]


________________________________________________________________



def generate_all_reviews(businesses_list):
    """Gather the reviews of every business in the given collection.

    Args:
        businesses_list: Iterable of business ids; each one is passed to
            ``get_yelp_reviews``.

    Returns:
        A DataFrame with the columns business_id, review_id, text, rating
        and time_created, one row per review.
    """
    collected = []
    for business_id in businesses_list:
        # Flatten the per-business review lists into a single list.
        collected.extend(get_yelp_reviews(business_id))

    review_columns = ['business_id', 'review_id', 'text', 'rating', 'time_created']
    return pd.DataFrame(collected, columns=review_columns)


________________________________________________________________



def get_yelp_reviews(business_id):
    """Fetch the reviews the Yelp reviews endpoint returns for one business.

    Args:
        business_id: Yelp business id inserted into the reviews URL.

    Returns:
        A list of tuples ``(business_id, review id, text, rating,
        time_created)``. The list is empty when Yelp answers with a non-200
        status (the status code is printed) or when the payload carries no
        'reviews' entry, so one failing business does not abort a whole run.
    """
    headers = {'Authorization': 'Bearer {}'.format(api_key)}
    url = "https://api.yelp.com/v3/businesses/{}/reviews".format(business_id)

    review_response = requests.get(url, headers=headers)
    if review_response.status_code != 200:
        # Same reporting style as the search loop; skip this business.
        print("Error in Yelp Call: <status_code> = {}".format(review_response.status_code))
        return []

    review_data = review_response.json()
    return [
        (business_id, review['id'], review['text'], review['rating'], review['time_created'])
        for review in review_data.get('reviews', [])
    ]


________________________________________________________________



def df_save(csv_filepath, parsed_results):
    """Append a DataFrame to a CSV file.

    Rows are written without a header row (so repeated appends do not repeat
    it) and with the DataFrame index as the first column.

    Args:
        csv_filepath: Path of the CSV file; created if it does not exist.
        parsed_results: DataFrame to append.
    """
    # newline='' lets pandas control line endings (otherwise Windows text
    # mode adds an extra '\r' and produces blank rows); the with-block closes
    # the file even if to_csv raises.
    with open(csv_filepath, 'a', encoding="utf-8", newline='') as f:
        parsed_results.to_csv(f, header=False)

    
________________________________________________________________
    
    
    
def _percent_share(results_dataframe, column):
    """Return the percentage of businesses falling into each value of *column*."""
    counts = results_dataframe.groupby(column)['business_id'].count()
    return 100 * counts / counts.sum()


def _latest_review_text(results_dataframe, reviews_dataframe, rating):
    """Return the newest review text of the most-reviewed business with *rating*."""
    with_rating = results_dataframe.loc[results_dataframe['rating'] == rating]
    business_id = with_rating.sort_values(by='review_count', ascending=False)['business_id'].iloc[0]
    its_reviews = reviews_dataframe.loc[reviews_dataframe['business_id'] == business_id]
    return its_reviews.sort_values(by='time_created', ascending=False).head(1)['text']


def part_4_questions_answers(results_dataframe, reviews_dataframe):
    """Print the Part 4 question-and-answer summary.

    Args:
        results_dataframe: Businesses; the columns business_id, name,
            review_count, rating and price are read.
        reviews_dataframe: Reviews; the columns business_id, text and
            time_created are read.

    Only the two DataFrames passed in are used (no module-level frames), and
    rows are picked by position after sorting, so the index labels of the
    inputs do not matter. Nothing is returned.
    """
    by_review_count = results_dataframe.sort_values('review_count', ascending=False)
    most_reviewed_businesses = by_review_count.head(5)[['name', 'review_count']]

    highest_rating = results_dataframe['rating'].max()
    lowest_rating = results_dataframe['rating'].min()
    high_rated_bus_count = len(results_dataframe.loc[results_dataframe['rating'] == highest_rating])

    bus_percent_by_rating = _percent_share(results_dataframe, 'rating')
    bus_percent_by_price = _percent_share(results_dataframe, 'price')

    # First row after sorting = the business with the most reviews.
    most_reviewed_id = by_review_count['business_id'].iloc[0]
    most_reviewed_text = reviews_dataframe.loc[reviews_dataframe['business_id'] == most_reviewed_id]['text']

    max_rated_review_text = _latest_review_text(results_dataframe, reviews_dataframe, highest_rating)
    # NOTE(review): the label for question 8 says "lowest reviewed", but the
    # business picked is the MOST reviewed one among the lowest rated -
    # confirm which is intended.
    min_rated_review_text = _latest_review_text(results_dataframe, reviews_dataframe, lowest_rating)

    print("1. Top 5 Most reviewed businesses are:")
    print(most_reviewed_businesses)
    print()
    print("2. Number of businesses with highest rating of {}: {}".format(highest_rating, high_rated_bus_count))
    print()
    print("3&4. Percent of businesses by Rating: {}".format(bus_percent_by_rating))
    print()
    print("5. Percent of businesses by Price: {}".format(bus_percent_by_price))
    print()
    print("6. Sample reviews of most reviewed business:")
    print(most_reviewed_text)
    print()
    print("7. Latest review of highest rated, highest reviewed business:")
    print(max_rated_review_text)
    print()
    print("8. Latest review of lowest rated, lowest reviewed business:")
    print(min_rated_review_text)

    
________________________________________________________________



# Load the Washington, D.C. listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_dc = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_dc.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_dc.head(10)


________________________________________________________________



# Load the Los Angeles listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_la = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_la.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_la.head(10)


________________________________________________________________



# Load the New York listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_ny = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_ny.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_ny.head(10)


________________________________________________________________



# Load the Chicago listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_chi = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_chi.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_chi.head(10)


________________________________________________________________



# Load the Miami listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_mia = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_mia.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_mia.head(10)


________________________________________________________________



# Load the San Jose listings. The file has no header row, so the column names are supplied here.
# The 'categories' cells are parsed with ast.literal_eval, which only accepts Python literals
# (presumably the text of a list of category aliases) and, unlike the built-in eval used before,
# cannot execute arbitrary code found in the file.
# NOTE(review): the path is absolute and specific to one Windows machine.
indian_sj = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_sj.csv', converters={'categories': ast.literal_eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_sj.head(10)


________________________________________________________________


# INDIAN RESTAURANT PRICES
def indian_rest_price(indian_mia, indian_ny, indian_sj, indian_chi, indian_dc, indian_la):
    """Draw a 2x3 grid of bar charts: number of restaurants per price level, one panel per city.

    Args:
        indian_mia, indian_ny, indian_sj, indian_chi, indian_dc, indian_la:
            Per-city DataFrames; the columns 'price' and 'business_id' are read.

    Panels are filled row by row in the order Miami, New York, Chicago,
    San Jose, Washington, D.C., Los Angeles. Each panel's x-ticks come from
    the same DataFrame as its bars. Nothing is returned and the figure is
    not saved.
    """
    title = "Indian Restaurant Prices"
    value = "Price Value in Dollars"
    volume = "Amount of Restaurants"

    # (panel title, data, bar colour) in row-major panel order.
    panels = [
        ("Miami", indian_mia, 'r'),
        ("New York", indian_ny, 'b'),
        ("Chicago", indian_chi, 'y'),
        ("San Jose", indian_sj, 'darkorange'),
        ("Washington, D.C.", indian_dc, 'green'),
        ("Los Angeles", indian_la, 'maroon'),
    ]

    fig, axes = plt.subplots(figsize=(20, 15), ncols=3, nrows=2)
    fig.suptitle(title, size=35)

    for ax, (city, frame, colour) in zip(axes.flat, panels):
        # Number of businesses at each price level.
        counts = frame.groupby('price').count()['business_id']
        ax.bar(list(counts.index), list(counts.values), color=colour)
        ax.set_title(city, size=20)
        ax.set_xlabel(value, size=15)
        ax.set_ylabel(volume, size=15)
        ax.set_xticks(frame['price'].sort_values().unique())

# plt.savefig('indian_rest_prices.png')
;


________________________________________________________________



# RATINGS VS REVIEWS
def rating_vs_review(indian_sj, indian_dc, indian_la):
    """Draw side-by-side box plots of the review counts of three markets.

    Args:
        indian_sj, indian_dc, indian_la: Per-city DataFrames; the column
            'review_count' is read.

    Nothing is returned and the figure is not saved.
    """
    market_names = ["San Jose", "Washington, D.C.", "Los Angeles"]
    review_counts = [
        list(frame['review_count'])
        for frame in (indian_sj, indian_dc, indian_la)
    ]

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.boxplot(review_counts, labels=market_names, patch_artist=True)
    ax.set_facecolor('bisque')
    ax.set_title("Ratings & Reviews Correlation", size=30)
    ax.set_xlabel("Markets", size=20)
    ax.set_ylabel("Reviews Baseline", size=20)
    # The y-axis is capped at the largest review count in the D.C. data.
    ax.set_ylim(0, indian_dc['review_count'].max())
    ax.yaxis.grid(True)

# plt.savefig('ratings_review_correlation')
;


________________________________________________________________


# MEAN PRICES OF TARGET MARKETS
def target_mean_price(indian_sj, indian_dc, indian_la):
    """Draw a bar chart of the mean 'price' value of six markets.

    Args:
        indian_sj, indian_dc, indian_la: DataFrames of the three target
            markets; the column 'price' is read.

    The New York, Miami and Chicago frames are NOT parameters: indian_ny,
    indian_mia and indian_chi are looked up outside the function when it is
    called. Those three bars are grey, the target markets are coloured.
    Nothing is returned and the figure is not saved.
    """
    # (bar label, data, bar colour) in plotting order.
    markets = [
        ("New York", indian_ny, 'grey'),
        ("Miami", indian_mia, 'grey'),
        ("Chicago", indian_chi, 'grey'),
        ("San Jose", indian_sj, 'darkorange'),
        ("D.C.", indian_dc, 'green'),
        ("Los Angeles", indian_la, 'maroon'),
    ]
    market_names = [name for name, _, _ in markets]
    mean_prices = [frame['price'].mean() for _, frame, _ in markets]
    bar_colours = [colour for _, _, colour in markets]

    fig, ax = plt.subplots()

    ax.bar(market_names, mean_prices, color=bar_colours)
    ax.set_title("Target Market Mean Price", size=20)
    ax.set_xlabel("Markets", size=15)
    ax.set_ylabel("Mean Price Value", size=15)

# plt.savefig('mean_price_target')
;


________________________________________________________________


# TOP 3 MARKETS REVIEWS
def top_3_market_reviews(indian_sj, indian_dc, indian_la):
    """Draw three scatter plots (one per market) of rating vs. review count.

    Only rows with ``price == 3`` are plotted; rating is on the x-axis and
    review_count on the y-axis.

    Args:
        indian_sj, indian_dc, indian_la: Per-city DataFrames; the columns
            'price', 'rating' and 'review_count' are read.

    Nothing is returned and the figure is not saved.
    """
    rating_ticks = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
    x_caption = "Top 3 Markets"
    y_caption = "Review Counts; Price = 3"

    # Collect the data of every market before any figure is created.
    panels = []
    for market, frame, colour in (
        ("San Jose", indian_sj, 'darkorange'),
        ("D.C.", indian_dc, 'green'),
        ("Los Angeles", indian_la, 'maroon'),
    ):
        priced = frame.query('price == 3')
        panels.append((market, list(priced['rating']), list(priced['review_count']), colour))

    fig, axes = plt.subplots(figsize=(20, 15), ncols=3)
    fig.suptitle("Top 3 Markets Review Distribution", size=35)

    # One panel per market, left to right.
    for ax, (market, ratings, counts, colour) in zip(axes, panels):
        ax.scatter(ratings, counts, color=colour)
        ax.set_title(market, size=20)
        ax.set_xlabel(x_caption, size=20)
        ax.set_ylabel(y_caption, size=20)
        ax.set_xticks(rating_ticks)

# plt.savefig('top_3_review_dist')
;


________________________________________________________________


# TOP 3 MARKETS RATINGS
def top_3_market_ratings(indian_sj, indian_dc, indian_la):
    """Draw three bar charts (one per market) of review count against rating.

    Only rows with ``price == 3`` are plotted; rating is on the x-axis and
    review_count gives the bar height.

    Args:
        indian_sj, indian_dc, indian_la: Per-city DataFrames; the columns
            'price', 'rating' and 'review_count' are read.

    Nothing is returned and the figure is not saved.
    """
    rating_ticks = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
    x_caption = "Top 3 Markets"
    y_caption = "Review Counts; Price = 3"

    # Collect the data of every market before any figure is created.
    panels = []
    for market, frame, colour in (
        ("San Jose", indian_sj, 'darkorange'),
        ("D.C.", indian_dc, 'green'),
        ("Los Angeles", indian_la, 'maroon'),
    ):
        priced = frame.query('price == 3')
        panels.append((market, list(priced['rating']), list(priced['review_count']), colour))

    fig, axes = plt.subplots(figsize=(20, 15), ncols=3)
    fig.suptitle("Top 3 Markets Review Distribution", size=35)

    # One panel per market, left to right.
    for ax, (market, ratings, counts, colour) in zip(axes, panels):
        ax.bar(ratings, counts, color=colour)
        ax.set_title(market, size=20)
        ax.set_xlabel(x_caption, size=20)
        ax.set_ylabel(y_caption, size=20)
        ax.set_xticks(rating_ticks)

# plt.savefig('top_3_review_dist_bar')
;