In [2]:
import json
import sys
import pandas as pd
from matplotlib import pyplot as plt
import requests
from keys import client_id, api_key

In [108]:
# SEARCH PARAMETERS
url =  "https://api.yelp.com/v3/businesses/search"
term = 'Indian'
location = 'Washington, D.C.'
categories = "restaurants"
radius = 100

# HEADERS
headers = {'Authorization': 'Bearer {}'.format(api_key),
    }

# URL PARAMETERS
url_params = {"term": term.replace(' ', '+'),
                "location": location.replace(' ', '+'),
                "categories" : categories,
                "limit": 50,
                }

# RESPONSE FROM API SERVER
response = requests.get(url, headers = headers, params = url_params)

In [109]:
def yelp_call(url, url_params, api_key):
#THIS FUNCTION IS CALLED TO MAKE API CALL TO YELP AND RETURN THE RESPONSE

    headers = {'Authorization': 'Bearer {}'.format(api_key),}

    response = requests.get(url, headers = headers, params = url_params)
#    response.status_code
#    response.text

    return response

In [119]:
def parse_data(list_of_data):
#THIS FUNCTION CREATES A TUPLE OF BUSINESS INFORMATION FOR ALL BUSINESSES CAPTURED VIA YELP
# INCLUDES CONVERSION FOR CHANGING "$$" TO NUM FOR PRICE REPRESENTATION
    businesses=[]
    
    for business in list_of_data:
        get_categories = parse_categories(business['categories'])    
        num_categories = len(get_categories)
        if 'price' in business.keys():
            price_val = len(business['price'])
        else:
            price_val = 0
        biz_tuple = (business['id'], business['name'], get_categories, num_categories, price_val, business['review_count'], business['rating'], business['location']['address1'], 
                     business['location']['city'], business['location']['state'], business['location']['zip_code'])
        businesses.append(biz_tuple)

    return businesses

In [120]:
def parse_results(results):
# THIS FUNCTION TAKES RESULTS RETURNED FROM YELP API, PARSES THE DATA, AND STORES AS A DATAFRAME

    #EITHER ONE WORKS
    #data = json.loads(response.text)
    data = results.json()
    num = data['total']
    
    parsed_results = parse_data(data['businesses'])
    df = pd.DataFrame(parsed_results, columns=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state', 'zip_code'])
    return df, num

In [121]:
def parse_categories(categories_list):
# THIS FUNCTION TAKES THE LIST OF CATEGORIES FROM A BUSINESS AND CREATES A LIST OF CATEGORIES OF THE BUSINESS

    list_of_categories = []
    x_len = len(categories_list)
    for x in range(0,x_len):
        list_of_categories.append(categories_list[x]['alias'])
    return list_of_categories

In [122]:
def df_save(csv_filepath, parsed_results):
# THIS FUNCTION SAVES A DATAFRAME AS A CSV FILE
    
    f = open(csv_filepath, 'a', encoding="utf-8")
    parsed_results.to_csv(f, header=False)
    f.close()

In [128]:
def get_yelp_reviews(business_id):
# THIS FUNCTION CALLS YELP API TO RETRIEVE ALL REVIEWS FOR A GIVEN BUSINESS ID

    business_reviews = []
    headers = {'Authorization': 'Bearer {}'.format(api_key),}
    url =  "https://api.yelp.com/v3/businesses/{}/reviews".format(business_id)

    review_response = requests.get(url, headers = headers)
    review_data = json.loads(review_response.text)
        
    for review in review_data['reviews']:
        review_tuple = (business_id, review['id'], review['text'], review['rating'], review['time_created'])
        business_reviews.append(review_tuple)

    return business_reviews

In [143]:
def generate_all_reviews(businesses_list):
# THIS FUNCTION IS CALLED TO CREATE A LIST OF REVIEWS FOR ALL BUSINESSES WITHIN ITS PASSED DATA SET

    all_business_reviews = []

    for biz in businesses_list:
        biz_reviews = get_yelp_reviews(biz)
        for each_business in biz_reviews:
            all_business_reviews.append(each_business)

    reviews_results = pd.DataFrame(all_business_reviews, columns = 
                                   ['business_id', 'review_id', 'text', 'rating', 'time_created'])
     
    return reviews_results

In [147]:
def generate_all_yelp_data_la():
# THIS FUNCTION IS THE MAIN DRIVER - SETS SEARCH PARAMETERS, CALLS YELP API, 
# PARSES RETURNED DATA AND STORED IN CSV/DATAFRAME

    term = 'Indian'
    location = 'Los Angeles'
    categories = "restaurants"
    radius = 40000
#   price = 4
    
    url =  "https://api.yelp.com/v3/businesses/search"
    url_params = {
                "term": term.replace(' ', '+'),
                "location": location.replace(' ', '+'),
                "categories" : categories,
                "radius" : radius,
                "limit": 50,
#                "price": price,
                }

    all_results_la = pd.DataFrame()
    num = 1
    cur = 0
    
    while (cur < num and cur < 1000):

        url_params['offset'] = cur
        results = yelp_call(url, url_params, api_key)
        parsed_results, num = parse_results(results)
        parsed_reviews = generate_all_reviews(parsed_results.business_id)
        df_save('test_data_la.csv', parsed_results)
        df_save('test_review_la.csv', parsed_reviews)
        all_results_la = all_results_la.append(parsed_results)
        cur += 50

    return all_results_la

In [148]:
def generate_all_yelp_data_dc():
# THIS FUNCTION IS THE MAIN DRIVER - SETS SEARCH PARAMETERS, CALLS YELP API, 
# PARSES RETURNED DATA AND STORED IN CSV/DATAFRAME

    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 40000
#   price = 4
    
    url =  "https://api.yelp.com/v3/businesses/search"
    url_params = {
                "term": term.replace(' ', '+'),
                "location": location.replace(' ', '+'),
                "categories" : categories,
                "radius" : radius,
                "limit": 50,
#                "price": price,
                }

    all_results_dc = pd.DataFrame()
    num = 1
    cur = 0
    
    while (cur < num and cur < 1000):

        url_params['offset'] = cur
        results = yelp_call(url, url_params, api_key)
        parsed_results, num = parse_results(results)
        parsed_reviews = generate_all_reviews(parsed_results.business_id)
        df_save('test_data_dc.csv', parsed_results)
        df_save('test_review_dc.csv', parsed_reviews)
        all_results_dc = all_results_dc.append(parsed_results)
        cur += 50

    return all_results_dc

In [16]:
indian_dc = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\indian_dc.csv', converters={'categories': eval}, header=None,
                        names=['business_id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])
indian_dc.head(3)

Unnamed: 0,business_id,name,categories,num_categories,price,review_count,rating,street_address,city,state
0,n-6O6I7pmmpwkW2pCO-zDw,Indigo,[indpak],1,2,876,4.5,243 K St NE,"Washington, DC",DC
1,jj1vSlzf-G1_vTJJ6ftsAA,Pappe,[indpak],1,2,304,4.0,1317 14th St NW,"Washington, DC",DC
2,CwdlygqT4cWwOtQGsYdoBw,Rasika,[indpak],1,3,3126,4.5,633 D St NW,"Washington, DC",DC


In [14]:
reviews_dc = pd.read_csv('C:\\Users\\User\\Documents\\Flatiron\\Repository\\flatiron-phase1-yelp\\data\\reviews_dc.csv',
                        names=['business_id', 'review_id', 'text', 'rating', 'time_created'], index_col=0)
reviews_dc.head(3)

Unnamed: 0,business_id,review_id,text,rating,time_created
0,n-6O6I7pmmpwkW2pCO-zDw,RJ9WlG7xvv8Yki9QUx1hgQ,My tip for staying sane during the quarantine?...,5,2020-04-14 09:10:48
1,n-6O6I7pmmpwkW2pCO-zDw,-mKGFkpFQ7fRGcDIdK3pVA,I remember when Indigo first opened as a stand...,5,2020-03-29 07:36:03
2,n-6O6I7pmmpwkW2pCO-zDw,ZwPkQ1pTYvmtrebgPZXa6Q,Indigo offered awesome comfort food during thi...,5,2020-03-22 19:31:49


In [19]:
def part_4_questions_answers(indian_dc, reviews_dc):
# THIS FUNCTION IS CALLED TO DISPLAY THE PART 4 Q&A RESULTS
    
    most_reviewed_businesses = indian_dc.sort_values('review_count',ascending=False)[0:5][['name','review_count']]
    highest_rating = indian_dc['rating'].max()
    high_rated_bus_count = len(indian_dc.loc[indian_dc['rating']==highest_rating]) #results_dataframe['rating'].max()])
    bus_percent_by_rating = indian_dc.groupby('rating').count().apply(lambda x: 100*x/x.sum(), axis=0).business_id
    bus_percent_by_price = indian_dc.groupby('price').count().apply(lambda x: 100*x/x.sum(), axis=0).business_id
    #results_dataframe.sort_values('review_count',ascending=False)[0:1]['business_id'][1]
    #most_reviewed_text = reviews_dc.loc[reviews_dc['business_id']==indian_dc.sort_values(by='review_count', ascending=False).head(1)['business_id'][1]].text
    max_rated_reviewed_biz = indian_dc.loc[indian_dc['rating']==indian_dc['rating'].max()].sort_values(by='review_count', ascending=False).head(1).values[0][0]
    max_rated_review_text = indian_dc.loc[indian_dc['business_id']==max_rated_reviewed_biz].sort_values(by='time_created', ascending=False).head(1).text
    min_rated_reviewed_biz = indian_dc.loc[indian_dc['rating']==indian_dc['rating'].min()].sort_values(by='review_count', ascending=False).head(1).values[0][0]
    min_rated_review_text = reviews_dc.loc[indian_dc['business_id']==min_rated_reviewed_biz].sort_values(by='time_created', ascending=False).head(1).text

    print("1. Top 5 Most reviewed businesses are:")
    print(most_reviewed_businesses)
    print()
    print("2. Number of businesses with highest rating of {}: {}".format(highest_rating, high_rated_bus_count))
    print()
    print("3&4. Percent of businesses by Rating: {}".format(bus_percent_by_rating))
    print()
    print("5. Percent of businesses by Price: {}".format(bus_percent_by_price))
    print()
    print("6. Sample reviews of most reviewed business:")
    #print(most_reviewed_text)
    print()
    print("7. Latest review of highest rated, highest reviewed business:")
    print(max_rated_review_text)
    print()
    print("8. Latest review of lowest rated, lowest reviewed business:")
    print(min_rated_review_text)


In [20]:
part_4_questions_answers(indian_dc, reviews_dc)

KeyError: 'time_created'