In [1]:
import ast
import json
import os
import sys

import pandas as pd
import requests

from keys import client_id, api_key



In [16]:
def generate_yelp_data():
    """Testing ground: fetch a single page of search results plus their reviews.

    Runs one Yelp business search (limit 50) with the fixed criteria below,
    parses the response, then fetches the reviews for every business on that
    page. Nothing is written to disk.

    Returns:
        tuple: (parsed_results, parsed_reviews) - the businesses DataFrame
        built by parse_results() and the reviews DataFrame built by
        generate_all_reviews().
    """
    url = "https://api.yelp.com/v3/businesses/search"
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 1000

    # Values are passed as-is: requests URL-encodes query parameters itself,
    # so pre-replacing spaces with '+' would reach Yelp as a literal '+' (%2B).
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": 50,
    }

    results = yelp_call(url, url_params, api_key)
    parsed_results, num = parse_results(results)
    parsed_reviews = generate_all_reviews(parsed_results.id)
    return parsed_results, parsed_reviews
    
def generate_all_yelp_data():
    """Main driver: page through a Yelp business search and persist every page.

    Sets the search parameters, then requests pages of 50 businesses until the
    offset reaches either the total reported by the API or 1000 (presumably
    Yelp's cap on retrievable results - confirm against the API docs). For
    each page the parsed businesses and their reviews are appended to
    'data/test_data.csv' and 'data/test_reviews.csv'.

    Returns:
        pandas.DataFrame: the businesses of all pages stacked together. Row
        labels are kept per page, so index values repeat across pages.
    """
    term = 'Indian'
    location = 'Washington, D.C.'
    categories = "restaurants"
    radius = 1000

    url = "https://api.yelp.com/v3/businesses/search"
    # Values are passed as-is: requests URL-encodes query parameters itself,
    # so pre-replacing spaces with '+' would reach Yelp as a literal '+' (%2B).
    # A "price" entry can be added here to filter by price level.
    url_params = {
        "term": term,
        "location": location,
        "categories": categories,
        "radius": radius,
        "limit": 50,
    }

    pages = []
    num = 1  # placeholder so the loop runs once; replaced by the API's total
    cur = 0

    while cur < num and cur < 1000:
        url_params['offset'] = cur
        results = yelp_call(url, url_params, api_key)
        parsed_results, num = parse_results(results)
        parsed_reviews = generate_all_reviews(parsed_results.id)
        df_save('data/test_data.csv', parsed_results)
        df_save('data/test_reviews.csv', parsed_reviews)
        pages.append(parsed_results)
        cur += 50

    # DataFrame.append no longer exists in pandas 2.x; concatenating once at
    # the end also avoids re-copying the accumulated frame on every page.
    return pd.concat(pages) if pages else pd.DataFrame()

def generate_all_reviews(businesses_list):
    """Gather the reviews of every business id into one DataFrame.

    Args:
        businesses_list: iterable of business ids; each one is passed to
            get_yelp_reviews().

    Returns:
        pandas.DataFrame: one row per review tuple returned by
        get_yelp_reviews(), with columns business_id, review_id, text,
        rating and time_created.
    """
    review_columns = ['business_id', 'review_id', 'text', 'rating', 'time_created']
    review_rows = [
        review_row
        for business_id in businesses_list
        for review_row in get_yelp_reviews(business_id)
    ]
    return pd.DataFrame(review_rows, columns=review_columns)
    
def get_yelp_reviews(business_id):
    """Call the Yelp reviews endpoint for one business and flatten the result.

    Args:
        business_id: Yelp business id, interpolated into the endpoint URL.

    Returns:
        list[tuple]: one (business_id, review_id, text, rating, time_created)
        tuple per entry of the response's 'reviews' list.

    Raises:
        requests.HTTPError: if Yelp answers with a 4xx/5xx status.
    """
    headers = {'Authorization': 'Bearer {}'.format(api_key)}
    url = "https://api.yelp.com/v3/businesses/{}/reviews".format(business_id)

    review_response = requests.get(url, headers=headers)
    # Surface the real HTTP failure (rate limit, unknown id, bad key) instead
    # of the opaque KeyError: 'reviews' that an error payload would trigger.
    review_response.raise_for_status()
    review_data = json.loads(review_response.text)

    return [
        (business_id, review['id'], review['text'], review['rating'], review['time_created'])
        for review in review_data['reviews']
    ]

def yelp_call(url, url_params, api_key):
    """Issue an authenticated GET request to Yelp and return the raw response.

    Args:
        url: endpoint to request.
        url_params: dict sent as the query string.
        api_key: key placed in the 'Authorization: Bearer ...' header.

    Returns:
        The response object from requests.get, unmodified; the status code is
        not checked here.
    """
    auth_header = {'Authorization': 'Bearer {}'.format(api_key)}
    return requests.get(url, headers=auth_header, params=url_params)

def parse_data(list_of_data):
    """Turn Yelp business records into flat tuples, one per business.

    Args:
        list_of_data: iterable of business dicts carrying the keys read below
            ('id', 'name', 'categories', 'review_count', 'rating', 'location'
            and optionally 'price').

    Returns:
        list[tuple]: (id, name, category aliases, number of categories,
        price level, review_count, rating, address1, city, state) for each
        business, in input order.
    """
    def _as_row(record):
        aliases = parse_categories(record['categories'])
        # Price level is the length of the 'price' value (presumably a
        # '$'-style string - confirm); 0 when the key is absent.
        price_level = len(record.get('price', ''))
        return (
            record['id'],
            record['name'],
            aliases,
            len(aliases),
            price_level,
            record['review_count'],
            record['rating'],
            record['location']['address1'],
            record['location']['city'],
            record['location']['state'],
        )

    return [_as_row(record) for record in list_of_data]

def parse_results(results):
    """Convert a Yelp search response into a businesses DataFrame.

    Args:
        results: response object exposing .json(); the decoded payload must
            contain 'total' and 'businesses'.

    Returns:
        tuple: (DataFrame of the parsed businesses, value of the payload's
        'total' field).
    """
    payload = results.json()
    total_matches = payload['total']

    business_columns = [
        'id', 'name', 'categories', 'num_categories', 'price',
        'review_count', 'rating', 'street_address', 'city', 'state',
    ]
    business_frame = pd.DataFrame(parse_data(payload['businesses']), columns=business_columns)
    return business_frame, total_matches

def parse_categories(categories_dict):
    """Extract the 'alias' of each category entry of a business.

    Args:
        categories_dict: sequence of category dicts, each with an 'alias' key
            (despite the name, it is indexed positionally like a list).

    Returns:
        list: the alias values, in the original order.
    """
    return [category['alias'] for category in categories_dict]

def df_save(csv_filepath, parsed_results):
    """Append a DataFrame to a CSV file (index column included, no header row).

    Args:
        csv_filepath: destination path; the file is opened in append mode and
            its parent directory is created when missing.
        parsed_results: DataFrame whose rows are appended.
    """
    directory = os.path.dirname(csv_filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # The context manager closes the handle even if to_csv raises. newline=''
    # lets pandas control line endings; without it, text-mode translation
    # doubles the carriage return on Windows.
    with open(csv_filepath, 'a', encoding="utf-8", newline='') as csv_file:
        parsed_results.to_csv(csv_file, header=False)

def _most_reviewed_id_with_rating(results_dataframe, rating, id_col):
    """Return the id of the business with the most reviews among those at `rating`."""
    at_rating = results_dataframe.loc[results_dataframe['rating'] == rating]
    return at_rating.sort_values(by='review_count', ascending=False)[id_col].iloc[0]


def _latest_review_text(reviews_dataframe, business_id):
    """Return the text (a one-row Series) of a business's newest review by time_created."""
    biz_reviews = reviews_dataframe.loc[reviews_dataframe['business_id'] == business_id]
    return biz_reviews.sort_values(by='time_created', ascending=False).head(1).text


def part_4_quesions_answers(results_dataframe, reviews_dataframe):
    """Print summary answers about the scraped businesses and their reviews.

    Args:
        results_dataframe: businesses frame with 'name', 'review_count',
            'rating' and 'price' columns plus an id column named 'id' (as
            produced by parse_results) or 'business_id'.
        reviews_dataframe: reviews frame with 'business_id', 'text' and
            'time_created' columns (as produced by generate_all_reviews).

    Returns:
        None. Everything is written to stdout.
    """
    if results_dataframe.empty:
        print("No businesses to summarize.")
        return

    # parse_results() names the identifier column 'id'; 'business_id' is still
    # accepted for frames that already use that name.
    id_col = 'business_id' if 'business_id' in results_dataframe.columns else 'id'

    top_rating = results_dataframe['rating'].max()
    low_rating = results_dataframe['rating'].min()

    most_reviewed_businesses = results_dataframe.sort_values('review_count', ascending=False)[0:5][[id_col, 'name', 'review_count']]
    high_rated_bus_count = len(results_dataframe.loc[results_dataframe['rating'] == top_rating])

    rating_counts = results_dataframe.groupby('rating')[id_col].count()
    bus_percent_by_rating = rating_counts / rating_counts.sum()
    price_counts = results_dataframe.groupby('price')[id_col].count()
    bus_percent_by_price = price_counts / price_counts.sum()

    # Look the top business up by position: frames built page by page (or
    # reloaded from the CSV) repeat index labels, so a label lookup could
    # match several rows.
    top_pos = results_dataframe['review_count'].reset_index(drop=True).idxmax()
    most_reviewed_id = results_dataframe[id_col].iloc[top_pos]
    most_reviewed_text = reviews_dataframe.loc[reviews_dataframe['business_id'] == most_reviewed_id].text

    max_rated_reviewed_biz = _most_reviewed_id_with_rating(results_dataframe, top_rating, id_col)
    max_rated_review_text = _latest_review_text(reviews_dataframe, max_rated_reviewed_biz)
    min_rated_reviewed_biz = _most_reviewed_id_with_rating(results_dataframe, low_rating, id_col)
    min_rated_review_text = _latest_review_text(reviews_dataframe, min_rated_reviewed_biz)

    print("Most reviewed business is: {}".format(most_reviewed_businesses))
    print("Num businesses with highest rating: {}".format(high_rated_bus_count))
    print("Percent of businesses by Rating: {}".format(bus_percent_by_rating))
    print("Percent of businesses by Price: {}".format(bus_percent_by_price))
    print("Sample review of most reviewed business: {}".format(most_reviewed_text))
    print("Latest review of highest rated, highest reviewed business: {}".format(max_rated_review_text))
    # NOTE(review): the label says "lowest reviewed", but the business selected
    # is the MOST reviewed one among the lowest rated - confirm which is intended.
    print("Latest review of lowest rated, lowest reviewed business: {}".format(min_rated_review_text))
    

In [17]:
# Run the full paginated Yelp search using the criteria set inside
# generate_all_yelp_data(). Each page is appended to the CSV files under data/
# and the combined businesses DataFrame is kept in all_results.
all_results = generate_all_yelp_data()

In [19]:
# Load a previously saved businesses CSV (written without a header row, hence
# the explicit column names) to explore it without calling the API again.
# The categories column holds the text form of a Python list; ast.literal_eval
# parses that literal without executing arbitrary code, unlike eval.
all_results = pd.read_csv('data/hvac_houston.csv', converters={'categories': ast.literal_eval}, header=None, names=['id', 'name', 'categories', 'num_categories', 'price', 'review_count', 'rating', 'street_address', 'city', 'state'])


In [31]:
# Count, mean and median of review_count and num_categories per star rating.
# The columns are selected with a list: a bare tuple of names after groupby()
# is rejected by current pandas.
all_results.groupby(['rating'])[['review_count', 'num_categories']].agg(['count', 'mean', 'median'])

Unnamed: 0_level_0,review_count,review_count,review_count,num_categories,num_categories,num_categories
Unnamed: 0_level_1,count,mean,median,count,mean,median
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1.0,32,1.53125,1.0,32,1.96875,2
1.5,3,4.0,3.0,3,3.0,3
2.0,10,3.9,3.5,10,2.5,3
2.5,7,4.142857,3.0,7,2.285714,3
3.0,26,4.269231,2.0,26,2.346154,3
3.5,26,5.807692,5.0,26,2.076923,2
4.0,12,6.916667,5.5,12,2.25,3
4.5,20,17.7,11.5,20,2.15,3
5.0,107,3.579439,2.0,107,2.35514,3


In [52]:
# Show the businesses ordered from most to fewest reviews.
all_results.sort_values(by='review_count', ascending=False)

Unnamed: 0,id,name,categories,num_categories,price,review_count,rating,street_address,city,state
0,ObeexiHvTJ9OomAnMTIY4g,Lebedin Kofman,"[duilawyers, criminaldefense, divorce]",3,0,88,4.5,26 Broadway,New York,NY
1,PBi4Ro3954ZDZvfYrY4btA,Spodek Law Group,"[divorce, criminaldefense]",2,0,56,5.0,85 Broad St,New York,NY
3,dJt8s35t8eTiPVXze1NCeg,"Law Offices of Deborah G Fiss, Esq.",[divorce],1,0,42,4.5,37-06 82nd St,Jackson Heights,NY
2,bmNdMOltxH52KJ8VxSdM_g,"Law Offices of Mindin & Mindin, P C","[divorce, general_litigation, legalservices]",3,0,37,5.0,61 Broadway,New York,NY
14,7a9KjUg0cmxJDQDjoXfI3w,Bhatt Law Group,"[divorce, criminaldefense, personal_injury]",3,0,35,4.5,378 Summit Ave,Jersey City,NJ
...,...,...,...,...,...,...,...,...,...,...
20,gGGUpudvgwMvtNB2gRkrqA,J.J Borer ESQ,[divorce],1,0,1,5.0,26 Court St,Brooklyn,NY
18,Gfb9p1-JCzbQt5DoivaH_w,Law Office of Rong Kohtz,"[divorce, immigrationlawyers]",2,0,1,5.0,30 Wall St,New York,NY
17,MGBfQA4mR6GNGlCc30Jnhw,Clover Barrett & Associates,"[bankruptcy, divorce, estateplanning]",3,0,1,5.0,338 Atlantic Ave,Brooklyn,NY
16,msfPq-lBjGbfUK12Yp7WWg,Maryam Jahedi Law Firm,"[criminaldefense, duilawyers, divorce]",3,0,1,5.0,65 Broadway,New York,NY


In [15]:
# Summary statistics for all_results.
# NOTE(review): the saved output of this cell is a NameError - presumably it
# was executed before all_results existed in that kernel session; run one of
# the cells that assign all_results first.
all_results.describe()

NameError: name 'all_results' is not defined

In [9]:
# Smoke test: fetch and parse a single page (limit 50) of businesses together
# with their reviews; nothing is written to disk by this call.
new_data, new_reviews = generate_yelp_data()


In [9]:
# Append the single-page test results to test_data.csv through the shared
# helper, so the file handle is managed in one place.
df_save('test_data.csv', new_data)

In [10]:
# Display the reviews DataFrame returned by the single-page test run.
new_reviews

Unnamed: 0,business_id,review_id,text,rating,time_created
0,jj1vSlzf-G1_vTJJ6ftsAA,_SwrCl0Jfu79wy6WyOFRSQ,"4.5 stars, rounding up because it survived del...",5,2020-09-13 18:33:22
1,jj1vSlzf-G1_vTJJ6ftsAA,p3AGHJbDCgiEU-cLkP9EWA,This is a really cute joint that's fitting for...,5,2021-02-01 19:13:42
2,jj1vSlzf-G1_vTJJ6ftsAA,tsZZlIEWaJ61p6WrjDbvhg,I remember visiting this place on the night of...,5,2020-04-22 21:28:14
3,fqdtzS_KkBk7ugtd9t7KtQ,Ynfcw_GAUguE3JWsUMYajQ,"The food was phenomenal, and that is coming fr...",5,2020-12-11 16:56:57
4,fqdtzS_KkBk7ugtd9t7KtQ,j0XYjDEhkYQPkw2W6sohQw,Ordered delivery last month and completely for...,4,2020-08-19 20:56:55
5,fqdtzS_KkBk7ugtd9t7KtQ,Q7mJ6XsNvphVZcfz_BkFXA,"I've only ordered takeout from Sakina, but the...",5,2020-11-19 10:31:45
6,3_6iqAAM2UYGsjKLP6fSoQ,cT0iMXg9_4oZGJqbcLiMHw,OH MY GOD!!!! \n\nI had the Lamb Gosi (it's in...,5,2019-08-17 17:15:10
7,3_6iqAAM2UYGsjKLP6fSoQ,7BfZUcgQzBWIYtYbyOxeDg,Came here for lunch on a Friday. As a group we...,2,2019-09-06 10:06:31
8,3_6iqAAM2UYGsjKLP6fSoQ,M0fzXbiG885v0exoGxRxig,I loved this little spot in DC. Indian fare is...,5,2019-07-10 12:30:33
9,jDUSh15O6gLtE4yRtHclUA,arfeuDAk2QUPpoAh7N-uLg,We order delivery from sacrificial lamb almost...,5,2020-12-20 01:54:42


In [27]:
# Count, mean and median of review_count and rating per price level (0 is the
# value parse_data assigns when a business has no price).
# The columns are selected with a list: a bare tuple of names after groupby()
# is rejected by current pandas.
all_results.groupby(['price'])[['review_count', 'rating']].agg(['count', 'mean', 'median'])

Unnamed: 0_level_0,review_count,review_count,review_count,rating,rating,rating
Unnamed: 0_level_1,count,mean,median,count,mean,median
price,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,229,19.0,8,229,3.906114,4.0
1,245,159.040816,70,245,3.761224,4.0
2,504,258.077381,167,504,3.771825,4.0
3,20,445.4,417,20,3.825,4.0
4,2,420.0,420,2,4.25,4.25


In [11]:
# Summary statistics (count, mean, std, quartiles) for all_results.
all_results.describe()

Unnamed: 0,num_categories,price,review_count,rating
count,1000.0,1000.0,1000.0,1000.0
mean,2.035,1.321,183.135,3.802
std,0.858202,0.853632,391.724634,0.692297
min,1.0,0.0,1.0,1.0
25%,1.0,1.0,19.0,3.5
50%,2.0,2.0,88.0,4.0
75%,3.0,2.0,224.25,4.125
max,4.0,4.0,9545.0,5.0
