In [1]:
import requests
import json
import pandas as pd
from flatten_json import flatten

from config import TOKEN

# Yelp businesses/search request: term "restaurants" around "Allston, Boston",
# sorted by distance, first page of 50 results (limit=50, offset=0).
url = 'https://api.yelp.com/v3/businesses/search?location=Allston%2C%20Boston&term=restaurants&sort_by=distance&limit=50&offset=0'

headers = {
    'accept': 'application/json',
    'Authorization': 'Bearer ' + TOKEN
}

# requests has no default timeout, so without one a stalled connection would
# block the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

try:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
except requests.RequestException as err:
    # Transport-level failure (DNS, connection refused, timeout, ...): follow
    # the same "report and carry on with no data" path as a bad status code.
    print('Failed to retrieve data from the API.')
    print(f'Request error: {err}')
    response = None
    json_data = None
else:
    if response.status_code == 200:
        json_data = response.json()
    else:
        # Later cells check `json_data` for truthiness before using it.
        print('Failed to retrieve data from the API.')
        print(f'HTTP status: {response.status_code}')
        json_data = None

In [None]:
if json_data:
    # Flatten every business record in a single call. json_normalize accepts
    # the whole list of dicts and expands nested dicts into dotted column
    # names (e.g. 'location.display_address'), so there is no need to grow
    # the frame row by row with concat (quadratic) or to shadow the builtin
    # `dict` with a loop variable. The result has a 0..n-1 index instead of
    # the all-zero index produced by concatenating one-row frames.
    businesses = pd.json_normalize(json_data['businesses'])

    # Call head() -- printing the bare attribute only shows the bound method.
    print(businesses.head())

In [52]:
# TODO: first join the data from all neighborhood samples so the steps below
# (keep unique businesses, count how many overlap) run on the combined frame.


def _with_label_counts(exploded, labels, drop_columns):
    """Collapse an exploded frame back to one row per business id.

    Parameters
    ----------
    exploded : DataFrame
        Frame with one row per (business, label) pair and an 'id' column.
    labels : Series
        Label for each row of `exploded` (same index); missing values get no
        indicator column.
    drop_columns : list of str
        Columns of `exploded` to remove from the result.

    Returns
    -------
    DataFrame
        `exploded` without `drop_columns`, plus one integer column per
        distinct label holding how many rows of that business carry the
        label, reduced to the first row of each 'id'.
    """
    dummies = pd.get_dummies(labels, dtype=int)
    if dummies.shape[1]:
        # Select the indicator columns by construction rather than by
        # position, so the result does not depend on how many columns the
        # frame has or on how many distinct labels occur.
        counts = dummies.groupby(exploded['id']).transform('sum')
    else:
        # No labels at all: nothing to aggregate.
        counts = dummies

    base = exploded.drop(columns=drop_columns)
    # A label that matches an existing column name replaces that column
    # instead of producing a duplicate column name.
    base = base.drop(columns=counts.columns.intersection(base.columns))
    return pd.concat([base, counts], axis=1).drop_duplicates(subset=['id'])


# --- Categories ---------------------------------------------------------
# One row per (business, category) pair.
bus_exploded = businesses.explode('categories').reset_index(drop=True)

# Each exploded cell is expected to be a dict with 'alias' and 'title' keys;
# only the alias is encoded. Non-dict cells (e.g. the NaN that explode()
# emits for an empty list) map to None and therefore get no indicator.
category_alias = bus_exploded['categories'].map(
    lambda cat: cat.get('alias') if isinstance(cat, dict) else None
)

# Keep the earlier cleanup rule: every column whose name starts with 'alias'
# or 'title' is removed before the per-category counts are attached.
# NOTE(review): this also removes a business-level 'alias' column if one is
# present -- confirm that is intended.
alias_or_title = [
    col for col in bus_exploded.columns
    if str(col).startswith(('alias', 'title'))
]

bus_final = _with_label_counts(
    bus_exploded,
    category_alias,
    drop_columns=alias_or_title + ['categories', 'location.display_address'],
)

# --- Transactions -------------------------------------------------------
# One row per (business, transaction type) pair; the exploded cells are used
# directly as labels.
bus_exploded = bus_final.explode('transactions').reset_index(drop=True)

bus_final = _with_label_counts(
    bus_exploded,
    bus_exploded['transactions'],
    drop_columns=['transactions'],
)



In [None]:
import numpy as np

# Feature-engineering plan:
#   - has_image from image_url
#   - ranking that is an average of rating and review_count
#   - has_phone from phone
#   - maybe keep distance to verify the sampling
#   - has_st_add from location.address1
#   - verify the location is within Boston
#   - has_price from price
#   - encode the price options into separate columns
#   - rename the location columns

# Normalise before comparing: NaN/None != '' evaluates to True, so a missing
# URL would otherwise be flagged as "has image". Whitespace-only strings are
# treated as empty as well.
image_urls = bus_final['image_url'].fillna('').astype(str).str.strip()
bus_final['has_image'] = np.where(image_urls != '', 1, 0)



# cols to drop
# image url 
# is_closed
# url
# phone
# display_phone
# location.address1
# location.address2
# location.address3




