In [82]:
import requests
import json
import pandas as pd
from flatten_json import flatten

from config import TOKEN

# Yelp Fusion business search. The query string asks for restaurants around
# "Allston, Boston", sorted by distance, 50 results starting at offset 250.
url = 'https://api.yelp.com/v3/businesses/search?location=Allston%2C%20Boston&term=restaurants&sort_by=distance&limit=50&offset=250'

headers = {
    'accept': 'application/json',
    'Authorization': 'Bearer ' + TOKEN
}

# Both names are reset up front so a re-run never sees stale values from a
# previous execution; json_data stays None unless the request succeeds.
response = None
json_data = None

try:
    # requests waits forever by default; the timeout keeps a stalled
    # connection from hanging the notebook.
    response = requests.get(url, headers=headers, timeout=30)
except requests.RequestException as err:
    # Network-level failure (DNS, connection refused, timeout, ...):
    # report it the same way as a bad HTTP status instead of crashing.
    print('Failed to retrieve data from the API.')
    print(f'Request error: {err}')
else:
    if response.status_code == 200:
        json_data = response.json()
    else:
        # Handle the API request error here
        print('Failed to retrieve data from the API.')
        print(f'HTTP status: {response.status_code}')

In [83]:
if json_data:
    # Flatten all business records in a single call instead of concatenating
    # one-row frames in a loop (which copies the whole frame on every
    # iteration and shadowed the builtin `dict`). Nested objects become
    # dotted columns such as 'location.zip_code'.
    # The result has a 0..n-1 index rather than every row labelled 0; the
    # next cell resets the index after exploding, so it sees the same rows.
    businesses = pd.json_normalize(json_data['businesses'])


In [84]:
# TODO: first join the data from all neighborhood samples so the steps below can run on the combined set (keep unique businesses, count how many overlap)
# Neighborhood name -> list of ZIP code strings used to decide which
# businesses belong to that neighborhood.
# NOTE(review): 'beacon Hill' is the only key containing a capital letter;
# confirm that lookups use this exact spelling before normalising it.
ZIPS = {
    'east boston': ['02128'],
    'charlestown': ['02129'],
    'allston': ['02163', '02134'],
    'brighton': ['02135'],
    'beacon Hill': ['02108'],
    'back bay': ['02116', '02199'],
    'chinatown-leather district': ['02111'],
    'dorchester': ['02121', '02122', '02124', '02125'],
    'fenway-kenmore': ['02115', '02215'],
    'hyde park': ['02136'],
    'jamaica-plain': ['02130'],
    'mattapan': ['02126'],
    'mission hill': ['02120'],
    'north end': ['02113', '02109'],
    'roslindale': ['02131'],
    'roxbury': ['02119'],
    'south boston': ['02127', '02210'],
    'south end': ['02118'],
    'west end': ['02114'],
    'west roxbury': ['02132'],
    'wharf district': ['02110'],
    'downtown': ['02203', '02201'],
}

# One row per (business, category): every entry of the 'categories' list moves
# to its own row, with the remaining columns repeated.
bus_exploded = businesses.explode('categories').reset_index(drop=True)

# Expand each category entry into columns and one-hot encode them. The steps
# below rely on the resulting column names starting with 'alias_' / 'title'.
bus_encoded = pd.get_dummies(bus_exploded['categories'].apply(pd.Series))

# Both frames carry the same fresh 0..n-1 index, so the rows line up.
bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

# Only the alias indicators are aggregated below; drop every column whose
# name starts with 'title'.
bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('title')]

# Select the indicator columns by their 'alias_' prefix instead of by position
# (previously columns[24:]), so the step keeps working when the API returns
# more or fewer top-level fields than expected.
alias_prefix = 'alias_'
alias_cols = [col for col in bus_final.columns if str(col).startswith(alias_prefix)]

# Sum each indicator within a business id, so every row of a business carries
# the totals for all of its categories; the new column is named after the
# category alias with the prefix removed.
grouped = bus_final.groupby('id')

for col in alias_cols:
    bus_final[col[len(alias_prefix):]] = grouped[col].transform('sum')

# Remove the per-row indicators. The match is on 'alias' without underscore,
# so any other column whose name begins with 'alias' is removed as well.
bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('alias')]
bus_final = bus_final.drop(columns=['categories', 'location.state', 'location.country', 'location.display_address'])

# Every row of a business now holds identical totals; keep the first per id.
bus_final = bus_final.drop_duplicates(subset=['id'])




# One row per (business, transaction type): every entry of the 'transactions'
# list moves to its own row.
bus_exploded = bus_final.explode('transactions').reset_index(drop=True)

# Expand and one-hot encode the transaction values. The expanded frame has a
# single column labelled 0, which is why the indicator columns handled below
# are named '0_<transaction>'.
bus_encoded = pd.get_dummies(bus_exploded['transactions'].apply(pd.Series))

# Both frames carry the same fresh 0..n-1 index, so the rows line up.
bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

# Select the indicator columns by their '0_' prefix instead of taking the last
# three columns: when fewer than three transaction types occur in the data,
# columns[-3:] would also pick up (and mangle the name of) a category column.
transaction_prefix = '0_'
transaction_cols = [col for col in bus_final.columns if str(col).startswith(transaction_prefix)]

# Sum each indicator within a business id; the new column is named after the
# transaction type with the prefix removed.
grouped = bus_final.groupby('id')

for col in transaction_cols:
    bus_final[col[len(transaction_prefix):]] = grouped[col].transform('sum')

# Remove the raw list column and the per-row indicators, then collapse back
# to one row per business.
bus_final = bus_final.drop(columns=['transactions'])
bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('0_')]
bus_final = bus_final.drop_duplicates(subset=['id'])



In [85]:
import numpy as np

# feature engineering

# Keep only the rows whose zip code belongs to the target neighborhood, and
# turn empty strings into NaN so the presence checks below see them as missing.
in_target_zips = bus_final['location.zip_code'].isin(ZIPS['allston'])
bus_copy = bus_final[in_target_zips].replace('', np.nan)

# Presence flags: 1 when the source column holds a value, 0 when it is NaN.
# TODO: double check that the NaN values aren't being counted here
presence_flags = (
    ('has_image', 'image_url'),           # has an image URL
    ('has_phone', 'phone'),               # has a phone number
    ('has_st_add', 'location.address1'),  # has a street address
    ('has_price', 'price'),               # has a price level
)
for flag_col, source_col in presence_flags:
    bus_copy[flag_col] = np.where(bus_copy[source_col].isna(), 0, 1)

# One indicator column per distinct price value found in the data.
price_dummies = pd.get_dummies(bus_copy['price'])
bus_copy = pd.concat([bus_copy, price_dummies], axis=1)

# Balanced Rating Score (BRS): a weighted blend of the normalized rating and
# the normalized review count.
weight_average_rating = 0.7
weight_review_count = 0.3

# Normalize the rating by dividing by 5.
bus_copy['norm_rating'] = bus_copy['rating'] / 5

# Normalize the review count: log10 first, then min-max scaling.
# The tiny offset keeps log10 finite when a business has 0 reviews.
# NOTE(review): such a business maps to log10(1e-9) = -9 and becomes the
# minimum, which squeezes every other business close to 1 after scaling;
# consider log10(1 + count) if that is not intended.
log_count = np.log10(bus_copy['review_count'] + 0.000000001)
centered_count = log_count - log_count.min()
count_span = centered_count.max()  # equals max - min of log_count

if count_span > 0:
    bus_copy['norm_count'] = centered_count / count_span
else:
    # All review counts are equal, so plain min-max scaling would compute
    # 0/0 and turn every score into NaN. The centered values are already 0
    # in that case. With no rows or only NaN counts the result is unchanged.
    bus_copy['norm_count'] = centered_count

bus_copy['brs'] = (weight_average_rating * bus_copy['norm_rating']) + (weight_review_count * bus_copy['norm_count'])

# Remove the raw and intermediate columns that the engineered features
# replace, then give the dotted columns that remain shorter names.
dropped_cols = [
    'image_url', 'is_closed', 'url',
    'norm_count', 'norm_rating',
    'price', 'phone', 'display_phone',
    'location.address1', 'location.address2', 'location.address3',
]
renamed_cols = {
    'coordinates.latitude': 'latitude',
    'coordinates.longitude': 'longitude',
    'location.city': 'city',
    'location.zip_code': 'zip_code',
}
bus_copy = bus_copy.drop(columns=dropped_cols).rename(columns=renamed_cols)





# TODO: after all of this is done, check all columns for NaN or None values or other disallowed values
# TODO: verify the location is within Boston
# TODO: inspect the distances
# TODO: add a neighborhood column
# TODO: check that the zip code belongs to Boston
# NOTE: exclude 02467, as it is Chestnut Hill and would interfere with the Brighton search; this leaves out two restaurants that are technically in Boston
# NOTE: exclude 02151 (Beachmont)

# Per column: True when at least one NaN is left in it.
na_values = bus_copy.isna().any(axis=0)

# TODO: make the neighborhood columns inside a loop, one per iteration.
# For now every remaining row is flagged as belonging to Allston.
bus_copy = bus_copy.assign(allston=1)


