In [None]:
import requests
import pandas as pd
import numpy as np
import time
from config import TOKEN
    
# Accumulators populated by later cells: the flattened business table and the
# raw per-business records appended while paging through the API.
businesses = pd.DataFrame()
final_json_data = list()

# Neighborhood name -> ZIP codes that are searched for that neighborhood.
ZIPS = {
    'East Boston': ['02128'],
    'Charlestown': ['02129'],
    'Allston': ['02163', '02134'],
    'Brighton': ['02135'],
    'Beacon Hill': ['02108'],
    'Back Bay': ['02116', '02199'],
    'Chinatown': ['02111'],
    'Dorchester': ['02121', '02122', '02124', '02125'],
    'Fenway': ['02115', '02215'],
    'Hyde Park': ['02136'],
    'Jamaica Plain': ['02130'],
    'Mattapan': ['02126'],
    'Mission Hill': ['02120'],
    'North End': ['02113', '02109'],
    'Roslindale': ['02131'],
    'Roxbury': ['02119'],
    'South Boston': ['02127', '02210'],
    'South End': ['02118'],
    'West End': ['02114'],
    'West Roxbury': ['02132'],
    'Wharf District': ['02110'],
    'Downtown': ['02203', '02201'],
}

In [None]:
# Page through the Yelp business-search endpoint once per ZIP code and keep
# only the businesses whose reported ZIP code matches the one being searched.
#
# Fixes relative to the hand-built URL version:
#   * the query is passed via `params`, so the location is encoded correctly
#     (the old string ended in '%2C%' + code, which turned the first two ZIP
#     digits into a percent-escape instead of sending ', <zip>');
#   * a failed request no longer crashes on `None['businesses']` - the ZIP
#     code is abandoned and the loop moves on;
#   * the accumulator is reset so re-running the cell does not duplicate rows.
SEARCH_URL = 'https://api.yelp.com/v3/businesses/search'
PAGE_SIZE = 50      # results requested per call
MAX_OFFSET = 1000   # paging stops once the offset reaches this value

headers = {
    'accept': 'application/json',
    'Authorization': 'Bearer ' + TOKEN
}

final_json_data = []
calls = 0
for neighborhood, zip_codes in ZIPS.items():
    for code in zip_codes:

        offset = 0
        add_count = 1

        # keep paging while the previous page contributed at least one
        # business in this ZIP code and the offset cap has not been reached
        while add_count > 0 and offset < MAX_OFFSET:
            add_count = 0

            # grab data from the api
            params = {
                'location': 'Boston, MA, ' + code,
                'term': 'restaurants',
                'sort_by': 'distance',
                'limit': PAGE_SIZE,
                'offset': offset,
            }
            try:
                response = requests.get(SEARCH_URL, headers=headers, params=params, timeout=30)
            except requests.RequestException as exc:
                print('Failed to retrieve data from the API.', exc)
                break
            calls += 1

            # Handle the API request error here: give up on this ZIP code
            if response.status_code != 200:
                print('Failed to retrieve data from the API.')
                time.sleep(1)
                break

            # only keep businesses whose ZIP code is the one being searched
            for bus in response.json().get('businesses', []):
                if bus['location']['zip_code'] == code:
                    bus['neighborhood'] = neighborhood
                    final_json_data.append(bus)
                    add_count += 1

            # increase the offset
            offset += PAGE_SIZE
            if offset >= MAX_OFFSET:
                print('EXCEEDED OFFSET LIMIT!')

            # do not call from the api too fast
            time.sleep(1)

In [None]:
if final_json_data:
    # Flatten every collected record in one pass. Nested keys become dotted
    # column names (e.g. 'location.zip_code'). Building the frame once avoids
    # the quadratic cost of concatenating one row at a time, no longer shadows
    # the builtin `dict`, and makes the cell idempotent: re-running it rebuilds
    # `businesses` instead of appending the same rows again.
    # The result has a fresh 0..n-1 index; later cells reset the index anyway.
    businesses = pd.json_normalize(final_json_data)


In [None]:
# Turn the list-valued 'categories' and 'transactions' columns into one count
# column per category alias / transaction type, keeping one row per business.
#
# The encoded columns are selected by NAME (taken from the get_dummies output)
# rather than by position (`columns[25:]`, `columns[-3:]`), so the cell keeps
# working if the payload gains or loses fields or if the number of distinct
# transaction types is not exactly three. Output column names and their order
# are unchanged.
if not businesses.empty:
    # yelp likes to return duplicates
    businesses = businesses.drop_duplicates(subset=['id'])

    # ---- categories ------------------------------------------------------
    # make a new row for each dictionary in the categories col
    bus_exploded = businesses.explode('categories').reset_index(drop=True)

    # expand each category dict into columns and one-hot encode them; this
    # yields 'alias_<x>' and 'title_<x>' indicator columns
    bus_encoded = pd.get_dummies(bus_exploded['categories'].apply(pd.Series))

    # concat the new columns to the exploded dataframe so that the rows match
    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

    # change all column names to string
    bus_final.columns = bus_final.columns.map(str)

    # drop the titles
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('title')]

    # the encoded columns that survive the title drop, in their original order
    category_cols = [c for c in bus_encoded.columns.map(str) if not c.startswith('title')]

    # sum every encoded column within each business id and store the result
    # under the name with the 6-character 'alias_' prefix removed.
    # NOTE(review): a business with no categories appears to add a column
    # named '0' here, which this slicing turns into a column named ''; the
    # next cell drops both, so the naming is deliberately left as it was.
    grouped = bus_final.groupby('id')
    for col in category_cols:
        bus_final[col[6:]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- transactions ----------------------------------------------------
    # make a new row for each entry in the transactions col
    bus_exploded = bus_final.explode('transactions').reset_index(drop=True)

    # one-hot encode the transaction types; columns come out as '0_<type>'
    bus_encoded = pd.get_dummies(bus_exploded['transactions'].apply(pd.Series))

    # concat the new columns to the exploded dataframe so that the rows match
    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

    transaction_cols = [c for c in bus_encoded.columns if str(c).startswith('0_')]

    # per-business sum, stored under the name without the '0_' prefix
    grouped = bus_final.groupby('id')
    for col in transaction_cols:
        bus_final[col[2:]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- clean up --------------------------------------------------------
    # remove the intermediate indicator columns and the raw list columns
    # (the 'alias' prefix filter also removes the business 'alias' column)
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('alias')]
    bus_final = bus_final.drop(columns=['categories', 'location.state', 'location.country', 'location.display_address'])
    bus_final = bus_final.drop(columns=['transactions'])
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('0_')]
else:
    # keep the name defined so the following cell's emptiness check does not
    # fail with a NameError when nothing was fetched
    bus_final = pd.DataFrame()

In [None]:
# Feature engineering: one-hot encode rating / neighborhood / price, derive
# has_* presence flags, compute a Balanced Rating Score (brs), then drop and
# rename columns.
if not bus_final.empty:
    bus_copy = bus_final

    # encode the ratings
    bus_copy = pd.concat([bus_copy, pd.get_dummies(bus_copy['rating'])], axis=1)
    # encode the neighborhoods
    bus_copy = pd.concat([bus_copy, pd.get_dummies(bus_copy['neighborhood'])], axis=1)
    # encode the price options to different cols
    bus_copy = pd.concat([bus_copy, pd.get_dummies(bus_copy['price'])], axis=1)

    # the rating dummies have numeric labels; make every column name a string
    bus_copy.columns = bus_copy.columns.map(str)

    # change all empty values to nan
    bus_copy = bus_copy.replace('', np.nan)

    # has image from image_url
    bus_copy['has_image'] = np.where(bus_copy['image_url'].isna(), 0, 1)

    # has_phone from phone
    bus_copy['has_phone'] = np.where(bus_copy['phone'].isna(), 0, 1)

    # has_st_add from location.address1
    bus_copy['has_st_add'] = np.where(bus_copy['location.address1'].isna(), 0, 1)

    # has_price from price
    bus_copy['has_price'] = np.where(bus_copy['price'].isna(), 0, 1)

    # Calculate a Balanced Rating Score (BRS)
    weight_average_rating = 0.7
    weight_review_count = 0.3

    # Normalize Average Rating
    bus_copy['norm_rating'] = bus_copy['rating'] / 5

    # Normalize Review Count using logarithm and min-max scaling
    # NOTE(review): the tiny epsilon maps a review_count of 0 to about -9,
    # which stretches the min-max range; confirm this is intended.
    bus_copy['norm_count'] = np.log10(bus_copy['review_count'] + 0.000000001)
    bus_copy['norm_count'] = (bus_copy['norm_count'] - bus_copy['norm_count'].min()) / (bus_copy['norm_count'].max() - bus_copy['norm_count'].min())

    bus_copy['brs'] = (weight_average_rating * bus_copy['norm_rating']) + (weight_review_count * bus_copy['norm_count'])

    # cols to drop and rename
    # '0' and '' are leftovers of the category encoding and are not always
    # present, so they are dropped only when they exist instead of raising
    # a KeyError; every other column is still required.
    leftover_cols = [c for c in ['0', ''] if c in bus_copy.columns]
    bus_copy = bus_copy.drop(columns=leftover_cols + ['image_url', 'is_closed', 'url', 'norm_count', 'norm_rating', 'price', 'phone', 'display_phone', 'location.address1', 'location.address2', 'location.address3'])
    bus_copy = bus_copy.rename(columns={'coordinates.latitude': 'latitude', 'coordinates.longitude': 'longitude', 'location.city': 'city', 'location.zip_code': 'zip_code'})

    # TODO list:
    # after all of this is done check all columns for nan or None values or other not allowed values
    # verify the location is within Boston
    # see distances
    # add a neighborhood column
    # check that zipcode belongs to boston
    # exclude 02467 as it is chestnut hill and this would interfere with the brighton search, this leaves out two restaurants technically in boston
    # exclude 02151 beachmont

    # per-column flag: True when the column contains at least one missing value
    na_values = (bus_copy.isna().any())

In [None]:
# EDA
'''
- 
- 
- Visualize the sampling per neighborhood
- Distribution of the restaurant tags
- Distribution of the has images, phone, and address
- Distribution of the prices
- Histogram of the calculated score
'''
import matplotlib.pyplot as plt
%matplotlib inline


# Distribution of review count
plt.hist(bus_copy['review_count'], color='blue', edgecolor='black')
plt.xlabel('Review Count')
plt.ylabel('No. of Restaurants')
plt.title('Log Distribution of Review Counts')
plt.yscale('log')
plt.show()

# Distribution of rating
rating_counts = bus_copy.loc[:, '0.0':'5.0'].sum()
rating_counts.plot(kind='bar', color='skyblue')
plt.title('Rating Counts')
plt.xlabel('Ratings')
plt.ylabel('Count')
plt.xticks(rotation=90)  # Rotate x-axis labels if needed
plt.show()

# Count of each neighborhood
neighborhood_counts = bus_copy.loc[:, 'Allston':'West Roxbury'].sum()
neighborhood_counts.plot(kind='bar', color='skyblue')
plt.title('Neighborhood Counts')
plt.xlabel('Neighborhoods')
plt.ylabel('Count')
plt.xticks(rotation=90)  # Rotate x-axis labels if needed
plt.show()







In [None]:
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
%matplotlib inline

bos_map = gpd.read_file('City_of_Boston_Boundary/City_of_Boston_Boundary.shp')

geometry = [Point(xy) for xy in zip(bus_copy['longitude'], bus_copy['latitude'])]
crs = {'init':'epsg:4326'}

geo_df = gpd.GeoDataFrame(bus_copy, # specify our data
                          crs=crs, # specify our coordinate reference system
                          geometry=geometry) # specify the geometry list we created

fig, ax = plt.subplots(figsize=(15,15))
bos_map.plot(ax=ax, alpha=0.4, color='grey')

for neighborhood in bus_copy.columns[202:222]:
    geo_df[geo_df[neighborhood] == 1].plot(ax=ax,
                                            markersize=2, 
                                            color='blue', 
                                            marker='o', 
                                            label=neighborhood)


plt.legend(prop={'size':15})


