In [None]:
import pandas as pd
import numpy as np
from config import TOKEN
%run data_load.py

# Flatten every raw record in `final_json_data` and append the resulting rows
# to the `businesses` frame (both names come from outside this cell).
if final_json_data:
    # One single-row frame per record. The loop variable is deliberately not
    # called `dict`, which would shadow the builtin for the rest of the
    # notebook session.
    flattened_rows = [pd.json_normalize(record) for record in final_json_data]

    # Concatenate once instead of once per record: calling pd.concat inside
    # the loop re-copies the whole accumulated frame on every iteration.
    # Row order and the order in which new columns first appear are the same
    # as with the incremental version.
    businesses = pd.concat([businesses, *flattened_rows])
# Build `bus_final`: one row per business id, with one count column per
# category alias and one per transaction type.
# NOTE(review): when `businesses` is empty this branch is skipped and
# `bus_final` is not assigned here, although the later
# `if not bus_final.empty` check reads it -- confirm it is defined elsewhere.
if not businesses.empty:
    # yelp likes to return duplicates
    businesses = businesses.drop_duplicates(subset=['id'])

    # ---- categories -------------------------------------------------------
    # make a new row for each dictionary in the categories col
    bus_exploded = businesses.explode('categories').reset_index(drop=True)

    # expand each category dict into columns, then one-hot encode them; this
    # yields 'alias_<x>' and 'title_<x>' indicator columns
    bus_encoded = pd.get_dummies(bus_exploded['categories'].apply(pd.Series))

    # concat the new columns to the exploded dataframe so that the rows match
    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

    # change all column names to string
    bus_final.columns = bus_final.columns.map(str)

    # drop the titles
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('title')]

    # need to make the rows unique and get the sum of alias cols by
    # partitioning by business id
    grouped = bus_final.groupby('id')

    # Pick the indicator columns by their 'alias_' prefix rather than by
    # position: a positional slice silently breaks as soon as the number of
    # flattened input columns changes, and would also pick up the stray '0'
    # column that appears when a business has an empty category list.
    # The business-level 'alias' column has no underscore and is not matched.
    alias_prefix = 'alias_'
    alias_cols = [col for col in bus_final.columns if col.startswith(alias_prefix)]

    # per-business total of every alias indicator, stored under the bare
    # alias name (prefix stripped)
    for col in alias_cols:
        bus_final[col[len(alias_prefix):]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- transactions -----------------------------------------------------
    # make a new row for each entry in the transaction col
    bus_exploded = bus_final.explode('transactions').reset_index(drop=True)

    # the expanded values land in a column named 0, so the encoded indicator
    # columns are named '0_<transaction>'
    bus_encoded = pd.get_dummies(bus_exploded['transactions'].apply(pd.Series))

    # concat the new columns to the exploded dataframe so that the rows match
    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

    # keep column labels uniformly strings, as in the categories step, so the
    # prefix tests below are well defined
    bus_final.columns = bus_final.columns.map(str)

    grouped = bus_final.groupby('id')

    # Select by prefix instead of "the last three columns": that assumption
    # only holds when exactly three distinct transaction types occur in the
    # data; otherwise unrelated columns get summed and renamed.
    transaction_prefix = '0_'
    transaction_cols = [
        col for col in bus_final.columns if col.startswith(transaction_prefix)
    ]

    # per-business total of every transaction indicator, stored under the
    # bare transaction name (prefix stripped)
    for col in transaction_cols:
        bus_final[col[len(transaction_prefix):]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- clean up ---------------------------------------------------------
    # remove the intermediate indicator columns and the raw nested columns
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('alias')]
    bus_final = bus_final.drop(
        columns=[
            'categories',
            'location.state',
            'location.country',
            'location.display_address',
            'transactions',
        ]
    )
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('0_')]

# Feature engineering: derive `bus_copy` from `bus_final` with one-hot
# encoded rating / neighborhood / price columns, presence flags, a Balanced
# Rating Score ('brs') and a binary 'high_brs' label.
if not bus_final.empty:
    # encode the ratings, the neighborhoods and the price options into
    # separate indicator columns; pd.concat builds a new frame, so
    # `bus_final` itself is left untouched
    bus_copy = pd.concat(
        [
            bus_final,
            pd.get_dummies(bus_final['rating']),
            pd.get_dummies(bus_final['neighborhood']),
            pd.get_dummies(bus_final['price']),
        ],
        axis=1,
    )

    # the rating indicators are labelled with numbers; make every label a str
    bus_copy.columns = bus_copy.columns.map(str)

    # change all empty values to nan
    bus_copy = bus_copy.replace('', np.nan)

    # presence flags (1 = value present, 0 = missing)
    bus_copy['has_image'] = bus_copy['image_url'].notna().astype(int)
    bus_copy['has_phone'] = bus_copy['phone'].notna().astype(int)
    bus_copy['has_st_add'] = bus_copy['location.address1'].notna().astype(int)
    bus_copy['has_price'] = bus_copy['price'].notna().astype(int)

    # need to remove the businesses that have a review count of zero as these
    # businesses will impact our analysis of what makes a good restaurant.
    # Take an explicit copy: the columns added below would otherwise be
    # assigned onto a filtered slice (SettingWithCopyWarning).
    bus_copy = bus_copy[bus_copy['review_count'] > 0].copy()

    # Calculate a Balanced Rating Score (BRS)
    weight_average_rating = 0.7
    weight_review_count = 0.3

    # Normalize Average Rating by dividing by 5
    bus_copy['norm_rating'] = bus_copy['rating'] / 5

    # Normalize Review Count using logarithm and min-max scaling
    log_count = np.log10(bus_copy['review_count'] + 0.000000001)
    log_span = log_count.max() - log_count.min()
    if log_span > 0:
        bus_copy['norm_count'] = (log_count - log_count.min()) / log_span
    else:
        # every remaining business has the same review count (or the frame is
        # empty): min-max scaling would divide by zero and turn 'brs' into
        # NaN, so let the count contribute nothing instead
        bus_copy['norm_count'] = 0.0

    bus_copy['brs'] = (
        weight_average_rating * bus_copy['norm_rating']
        + weight_review_count * bus_copy['norm_count']
    )

    # cols to drop and rename
    bus_copy = bus_copy.drop(
        columns=[
            'image_url', 'is_closed', 'url', 'norm_count', 'norm_rating',
            'price', 'phone', 'display_phone', 'location.address1',
            'location.address2', 'location.address3',
        ]
    )
    bus_copy = bus_copy.rename(
        columns={
            'coordinates.latitude': 'latitude',
            'coordinates.longitude': 'longitude',
            'location.city': 'city',
            'location.zip_code': 'zip_code',
        }
    )

    # would like to make a high and low rating label to simplify the
    # classification: 1 for scores at or above the 75th percentile, else 0
    bus_copy['high_brs'] = np.where(
        bus_copy['brs'] >= bus_copy['brs'].quantile(0.75), 1, 0
    )

    # rename the price columns to avoid python text errors
    bus_copy = bus_copy.rename(
        columns={
            '$': 'affordable_eats',
            '$$': 'mid-range_dining',
            '$$$': 'upscale_dining',
            '$$$$': 'gourmet_experience',
        }
    )

    # TODO: after all of this is done check all columns for nan or None
    #       values or other not allowed values
    # TODO: verify the location is within Boston
    # TODO: see distances
    # TODO: add a neighborhood column
    # TODO: check that zipcode belongs to boston
    # TODO: exclude 02467 as it is chestnut hill and this would interfere
    #       with the brighton search, this leaves out two restaurants
    #       technically in boston
    # TODO: exclude 02151 beachmont

    # per-column flag: True where the column still contains a missing value
    na_values = bus_copy.isna().any()