In [None]:
# Execute the data-loading script inside this kernel's namespace.
# NOTE(review): the next cell reads `final_json_data` and `businesses` without
# defining them, so they are presumably created by this script -- confirm
# against data_load.py.
%run data_load.py

In [None]:
import pandas as pd
import numpy as np
from config import TOKEN

# Flatten every raw JSON record into a one-row frame and append all of them to
# `businesses` (which is expected to exist already, defined outside this cell).
if final_json_data:
    # The loop variable is `record`, not `dict`, so the builtin `dict` is not
    # shadowed for the rest of the notebook session.
    flattened_rows = [pd.json_normalize(record) for record in final_json_data]

    # Concatenate once: calling pd.concat inside the loop re-copies the growing
    # frame on every iteration (quadratic in the number of records).
    businesses = pd.concat([businesses, *flattened_rows])
# Build `bus_final`: one row per business with
#   * one count column per category alias and per transaction type,
#   * one indicator column per neighborhood,
#   * an ordinal price, has_image / has_phone / has_st_add flags,
#   * a Balanced Rating Score (`brs`).
# Nothing is produced when `businesses` is empty.
if not businesses.empty:
    # yelp likes to return duplicates
    businesses = businesses.drop_duplicates(subset=['id'])

    # ---- categories -------------------------------------------------------
    # one row per entry of the `categories` list
    bus_exploded = businesses.explode('categories').reset_index(drop=True)

    # expand each category entry into columns and one-hot encode them
    bus_encoded = pd.get_dummies(bus_exploded['categories'].apply(pd.Series))

    # side-by-side concat; both frames share the reset index so rows line up
    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)

    # change all column names to string
    bus_final.columns = bus_final.columns.map(str)

    # drop the one-hot encoded titles, only the aliases are kept
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('title')]

    # Collapse back to one row per business: sum every alias indicator within
    # the business id and store it under the bare alias name.
    # The indicator columns are selected by their 'alias_' prefix instead of a
    # fixed positional offset, so the step keeps working if the number of
    # leading business columns changes.
    grouped = bus_final.groupby('id')
    alias_prefix = 'alias_'
    alias_cols = [col for col in bus_final.columns if col.startswith(alias_prefix)]
    for col in alias_cols:
        bus_final[col[len(alias_prefix):]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- transactions -----------------------------------------------------
    # one row per entry of the `transactions` list
    bus_exploded = bus_final.explode('transactions').reset_index(drop=True)

    # `.apply(pd.Series)` yields a single column named 0, so the dummies are
    # named '0_<transaction>'
    bus_encoded = pd.get_dummies(bus_exploded['transactions'].apply(pd.Series))

    bus_final = pd.concat([bus_exploded, bus_encoded], axis=1)
    bus_final.columns = bus_final.columns.map(str)

    # Sum each transaction indicator per business. Every '0_' dummy is used
    # (rather than "the last three columns"), so any number of distinct
    # transaction types is handled.
    grouped = bus_final.groupby('id')
    transaction_prefix = '0_'
    transaction_cols = [col for col in bus_final.columns if col.startswith(transaction_prefix)]
    for col in transaction_cols:
        bus_final[col[len(transaction_prefix):]] = grouped[col].transform('sum')
    bus_final = bus_final.drop_duplicates(subset=['id'])

    # ---- clean up ---------------------------------------------------------
    # removes the 'alias_*' indicators (and the business `alias` column)
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('alias')]
    bus_final = bus_final.drop(columns=['categories', 'location.state', 'location.country', 'location.display_address'])
    bus_final = bus_final.drop(columns=['transactions'])
    bus_final = bus_final.loc[:, ~bus_final.columns.str.startswith('0_')]

    # ---- feature engineering ----------------------------------------------

    # encode the neighborhoods
    bus_final = pd.concat([bus_final, pd.get_dummies(bus_final['neighborhood'])], axis=1)

    bus_final.columns = bus_final.columns.map(str)

    # change all empty values to nan
    bus_final = bus_final.replace('', np.nan)

    # Encode the price options on an ordinal scale; 0 means "no price listed".
    # The result is assigned back to the column: calling replace/fillna with
    # inplace=True on `bus_final['price']` is chained assignment, which warns
    # in pandas 2.x and has no effect under Copy-on-Write.
    # Any value outside the four known symbols is also mapped to 0.
    price_scale = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
    bus_final['price'] = bus_final['price'].map(price_scale).fillna(0).astype(int)

    # has image from image_url
    bus_final['has_image'] = np.where(bus_final['image_url'].isna(), 0, 1)

    # has_phone from phone
    bus_final['has_phone'] = np.where(bus_final['phone'].isna(), 0, 1)

    # has_st_add from location.address1
    bus_final['has_st_add'] = np.where(bus_final['location.address1'].isna(), 0, 1)

    # need to remove the businesses that have a review count of zero as these
    # businesses will impact our analysis of what makes a good restaurant.
    # `.copy()` makes the filtered frame independent, so the column
    # assignments below do not raise SettingWithCopyWarning.
    bus_final = bus_final[bus_final['review_count'] > 0].copy()

    # Calculate a Balanced Rating Score (BRS)
    weight_average_rating = 0.7
    weight_review_count = 0.3

    # Normalize Average Rating (divided by 5, assumed to be the top of the
    # rating scale)
    bus_final['norm_rating'] = bus_final['rating'] / 5

    # Normalize Review Count using logarithm and min-max scaling
    log_count = np.log10(bus_final['review_count'] + 0.000000001)
    log_count_range = log_count.max() - log_count.min()
    if log_count_range > 0:
        bus_final['norm_count'] = (log_count - log_count.min()) / log_count_range
    else:
        # every business has the same review count (or none are left):
        # avoid 0/0 -> NaN and let the rating alone drive the score
        bus_final['norm_count'] = 0.0

    bus_final['brs'] = (weight_average_rating * bus_final['norm_rating']) + (weight_review_count * bus_final['norm_count'])

    # cols to drop and rename
    bus_final = bus_final.drop(columns=['image_url', 'is_closed', 'url', 'norm_count', 'norm_rating', 'phone', 'display_phone', 'location.address1', 'location.address2', 'location.address3'])
    bus_final = bus_final.rename(columns={'coordinates.latitude': 'latitude', 'coordinates.longitude': 'longitude', 'location.city': 'city', 'location.zip_code': 'zip_code'})

    # check the integrity of the data: boolean per column, True if it still
    # contains missing values
    na_values = (bus_final.isna().any())

In [None]:
import geopandas as gpd
import geoplot
import geoplot.crs as gcrs
from shapely.geometry import Point

# Neighborhood polygons used as the outline and as the clipping mask
bos_map = gpd.read_file('Boston_Neighborhoods/Boston_Neighborhoods.shp')

# One point per business; Point takes (x, y), i.e. (longitude, latitude)
geometry = [Point(xy) for xy in zip(bus_final['longitude'], bus_final['latitude'])]

# Authority string instead of the deprecated {'init': 'epsg:4326'} PROJ syntax
crs = 'EPSG:4326'

geo_df = gpd.GeoDataFrame(bus_final,          # specify our data
                          crs=crs,            # specify our coordinate reference system
                          geometry=geometry)  # specify the geometry list we created

# Neighborhood outlines with a kernel density estimate of the businesses on top.
# NOTE(review): assumes the shapefile coordinates are compatible with the
# longitude/latitude points above -- confirm the CRS of `bos_map`.
ax = geoplot.polyplot(bos_map, projection=gcrs.AlbersEqualArea(), zorder=2, figsize=(15, 15))
geoplot.kdeplot(geo_df, cmap='Reds', thresh=0, fill=True, clip=bos_map, ax=ax)

In [None]:
# Interactive map of the businesses, colored by neighborhood
geo_df.explore(
    tiles="CartoDB positron",  # background tile layer
    column="neighborhood",     # value that drives the point color
    cmap="Set1",               # matplotlib colormap used for the coloring
    tooltip=["name", 'neighborhood', 'id', 'distance', 'zip_code'],  # fields shown on hover
    popup=True,                # on click, show every value of the row
)

In [None]:
# EDA
'''
Planned exploratory plots:
- Visualize the sampling per neighborhood
- Distribution of the restaurant tags
- Distribution of the has_image, has_phone, and has_st_add flags
- Distribution of the prices
- Histogram of the calculated score (BRS)
'''
import matplotlib.pyplot as plt
%matplotlib inline

# TODO: make these graphs more presentable

def _label_and_show(title, xlabel, ylabel, rotation=None):
    """Title and label the current matplotlib axes, then render the figure.

    rotation: angle for the x tick labels; None leaves the ticks untouched.
    """
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if rotation is not None:
        plt.xticks(rotation=rotation)
    plt.show()


# Distribution of balanced rating score (dashed line = median)
plt.hist(bus_final['brs'], color='blue', edgecolor='black')
plt.axvline(bus_final['brs'].median(), color='k', linestyle='dashed', linewidth=1)
_label_and_show('Distribution of BRS', 'BRS', 'No. of Restaurants')

# Distribution of review count, log-scaled y axis (dashed line = median)
plt.hist(bus_final['review_count'], color='blue', edgecolor='black')
plt.axvline(bus_final['review_count'].median(), color='k', linestyle='dashed', linewidth=1)
plt.yscale('log')
_label_and_show('Log Distribution of Review Counts', 'Review Count', 'No. of Restaurants')

# Distribution of rating
rating_counts = bus_final.groupby(['rating'])['rating'].count()
rating_counts.plot(kind='bar', color='skyblue')
_label_and_show('Rating Counts', 'Ratings', 'Count', rotation=90)

# Count of each neighborhood.
# Counted from the `neighborhood` column itself: a label slice such as
# 'Allston':'West Roxbury' over the indicator columns silently leaves out any
# neighborhood column positioned after 'West Roxbury' (e.g. 'Wharf District').
neighborhood_counts = bus_final.groupby(['neighborhood'])['neighborhood'].count()
neighborhood_counts.plot(kind='bar', color='skyblue')
_label_and_show('Neighborhood Counts', 'Neighborhoods', 'Count', rotation=90)

# Counts of prices
price_counts = bus_final.groupby(['price'])['price'].count()
price_counts.plot(kind='bar', color='skyblue')
_label_and_show('Price Counts', 'Prices', 'Count', rotation=90)

# Counts of tags; the red line marks the total number of restaurants
tag_counts = bus_final.loc[:, ['delivery', 'pickup', 'restaurant_reservation', 'has_image', 'has_phone', 'has_st_add']].sum()
tag_counts.plot(kind='bar', color='skyblue')
plt.axhline(y=bus_final.shape[0], color='r', linestyle='-')
_label_and_show('Tag Counts', 'Tags', 'Count', rotation=50)

# cuisine occurrence, 30 most frequent.
# NOTE(review): the label slice assumes the cuisine columns are contiguous and
# that 'acaibowls' / 'wraps' are the first / last of them -- confirm.
cuisine_counts = (bus_final.loc[:, 'acaibowls':'wraps'].sum().to_frame()).sort_values([0], ascending=False)
cuisine_counts[:30].plot(kind='bar', color='skyblue')
_label_and_show('Cuisine Counts', 'Cuisines', 'Count', rotation=90)

# median brs of neighborhoods; the red line is the overall median
bus_final.groupby(["neighborhood"])["brs"].median().to_frame().plot(kind='bar', color='skyblue')
plt.axhline(y=bus_final['brs'].median(), color='r', linestyle='-')
_label_and_show('Median BRS By Neighborhood', 'Neighborhood', 'Median', rotation=90)

# median brs of price; the red line is the overall median
bus_final.groupby(["price"])["brs"].median().to_frame().plot(kind='bar', color='skyblue')
plt.axhline(y=bus_final['brs'].median(), color='r', linestyle='-')
_label_and_show('Median BRS By Price', 'Price', 'Median', rotation=90)


In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import tree


column_names = bus_final.columns.tolist()

# Target: the balanced rating score.
y = bus_final['brs']

# Features: everything except identifiers, location fields, and the columns
# the target is computed from (review_count, rating) or is itself (brs).
X = bus_final.drop(columns=['id', 'name', 'review_count', 'rating', 'distance', 'neighborhood', 'latitude', 'longitude', 'city', 'zip_code', 'brs'])

# Splitting data into train and test sets (75 % / 25 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# random_state is fixed as well: the tree permutes the candidate features at
# every split, so without a seed equally good splits can be resolved
# differently between runs and the fitted tree (and the feature ranking
# derived from it) is not reproducible.
regr = DecisionTreeRegressor(min_samples_split=25, max_depth=10, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [None]:
# Score the held-out predictions: mean squared error and R^2
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print('R Squared Score is:', r_squared)

In [None]:
# Plots

# Plain-text dump of the fitted tree's rules
text_representation = tree.export_text(regr)

# Predicted vs. actual score on the test set
plt.scatter(y_test, y_pred, c='blue')
plt.xlabel('Actual BRS')
plt.ylabel('Predicted BRS')
plt.title('Predicted vs. Actual BRS')

# Feature importances of the fitted tree, most important first
feature_importances = pd.DataFrame(
    regr.feature_importances_, index=X_train.columns, columns=["Importance"]
).sort_values(by='Importance', ascending=False)

# Draw the tree and save it to disk.
# `class_names` is not passed: it only applies to classifiers, and a bare
# string is rejected by parameter validation in recent scikit-learn releases.
# The feature names are passed as a plain list for the same reason.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(regr,
                   feature_names=list(regr.feature_names_in_),
                   filled=True)
# NOTE(review): the file name is spelled "decistion"; kept unchanged so
# anything that already refers to this file keeps working.
fig.savefig("decistion_tree.png", dpi=300)


In [None]:
import dtreeviz 

# Wrap the fitted regressor and its training data for dtreeviz
viz_model = dtreeviz.model(
    regr,
    X_train,
    y_train,
    target_name='brs',
    feature_names=regr.feature_names_in_,
)

# Render the tree at 80 % scale
viz_model.view(scale=0.8)



In [None]:
# Looking at our most important features: one box plot of the score (`y`) per
# feature value.

def _boxplot_against_score(feature, tick_labels, xlabel=None):
    """Box plot of `y` grouped by X[feature], drawn in a new figure.

    tick_labels: labels placed at x positions 0..len(tick_labels)-1.
    xlabel: overrides the default x axis label when given.
    """
    plt.figure()
    sns.boxplot(x=X[feature], y=y)
    plt.xticks(range(len(tick_labels)), tick_labels)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.show()


# Price is ordinal: 0 = no price listed, 1-4 = number of dollar signs.
# Raw strings keep the backslash that stops matplotlib from reading '$' as a
# math delimiter, without relying on an invalid '\$' escape sequence.
# NOTE(review): the labels are positional, so they only line up if all five
# price levels are present in X['price'] -- confirm.
custom_labels = ['No Price', r'\$', r'\$\$', r'\$\$\$', r'\$\$\$\$']
_boxplot_against_score('price', custom_labels, xlabel='')

# Label for others are yes and no
# NOTE(review): assumes each of these columns only takes two values -- confirm.
custom_labels = ['No', 'Yes']
binary_features = [
    'has_image', 'hotdogs', 'pizza', 'delivery', 'chicken_wings',
    'Dorchester', 'East Boston', 'Mattapan', 'chinese', 'West End',
    'Fenway', 'waffles', 'has_phone', 'kosher', 'South Boston',
    'mediterranean', 'Roxbury', 'Beacon Hill', 'Roslindale',
    'cocktailbars', 'Wharf District',
]
for feature in binary_features:
    # The columns depend on the data that was loaded; skip a feature that is
    # absent instead of failing halfway through the cell.
    if feature not in X.columns:
        print(f"Skipping '{feature}': not a column of X")
        continue
    _boxplot_against_score(feature, custom_labels)



In [None]:
# Keep only the most important features (as ranked by `feature_importances`)
# for random forest regression
N_TOP_FEATURES = 22
X = X[feature_importances.index[:N_TOP_FEATURES].tolist()]

# Run a random forest, gradient boost, and xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Splitting data into train and test sets (same proportions and seed as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the predictions, are reproducible between runs.
regr = RandomForestRegressor(min_samples_split=25, max_depth=10, random_state=42)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
