In [2]:
import pandas as pd
import ast as ast
import numpy as np
from typing import *
import regex as re
import matplotlib.pyplot as plt

from task_2_helpers import *
from collections import Counter

# 2.0 Data Read

We started our analysis by reading the business, reviews and users datasets. We then selected the businesses located in Philadelphia using the Postal Code column. According to our research, Philadelphia postal codes fall in the range [19019, 19255].


In [3]:
### Data Read
# (label, path) pairs in load order; each CSV is read with its first row as the header.
_sources = (
    ("Business", 'data/ATML2024_businesses.csv'),
    ("Reviews", 'data/ATML2024_reviews_train.csv'),
    ("Users", 'data/ATML2024_users.csv'),
)

_frames = []
for _label, _path in _sources:
    _frame = pd.read_csv(_path, header=0)
    # Report shape and column names right after each load, as a quick sanity check.
    print(f"{_label} data: {_frame.shape}, columns: {list(_frame.columns)}")
    _frames.append(_frame)

# Unpack in the same order as _sources so the names later cells rely on are defined.
df_business, df_Train_reviews, df_users = _frames


Business data: (138210, 11), columns: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'attributes', 'categories', 'hours']
Reviews data: (1050000, 9), columns: ['id', 'user_id', 'business_id', 'rating', 'useful', 'funny', 'cool', 'text', 'date']
Users data: (747468, 19), columns: ['user_id', 'name', 'user_since', 'useful', 'funny', 'cool', 'premium_account', 'friends', 'fans', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer']


The following data cleaning methods were applied to the business data columns:
- **postal_code:** converted to numeric; values in [19019, 19255] were selected for Philadelphia.
- **attributes:** cast to a dictionary; attributes whose value is True are extracted and added to the list of attributes. We also extracted the nested attributes in a similar way and added them to the list in the form main attribute-nested attribute.
- **hours:** cast to a dictionary; converted to a list of lists of working hours per day of the week. Each row has a list of size 7, one entry per day of the week, and each entry is a list of size 2 representing the working hours of that day as [start h, end h]. If a business doesn't operate on a specific day, we put [-1,-1] as its working hours.
- **categories:** converted from a comma-separated string to a list of categories.

After cleaning and selection, we merge the business data with the ratings data. We keep only the columns that are of interest for future use. Since we are only interested in the restaurant businesses in Philadelphia, we further trim our dataset to contain only the restaurant businesses. We do that by selecting the rows whose category list includes a restaurant category. After many observations, we concluded that this selection criterion is enough to identify the vast majority of restaurant businesses in Philadelphia.

In [6]:
# Run the project helper on the raw business table; the result carries the derived
# columns selected below (postal_code_int, attributes_list, hours_list, categories_list).
df_business_ph = select_preprocess_Phili_business(df_business)
print("Philadelphia businesses data", df_business_ph.shape)

# Business-side columns carried into the merged table.
business_cols = ['business_id', 'postal_code_int', 'attributes_list', 'hours_list',
                 'categories_list', 'latitude', 'longitude']

# pd.merge defaults to an inner join, so only reviews whose business_id appears in
# df_business_ph survive (and only businesses that have at least one review).
df_data = pd.merge(df_business_ph[business_cols], df_Train_reviews, on='business_id')

print("Philadelphia businesses with ratings data", df_data.shape)

# Case-insensitive substring match on each category. 'restaurant' is a substring of
# 'restaurants', so a single check covers both spellings; the previous extra
# 'restaurants' test could never change the outcome and only re-scanned the list.
is_restaurant = df_data['categories_list'].apply(
    lambda categories: any('restaurant' in str(category).lower() for category in categories)
)
df_data_res = df_data[is_restaurant]

print("Philadelphia restaurant businesses with ratings data", df_data_res.shape)

# Persist the filtered table, then preview the first rows as the cell output.
df_data_res.to_parquet('data/ATML2024_Task2_PhiliBussRatings.parquet')
df_data_res.head(5)

Philadelphia businesses data (19707, 15)
Philadelphia businesses with ratings data (176549, 15)
Philadelphia restaurant businesses with ratings data (123414, 15)


Unnamed: 0,business_id,postal_code_int,attributes_list,hours_list,categories_list,latitude,longitude,id,user_id,rating,useful,funny,cool,text,date
0,wm9eoqjytVbC7dQcM4WSTM,19107.0,"[alcohol, bikeparking, businessacceptscreditca...","[[7.0, 20.0], [7.0, 20.0], [7.0, 20.0], [7.0, ...","[bakeries, bubble tea, coffee & tea, food, res...",39.955505,-75.155564,100621,Ax5a6F5AaOoPi-1MwOeuaM,4,1,0,2,So I don't know how the other review was doubl...,2017-08-24 04:23:03
1,wm9eoqjytVbC7dQcM4WSTM,19107.0,"[alcohol, bikeparking, businessacceptscreditca...","[[7.0, 20.0], [7.0, 20.0], [7.0, 20.0], [7.0, ...","[bakeries, bubble tea, coffee & tea, food, res...",39.955505,-75.155564,174063,QZ7nC0AgxLr9B5E0IYguyf,5,0,0,0,I love this place.. I love their chunky fried...,2020-05-25 16:27:36
2,wm9eoqjytVbC7dQcM4WSTM,19107.0,"[alcohol, bikeparking, businessacceptscreditca...","[[7.0, 20.0], [7.0, 20.0], [7.0, 20.0], [7.0, ...","[bakeries, bubble tea, coffee & tea, food, res...",39.955505,-75.155564,234948,gnTp6PHiW40FPYsSwXt37u,5,0,0,1,I ordered so many cake from here for our famil...,2019-07-25 13:09:14
3,wm9eoqjytVbC7dQcM4WSTM,19107.0,"[alcohol, bikeparking, businessacceptscreditca...","[[7.0, 20.0], [7.0, 20.0], [7.0, 20.0], [7.0, ...","[bakeries, bubble tea, coffee & tea, food, res...",39.955505,-75.155564,285755,wKRfxlkzQXfEsHSDYhPVVq,4,0,0,0,Ordered a birthday cake here for the first tim...,2019-03-12 17:04:09
4,wm9eoqjytVbC7dQcM4WSTM,19107.0,"[alcohol, bikeparking, businessacceptscreditca...","[[7.0, 20.0], [7.0, 20.0], [7.0, 20.0], [7.0, ...","[bakeries, bubble tea, coffee & tea, food, res...",39.955505,-75.155564,330446,QikAd9bkgEVafOR5e5EeqW,5,1,0,0,This is my favorite bakery in Chinatown! It's ...,2017-09-13 00:38:08
