In [53]:
import pandas as pd

# Path to your Yelp dataset file
file_path = '../data/yelp_dataset/yelp_academic_dataset_business.json'

# Load the JSON Lines file into a Pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Display the first few rows
print(df.head())

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [54]:
# Check the column names
print(df.columns)

# Get basic information about the dataset
print(df.info())

# View summary statistics for numerical columns
print(df.describe())


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         12

In [60]:
# Filter businesses located in NYC
nj_businesses = df[df['state'] == 'NJ']

# Display the first few rows
print(nj_businesses.head())


                business_id                                  name  \
42   lwItZ1Ck3KtpCgG4CPFmpQ         Stomel Elliot Attorney-At-Law   
73   8rb-3VYXE37IZix4yOdskw           Sharky's Sports Bar & Grill   
83   NZ_bFJma7brQUfln5h1UAg                   Super Sushi Kyo Hin   
88   LhpPSrulqVeTyJeK2xydvQ                 Fresh Fruits & Salads   
111  H0NEOp4e3Zu598u6kO3y0g  America's Best Contacts & Eyeglasses   

                               address          city state postal_code  \
42                  532 Rte 70 W, Fl 2   Cherry Hill    NJ       08002   
73              820 N Black Horse Pike  Williamstown    NJ       08094   
83                2501 Mt Holly Rd 245    Burlington    NJ       08016   
88                        114 N 3rd St        Camden    NJ       08102   
111  79 Route 73, Ste 6, Coopers Plaza      Voorhees    NJ       08043   

      latitude  longitude  stars  review_count  is_open  \
42   39.915478 -75.016973    5.0             5        1   
73   39.696801 -74.999

In [61]:
# Select only relevant columns
columns_of_interest = ['name', 'address', 'city', 'state', 'postal_code', 'stars', 'categories']
nj_subset = nj_businesses[columns_of_interest]

# Display the subset
print(nj_subset.head())


                                     name                            address  \
42          Stomel Elliot Attorney-At-Law                 532 Rte 70 W, Fl 2   
73            Sharky's Sports Bar & Grill             820 N Black Horse Pike   
83                    Super Sushi Kyo Hin               2501 Mt Holly Rd 245   
88                  Fresh Fruits & Salads                       114 N 3rd St   
111  America's Best Contacts & Eyeglasses  79 Route 73, Ste 6, Coopers Plaza   

             city state postal_code  stars  \
42    Cherry Hill    NJ       08002    5.0   
73   Williamstown    NJ       08094    2.5   
83     Burlington    NJ       08016    3.5   
88         Camden    NJ       08102    4.5   
111      Voorhees    NJ       08043    3.5   

                                            categories  
42   DUI Law, Professional Services, Lawyers, Crimi...  
73   American (Traditional), Bars, Nightlife, Sport...  
83     Restaurants, Japanese, Sushi Bars, Asian Fusion  
88   Juice Bar

In [62]:
# Count the number of businesses in each category
print(nj_subset['categories'].value_counts().head(10))

# Average star rating by category
print(nj_subset.groupby('categories')['stars'].mean().sort_values(ascending=False).head(10))


categories
Restaurants, Pizza             96
Beauty & Spas, Nail Salons     85
Pizza, Restaurants             79
Nail Salons, Beauty & Spas     77
Restaurants, Chinese           72
Chinese, Restaurants           65
Italian, Restaurants           36
Restaurants, Italian           33
Italian, Restaurants, Pizza    31
Food, Beer, Wine & Spirits     28
Name: count, dtype: int64
categories
Home Services, Masonry/Concrete, Stucco Services, Demolition Services             5.0
Bars, Food, Nightlife, Beer, Wine & Spirits                                       5.0
Barbers, Hair Stylists, Men's Hair Salons, Beauty & Spas, Hair Salons             5.0
Pool Halls, Shopping, Sporting Goods, Nightlife                                   5.0
Popcorn Shops, Food, Specialty Food                                               5.0
Chocolatiers & Shops, Food, Candy Stores, Specialty Food                          5.0
Health & Medical, Chiropractors, Medical Centers, Massage Therapy, Acupuncture    5.0
Asian Fusi

In [80]:
# Filter for businesses in the state of Illinois (IL)
new_jersey_restaurants = df[(df['state'] == 'NJ') & (df['categories'].str.contains('Restaurant', na=False))]

# Display the first few rows
print(new_jersey_restaurants.head())


                business_id                         name  \
73   8rb-3VYXE37IZix4yOdskw  Sharky's Sports Bar & Grill   
83   NZ_bFJma7brQUfln5h1UAg          Super Sushi Kyo Hin   
88   LhpPSrulqVeTyJeK2xydvQ        Fresh Fruits & Salads   
129  Cdytv_YhJO0W0Ab8t2xLtg                 Nipper's Pub   
167  Ms5xG8i4p80KSMcF3tt4ug     Sal DeForte's Ristorante   

                    address          city state postal_code   latitude  \
73   820 N Black Horse Pike  Williamstown    NJ       08094  39.696801   
83     2501 Mt Holly Rd 245    Burlington    NJ       08016  40.041629   
88             114 N 3rd St        Camden    NJ       08102  39.946690   
129          1205 Delsea Dr      Woodbury    NJ       08096  39.844468   
167        1400 Parkway Ave         Ewing    NJ       08628  40.266888   

     longitude  stars  review_count  is_open  \
73  -74.999821    2.5            29        1   
83  -74.825821    3.5             6        0   
88  -75.123327    4.5             6        1   
12

In [74]:
# Number of restaurants in New 
print(f"Number of restaurants in Illinois: {len(illinois_restaurants)}")

# List unique cities in Illinois
print(illinois_restaurants['city'].unique())

# Average star rating for Illinois restaurants
print(illinois_restaurants['stars'].mean())


Number of restaurants in Illinois: 983
['Alton' 'Fairview Heights' 'Edwardsville' 'Lebanon' 'Godfrey' 'Mascoutah'
 'Glen Carbon' 'Granite City' 'Belleville' 'Waterloo' 'Cahokia' 'Freeburg'
 'Swansea' 'Collinsville' 'Maryville' 'Columbia' 'Shiloh' 'Troy'
 'O Fallon' 'Wood River' 'Millstadt' 'Dupo' "O'Fallon" 'Caseyville'
 "O'fallon" 'Pontoon Beach' 'East Alton' 'Bellville' 'Rosewood Heights'
 'Bethalto' 'Fairview Hts.' 'East Saint Louis' 'East St. Louis'
 'Scott Air Force Base' 'Smithton' 'Fairview Hts' 'Fairmont City'
 "O' Fallon" 'Scott AFB' 'Scott Afb' 'Cottage Hills' 'East St Louis'
 'Sauget' 'Washington Park' 'Madison' 'St. Louis' 'Foster Pond']
3.304170905391658


In [79]:
states = set()
for state in df['state']:
    states.add(state)
pp(states)

{'AB',
 'AZ',
 'CA',
 'CO',
 'DE',
 'FL',
 'HI',
 'ID',
 'IL',
 'IN',
 'LA',
 'MA',
 'MI',
 'MO',
 'MT',
 'NC',
 'NJ',
 'NV',
 'PA',
 'SD',
 'TN',
 'TX',
 'UT',
 'VI',
 'VT',
 'WA',
 'XMS'}
