In [1]:
from IPython.display import display, HTML

display(HTML("<h2 style='color: #2E86C1;'>STEP 1 — Load the datasets</h2>"))

In [2]:
import pandas as pd
import numpy as np
import json

In [4]:
Business = pd.read_json("C:\ScourgifyDataProject\Yelp JSON\yelp_dataset\yelp_academic_dataset_business.json", lines=True)

In [5]:
print(Business.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB
None


This DataFrame contains 150k business records with 14 columns.It includes basic info (name, address), location (lat/long), ratings (stars, review_count), status (is_open), and extra details like attributes, categories, and hours.

In [6]:
print(Business.dtypes)

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


In [7]:
display(HTML("<h2 style='color: #2E86C1;'>STEP 2 — Keep only useful columns</h2>"))

In [8]:
columns_to_drop = [
    'postal_code',      
    'latitude',         
    'longitude',        
    'hours',
    'address',
    'city',
    'attributes',
    'is_open',
    'review_count',
    'stars',
    'name',
    'state',  
]

In [9]:
Business = Business.drop(columns=columns_to_drop)

In [10]:
Business

Unnamed: 0,business_id,categories
0,Pns2l4eNsfO8kk83dixA6A,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,"Brewpubs, Breweries, Food"
...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,"Fitness/Exercise Equipment, Eyewear & Optician..."


In [11]:
display(HTML("<h2 style='color: #2E86C1;'>STEP 3 — Clean missing values</h2>"))

In [12]:
print("\nRows with ANY missing values:", df.isnull().any(axis=1).sum())


Rows with ANY missing values: 32728


In [13]:
Business.dropna(subset=['categories'], inplace=True)

In [14]:
print(f"categories missing after: {Business['categories'].isnull().sum()}")

categories missing after: 0


In [15]:
display(HTML("<h2 style='color: #2E86C1;'>STEP 4 — Remove duplicated rows</h2>"))

In [16]:
Business.drop_duplicates(subset=['business_id'], inplace=True)

In [22]:
duplicate_count = Business['business_id'].duplicated().sum()
print(f"Duplicate business_ids found: {duplicate_count}")

Duplicate business_ids found: 0


In [23]:
display(HTML("<h2 style='color: #2E86C1;'>STEP 5 — Filter only tourism-related businesses</h2>"))

In [24]:
tourism_keywords = [
    "Hotel", "Resort", "Attraction", "Tours", "Travel", "Museum",
    "Park", "Beach", "Landmark", "Tourist", "Vacation", "Restaurant"
]

In [25]:
Business['categories'] = Business['categories'].fillna("").astype(str)

tourism_business = Business[
    Business['categories'].apply(
        lambda x: any(keyword in x for keyword in tourism_keywords)
    )
]


In [26]:
Business

Unnamed: 0,business_id,categories
0,Pns2l4eNsfO8kk83dixA6A,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,"Brewpubs, Breweries, Food"
...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,"Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,"Fitness/Exercise Equipment, Eyewear & Optician..."


In [29]:
Business.to_csv("Business.csv", index=False)