In [0]:
df_business_data = spark.read.json('/yelp/business.bz2')

print ("record count:", df_business_data.count() )
df_business_data.show(100)
df_business_data.printSchema()

df_business_data.createOrReplaceTempView("business_data")

record count: 150346
+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|            city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|   Santa Barbara|                null|      0|   34.4266787|   -119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{null, null, null...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|          Afft

In [0]:
df_business = spark.sql("""
SELECT business_id, name, categories, state, city, review_count
FROM business_data
""")
print ("record count:", df_business.count() )
df_business.show(truncate=22)
df_business.createOrReplaceTempView("business")

record count: 150346
+----------------------+----------------------+----------------------+-----+--------------+------------+
|           business_id|                  name|            categories|state|          city|review_count|
+----------------------+----------------------+----------------------+-----+--------------+------------+
|Pns2l4eNsfO8kk83dixA6A|Abby Rappoport, LAC...|Doctors, Traditiona...|   CA| Santa Barbara|           7|
|mpf3x-BjTdTEA3yCZrAYPw|         The UPS Store|Shipping Centers, L...|   MO|        Affton|          15|
|tUFrWirKiKi_TAnsVWINQQ|                Target|Department Stores, ...|   AZ|        Tucson|          22|
|MTSW4McQd7CbVtyjqoe9mw|    St Honore Pastries|Restaurants, Food, ...|   PA|  Philadelphia|          80|
|mWMc6_wTdE0EUBKIGXDVfA|Perkiomen Valley Br...|Brewpubs, Breweries...|   PA|    Green Lane|          13|
|CF33F8-E6oudUQ46HnavjQ|        Sonic Drive-In|Burgers, Fast Food,...|   TN|  Ashland City|           6|
|n_0UpQx1hsNbnPUSlodU8w|       Fam

In [0]:
df_metro_areas = spark.read.option("header","true").\
                            option("inferSchema","true").\
csv("/yelp/metro_areas.csv").\
withColumnRenamed("Business Id","business_id").\
withColumnRenamed("Metro Area","metro_area")

print("Number of records:", df_metro_areas.count() )
df_metro_areas.show(truncate=False)
df_metro_areas.printSchema()
df_metro_areas.createOrReplaceTempView("metro_areas")

Number of records: 150346
+----------------------+------------+-----+
|business_id           |metro_area  |State|
+----------------------+------------+-----+
|meTyTAo18ulbEu5appPu3w|Indianapolis|IN   |
|9Mym57cM01EBl2x3h-Ei5Q|Indianapolis|IN   |
|OsacYNC8NUZfGyuYVgyKng|Indianapolis|IN   |
|w7ZyGU1CIIlsb9_xJwbmXg|Indianapolis|IN   |
|MRHTYIU6-jiziTPm1jFP5w|Indianapolis|IN   |
|Lpzofxs_5GdRKIfvFcjTjg|Indianapolis|IN   |
|bBPrDU0lA1vgiQOlDhzuWw|Indianapolis|IN   |
|Jtd9i4DlCO6JNcYzNiBL9g|Indianapolis|IN   |
|z2dibYvPufc8UAGWuVZxJQ|Indianapolis|IN   |
|h0vWIv3xMQQ4L_fYbWAK7Q|Indianapolis|IN   |
|j4S77MajsNN7HV-Ei0YGMg|Indianapolis|IN   |
|cIQv2GqyFLjmGqd6eyvxRQ|Indianapolis|IN   |
|_QAMST-NrQobXduilWEqSw|Indianapolis|IN   |
|3KF-DmXMqnttG5jwe7gXqg|Indianapolis|IN   |
|lCUQ9mpHAHXms53OsgSb0A|Indianapolis|IN   |
|ulWjI6RDPgSusXZEQlZrGA|Indianapolis|IN   |
|jDL99IpiZWPS8ubQQBjtgg|Indianapolis|IN   |
|iAEEhbfvqlPaITFn1jJy1w|Indianapolis|IN   |
|F79qEV3ulKtZrBeENRi0PA|Indianapolis|IN   |
|asG2h

In [0]:
df_metro_business = spark.sql("""
SELECT B.*, metro_area
FROM business AS B INNER JOIN metro_areas AS M ON B.business_id = M.business_id
""")
print("Number of records:", df_metro_business.count() )
df_metro_business.show()
df_metro_business.createOrReplaceTempView("metro_business")

Number of records: 150346
+--------------------+--------------------+--------------------+-----+------------+------------+------------+
|         business_id|                name|          categories|state|        city|review_count|  metro_area|
+--------------------+--------------------+--------------------+-----+------------+------------+------------+
|meTyTAo18ulbEu5ap...|Meineke Car Care ...|Oil Change Statio...|   IN|Indianapolis|           8|Indianapolis|
|9Mym57cM01EBl2x3h...|Elite Beverages No 2|Food, Beer, Wine ...|   IN|Indianapolis|           8|Indianapolis|
|OsacYNC8NUZfGyuYV...|Salamone Bros Gou...|Restaurants, Sand...|   IN|Indianapolis|           8|Indianapolis|
|w7ZyGU1CIIlsb9_xJ...|         Spa & Nails|Day Spas, Beauty ...|   IN|Indianapolis|           8|Indianapolis|
|MRHTYIU6-jiziTPm1...|      Whitson Vision|Shopping, Doctors...|   IN|Indianapolis|           8|Indianapolis|
|Lpzofxs_5GdRKIfvF...|           Hangar 58|Chicken Wings, Am...|   IN|Indianapolis|           

In [0]:
df_categories = spark.read.option("multiLine",True).json("/yelp/categories.json")
print( "number of categories:", df_categories.count() )
df_categories.show()
df_categories.printSchema()
df_categories.createOrReplaceTempView("categories")

number of categories: 1565
+------------------+--------------------+-----------------+---------------+-------------------+
|             alias|   country_blacklist|country_whitelist|        parents|              title|
+------------------+--------------------+-----------------+---------------+-------------------+
|        3dprinting|                null|             null|[localservices]|        3D Printing|
|         abruzzese|                null|             [IT]|      [italian]|          Abruzzese|
|      absinthebars|                null|             [CZ]|         [bars]|      Absinthe Bars|
|         acaibowls|[IT, AR, TR, CL, ...|             null|         [food]|         Acai Bowls|
|       accessories|                null|             null|      [fashion]|        Accessories|
|       accountants|                null|             null| [professional]|        Accountants|
|     acnetreatment|                null|             null|    [beautysvc]|     Acne Treatment|
|            

In [0]:
df_top_categories = spark.sql("""
SELECT title
FROM categories
WHERE size(parents) = 0
""")
print(f"Number of top-level categories: {df_top_categories.count()}")
df_top_categories.show(30, truncate=False)
df_top_categories.createOrReplaceTempView("top_categories")

Number of top-level categories: 22
+----------------------------+
|title                       |
+----------------------------+
|Active Life                 |
|Arts & Entertainment        |
|Automotive                  |
|Beauty & Spas               |
|Bicycles                    |
|Education                   |
|Event Planning & Services   |
|Financial Services          |
|Food                        |
|Health & Medical            |
|Home Services               |
|Hotels & Travel             |
|Local Flavor                |
|Local Services              |
|Mass Media                  |
|Nightlife                   |
|Pets                        |
|Professional Services       |
|Public Services & Government|
|Religious Organizations     |
|Restaurants                 |
|Shopping                    |
+----------------------------+



In [0]:
df_realestate = spark.sql("""
SELECT * 
FROM categories
WHERE title = 'Real Estate'
""")
df_realestate.show()

+----------+-----------------+-----------------+--------------+-----------+
|     alias|country_blacklist|country_whitelist|       parents|      title|
+----------+-----------------+-----------------+--------------+-----------+
|realestate|             null|             null|[homeservices]|Real Estate|
+----------+-----------------+-----------------+--------------+-----------+



In [0]:
df_business_categories = spark.sql("""
SELECT business_id, name, state, city, metro_area, review_count, 
       SPLIT(categories, '\\\s*,\\\s*') AS categories
FROM metro_business
""")
df_business_categories.show()
df_business_categories.select("business_id", "name","categories").show(truncate=False)
df_business_categories.printSchema()
df_business_categories.createOrReplaceTempView("business_categories")

+--------------------+--------------------+-----+------------+------------+------------+--------------------+
|         business_id|                name|state|        city|  metro_area|review_count|          categories|
+--------------------+--------------------+-----+------------+------------+------------+--------------------+
|meTyTAo18ulbEu5ap...|Meineke Car Care ...|   IN|Indianapolis|Indianapolis|           8|[Oil Change Stati...|
|9Mym57cM01EBl2x3h...|Elite Beverages No 2|   IN|Indianapolis|Indianapolis|           8|[Food, Beer, Wine...|
|OsacYNC8NUZfGyuYV...|Salamone Bros Gou...|   IN|Indianapolis|Indianapolis|           8|[Restaurants, San...|
|w7ZyGU1CIIlsb9_xJ...|         Spa & Nails|   IN|Indianapolis|Indianapolis|           8|[Day Spas, Beauty...|
|MRHTYIU6-jiziTPm1...|      Whitson Vision|   IN|Indianapolis|Indianapolis|           8|[Shopping, Doctor...|
|Lpzofxs_5GdRKIfvF...|           Hangar 58|   IN|Indianapolis|Indianapolis|           8|[Chicken Wings, A...|
|bBPrDU0lA

In [0]:
df_business_categories.select("business_id","categories").show(truncate=False)


+----------------------+-----------------------------------------------------------------------------------------------------------------------+
|business_id           |categories                                                                                                             |
+----------------------+-----------------------------------------------------------------------------------------------------------------------+
|meTyTAo18ulbEu5appPu3w|[Oil Change Stations, Tires, Automotive, Auto Repair]                                                                  |
|9Mym57cM01EBl2x3h-Ei5Q|[Food, Beer, Wine & Spirits]                                                                                           |
|OsacYNC8NUZfGyuYVgyKng|[Restaurants, Sandwiches, Barbeque, Southern]                                                                          |
|w7ZyGU1CIIlsb9_xJwbmXg|[Day Spas, Beauty & Spas]                                                                                 

In [0]:
# Complete the FROM (JOIN),  SELECT, and ORDER BY clauses
df_business_top_categories = spark.sql("""
SELECT business_id, name, state, city, metro_area, review_count, COALESCE(title, 'Unknown') AS category
FROM business_categories AS B LEFT OUTER JOIN top_categories AS T ON ARRAY_CONTAINS(categories, title)
ORDER BY SIZE(categories) DESC
""")
print(f"number of businesses:{df_business_top_categories.count()}")
df_business_top_categories.show(100, truncate=30)
df_business_top_categories.createOrReplaceTempView("business_top_categories")


number of businesses:221928
+----------------------+------------------------------+-----+-------------+-------------+------------+-------------------------+
|           business_id|                          name|state|         city|   metro_area|review_count|                 category|
+----------------------+------------------------------+-----+-------------+-------------+------------+-------------------------+
|ytynqOUb3hjKeJfRj5Tshw|       Reading Terminal Market|   PA| Philadelphia| Philadelphia|        5721|                     Food|
|ytynqOUb3hjKeJfRj5Tshw|       Reading Terminal Market|   PA| Philadelphia| Philadelphia|        5721|     Arts & Entertainment|
|ytynqOUb3hjKeJfRj5Tshw|       Reading Terminal Market|   PA| Philadelphia| Philadelphia|        5721|                 Shopping|
|ytynqOUb3hjKeJfRj5Tshw|       Reading Terminal Market|   PA| Philadelphia| Philadelphia|        5721|             Local Flavor|
|ytynqOUb3hjKeJfRj5Tshw|       Reading Terminal Market|   PA| Philade

In [0]:
print(f"Total Businesses:{df_business_top_categories.select('business_id').distinct().count()}")
  
print("Businesses without categories:",df_business_top_categories.filter("category = 'Unknown'").count() )

Total Businesses:150346
Businesses without categories: 103


In [0]:
df_top_category_count = spark.sql("""
SELECT business_id, COUNT(category) AS category_count
FROM business_top_categories
GROUP BY business_id
""")
df_top_category_count.show()
df_top_category_count.createOrReplaceTempView("top_category_count")

+--------------------+--------------+
|         business_id|category_count|
+--------------------+--------------+
|NEb8zLhYvUAmu_f-H...|             1|
|CGhEwiSyXyjSbMhXM...|             4|
|OTfoTKlO8ZlLifPh9...|             1|
|otEpgG6DdLiKtKtQy...|             1|
|ppTEFITIx0DIipyup...|             2|
|VCZOEzPJezhb5vEad...|             1|
|-ajaASaDA_77I6pK3...|             1|
|jH85ddA7oelmS8P2Z...|             1|
|6drlHqomJ6UYFex8n...|             1|
|VV4l-yVqI2RDR0PYt...|             1|
|VzL8jj8IVCGAQUb9f...|             1|
|ueJFblv3esW2xjqO7...|             3|
|L5rH_ypwqJcBByVac...|             1|
|4HMXL85u_wX0WEHuc...|             1|
|7hRaOnXRRS8q620F6...|             2|
|oFYqL4RgThCupYhum...|             3|
|1ZDFYvYjBqmz4z8u_...|             2|
|SzIxeQzrATYbL8IFj...|             1|
|DeDszlU-Gg-Hodu_H...|             1|
|GdsjkFaPVyrMiMmGA...|             1|
+--------------------+--------------+
only showing top 20 rows



In [0]:
# Add a column with the count of the number of top-level categories a business is in
# REMOVE WINDOW FUNCTION FOR category_count AND ORDER BY CLAUSE
df_business_top_category_count = spark.sql("""
SELECT B.*, category_count
FROM business_top_categories AS B JOIN top_category_count AS T ON B.business_id = T.business_id
ORDER BY category_count DESC, business_id
""")
df_business_top_category_count.show(100,truncate=30)

+----------------------+------------------------------+-----+-------------+-------------+------------+----------------------------+--------------+
|           business_id|                          name|state|         city|   metro_area|review_count|                    category|category_count|
+----------------------+------------------------------+-----+-------------+-------------+------------+----------------------------+--------------+
|TfRyyMfqDxtC1KA1jLkmvg|              NEST Center City|   PA| Philadelphia| Philadelphia|          63|              Local Services|             9|
|TfRyyMfqDxtC1KA1jLkmvg|              NEST Center City|   PA| Philadelphia| Philadelphia|          63|                   Nightlife|             9|
|TfRyyMfqDxtC1KA1jLkmvg|              NEST Center City|   PA| Philadelphia| Philadelphia|          63|        Arts & Entertainment|             9|
|TfRyyMfqDxtC1KA1jLkmvg|              NEST Center City|   PA| Philadelphia| Philadelphia|          63|   Event Plannin

#### Step 11b: Write the table

In [0]:
process_or_create_table("business_category_table", "df_business_top_category_count", summary=True, delete=True)

Saving the df_business_top_category_count DataFrame as table: business_category_table
+------------+
|record_count|
+------------+
|      221928|
+------------+

+----------------------+----------------------+-----+----------------+------------+------------+-------------+--------------+
|           business_id|                  name|state|            city|  metro_area|review_count|     category|category_count|
+----------------------+----------------------+-----+----------------+------------+------------+-------------+--------------+
|kJb4o8aZ_93iBMq-Hery1w|              BRAKEmax|   AZ|          Tucson|      Tucson|           6|   Automotive|             1|
|kJbAV1ZN65XedfED0HQVsw|Guerreros Mexican R...|   IN|    Indianapolis|Indianapolis|          49|  Restaurants|             1|
|kJg3OA_-aXgLnIxizh76Hg|Kat Cares Dog Walki...|   PA|    Philadelphia|Philadelphia|          13|         Pets|             1|
|kJguN_71VpAYPII4_kbaAA|   El Fogon Colombiano|   FL|           Tampa|       Tampa

In [0]:
df_top_category_count = spark.sql("""
SELECT category, COUNT(business_id) AS business_count
FROM business_category_table
GROUP BY category
ORDER BY business_count DESC
""")
df_top_category_count.show(100)
df_top_category_count.createOrReplaceTempView("top_category_count")

+--------------------+--------------+
|            category|business_count|
+--------------------+--------------+
|         Restaurants|         52268|
|                Food|         27781|
|            Shopping|         24395|
|       Home Services|         14356|
|       Beauty & Spas|         14292|
|           Nightlife|         12281|
|    Health & Medical|         11890|
|      Local Services|         11198|
|          Automotive|         10773|
|Event Planning & ...|          9895|
|         Active Life|          7687|
|     Hotels & Travel|          5857|
|Arts & Entertainment|          5434|
|                Pets|          3758|
|Professional Serv...|          3270|
|           Education|          1936|
|        Local Flavor|          1604|
|  Financial Services|          1487|
|Public Services &...|          1216|
|Religious Organiz...|           286|
|          Mass Media|           156|
|             Unknown|           103|
|            Bicycles|             5|
+-----------

In [0]:
display(df_top_category_count)

category,business_count
Restaurants,52268
Food,27781
Shopping,24395
Home Services,14356
Beauty & Spas,14292
Nightlife,12281
Health & Medical,11890
Local Services,11198
Automotive,10773
Event Planning & Services,9895


Databricks visualization. Run in Databricks to view.

In [0]:
df_multiple_top_count = spark.sql("""
SELECT category_count, COUNT(business_id) AS business_count 
FROM business_category_table 
GROUP BY category_count 
ORDER BY business_count DESC
""")
df_multiple_top_count.show()
df_multiple_top_count.createOrReplaceTempView("multiple_top_count")

+--------------+--------------+
|category_count|business_count|
+--------------+--------------+
|             1|         94503|
|             2|         86294|
|             3|         30816|
|             4|          7696|
|             5|          1995|
|             6|           528|
|             7|            63|
|             8|            24|
|             9|             9|
+--------------+--------------+



In [0]:
display(df_multiple_top_count)

category_count,business_count
1,94503
2,86294
3,30816
4,7696
5,1995
6,528
7,63
8,24
9,9


Databricks visualization. Run in Databricks to view.