In [0]:
%run "./Rebuild Project Tables"

In [0]:
table_list = ['business_category_table', 'ordered_reviews_table', 'user_gender_table']
process_tables(table_list)

building 3 tables provided as list
building business_category_table from existing table files
+------------+
|record_count|
+------------+
|      221928|
+------------+

+----------------------+----------------------+-----+----------------+------------+------------+-------------+--------------+
|           business_id|                  name|state|            city|  metro_area|review_count|     category|category_count|
+----------------------+----------------------+-----+----------------+------------+------------+-------------+--------------+
|kJb4o8aZ_93iBMq-Hery1w|              BRAKEmax|   AZ|          Tucson|      Tucson|           6|   Automotive|             1|
|kJbAV1ZN65XedfED0HQVsw|Guerreros Mexican R...|   IN|    Indianapolis|Indianapolis|          49|  Restaurants|             1|
|kJg3OA_-aXgLnIxizh76Hg|Kat Cares Dog Walki...|   PA|    Philadelphia|Philadelphia|          13|         Pets|             1|
|kJguN_71VpAYPII4_kbaAA|   El Fogon Colombiano|   FL|           Tampa|    

In [0]:
df_weighted_reviews = spark.sql("""
SELECT  R.*,
        B.name, B.state, B.city, B.metro_area, B.review_count, B.category, B.category_count, 
        (1/B.category_count) AS weighted_review
FROM ordered_reviews_table AS R INNER JOIN business_category_table AS B 
ON R.business_id = B.business_id
""")
print("Record Count:", df_weighted_reviews.count())
df_weighted_reviews.show(truncate=22)
df_weighted_reviews.createOrReplaceTempView("weighted_reviews")

Record Count: 11726205
+----------------------+----------------------+----------------------+----+-----+------+-------------------+-----+------------+------------------+-------------+----------------------+-----+----------------+----------+------------+----------------------+--------------+------------------+
|             review_id|           business_id|               user_id|cool|funny|useful|               date|stars|review_order|      business_avg|prior_reviews|                  name|state|            city|metro_area|review_count|              category|category_count|   weighted_review|
+----------------------+----------------------+----------------------+----+-----+------+-------------------+-----+------------+------------------+-------------+----------------------+-----+----------------+----------+------------+----------------------+--------------+------------------+
|rJ3CASyRfG-7ZviCBFCJQg|f19eLfhXqR47Ct8Hz2y_pA|---1lKK3aKOuomHnwAkAow|   0|    0|     0|2018-12-19 22:26:22|  5.0

In [0]:
df_user_reviews = spark.sql("""
SELECT R.*,
       U.name AS user_name, U.yelping_since, U.reviews_written, U.friend_count, U.is_elite, U.first_elite, U.gender, U.gender_ratio
FROM weighted_reviews AS R INNER JOIN user_gender_table AS U
ON R.user_id = U.user_id
""").cache()
print("Record count:", df_user_reviews.count() )
df_user_reviews.show()

Record count: 11726153
+--------------------+--------------------+--------------------+----+-----+------+-------------------+-----+------------+------------------+-------------+--------------------+-----+----------------+------------+------------+----------------+--------------+---------------+-------------+-------------------+---------------+------------+--------+-----------+-------+------------------+
|           review_id|         business_id|             user_id|cool|funny|useful|               date|stars|review_order|      business_avg|prior_reviews|                name|state|            city|  metro_area|review_count|        category|category_count|weighted_review|    user_name|      yelping_since|reviews_written|friend_count|is_elite|first_elite| gender|      gender_ratio|
+--------------------+--------------------+--------------------+----+-----+------+-------------------+-----+------------+------------------+-------------+--------------------+-----+----------------+-----------

#### Step 4b: Generate the `project_table`

In [0]:
process_or_create_table("project_table", "df_user_reviews", summary=True, delete=False)

building project_table from existing table files
+------------+
|record_count|
+------------+
|    11726153|
+------------+

+----------------------+----------------------+----------------------+----+-----+------+-------------------+-----+------------+------------------+-------------+----------------------+-----+------------+------------+------------+--------------------+--------------+---------------+---------+-------------------+---------------+------------+--------+-----------+------+------------------+
|             review_id|           business_id|               user_id|cool|funny|useful|               date|stars|review_order|      business_avg|prior_reviews|                  name|state|        city|  metro_area|review_count|            category|category_count|weighted_review|user_name|      yelping_since|reviews_written|friend_count|is_elite|first_elite|gender|      gender_ratio|
+----------------------+----------------------+----------------------+----+-----+------+-------------

In [0]:
df_project_reviews = spark.sql("""
SELECT review_id, 
       FIRST(user_id) AS user_id,
       FIRST(date) AS date,
       FIRST(stars) AS stars, 
       FIRST(review_order) AS review_order,
       FIRST(name) AS name,
       FIRST(gender) AS gender,
       FIRST(metro_area) AS metro_area,
       FIRST(review_count) AS review_count,
       FIRST(category_count) AS category_count,
       FIRST(user_name) AS user_name,
       FIRST(yelping_since) AS yelping_since,
       FIRST(reviews_written) AS reviews_written,
       COLLECT_LIST(category) AS categories
FROM project_table
GROUP BY review_id
ORDER BY review_id
""")
df_project_reviews.show(truncate=22)

+----------------------+----------------------+-------------------+-----+------------+----------------------+-------+-------------+------------+--------------+---------+-------------------+---------------+----------------------+
|             review_id|               user_id|               date|stars|review_order|                  name| gender|   metro_area|review_count|category_count|user_name|      yelping_since|reviews_written|            categories|
+----------------------+----------------------+-------------------+-----+------------+----------------------+-------+-------------+------------+--------------+---------+-------------------+---------------+----------------------+
|---4VcQZzy_vIIifUDqxsg|EopuF3BhVXAGJWEje_TJ-g|2018-09-16 00:38:13|  1.0|           1|It's Nutts on the C...|      M| Philadelphia|          67|             1|    Brett|2018-09-16 00:33:33|              5|         [Restaurants]|
|---HVvLfXsI5cUzNKl2F2g|vgbW-0hGSjsm2f3-naEzEg|2020-10-30 22:04:14|  4.0|          1

In [0]:
process_or_create_table("project_reviews_table", "df_project_reviews", summary=True, delete=False)

project_reviews_table table exists
+------------+
|record_count|
+------------+
|     6990247|
+------------+

+----------------------+----------------------+-------------------+-----+------------+----------------------+-------+-------------+------------+--------------+---------+-------------------+---------------+----------------------+
|             review_id|               user_id|               date|stars|review_order|                  name| gender|   metro_area|review_count|category_count|user_name|      yelping_since|reviews_written|            categories|
+----------------------+----------------------+-------------------+-----+------------+----------------------+-------+-------------+------------+--------------+---------+-------------------+---------------+----------------------+
|VECOpAoJNnPywQPW0egdRw|d9M9M4dQq6GmjdMoxmhBoQ|2018-11-21 03:00:46|  1.0|           1|          CVS Pharmacy|      F|    Nashville|          15|             3|Catherine|2011-06-28 21:53:17|             

In [0]:
df_useful_reviews = spark.sql("""
SELECT prt.review_id, prt.user_id, prt.categories, prt.stars, prt.review_order, prt.gender, pt.useful
FROM project_reviews_table AS prt INNER JOIN project_table AS pt ON prt.review_id = pt.review_id
""")
df_useful_reviews.show()

+--------------------+--------------------+--------------------+-----+------------+-------+------+
|           review_id|             user_id|          categories|stars|review_order| gender|useful|
+--------------------+--------------------+--------------------+-----+------------+-------+------+
|---4VcQZzy_vIIifU...|EopuF3BhVXAGJWEje...|       [Restaurants]|  1.0|           1|      M|     1|
|---zIHlP_-j4uR_i6...|Cs9vrBCHnjXODulXd...|[Local Services, ...|  3.0|           1|      M|     0|
|---zIHlP_-j4uR_i6...|Cs9vrBCHnjXODulXd...|[Local Services, ...|  3.0|           1|      M|     0|
|--0EbOIsbet_ef-5R...|lYQk0R6sPfo3WeX-l...| [Restaurants, Food]|  5.0|         712|      M|     2|
|--0EbOIsbet_ef-5R...|lYQk0R6sPfo3WeX-l...| [Restaurants, Food]|  5.0|         712|      M|     2|
|--0SsSyXoPu5pbITt...|ggpVElLtijHucOAXP...|[Professional Ser...|  5.0|           1|      M|     1|
|--0SsSyXoPu5pbITt...|ggpVElLtijHucOAXP...|[Professional Ser...|  5.0|           1|      M|     1|
|--1z0MjmP

In [0]:
df_useful_reviews.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stars: double (nullable = true)
 |-- review_order: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- useful: long (nullable = true)



In [0]:
process_or_create_table("useful_reviews_table", "df_useful_reviews", summary=True, delete=False)

building useful_reviews_table from existing table files
+------------+
|record_count|
+------------+
|    11726153|
+------------+

+----------------------+----------------------+----------------------+-----+------------+-------+------+
|             review_id|               user_id|            categories|stars|review_order| gender|useful|
+----------------------+----------------------+----------------------+-----+------------+-------+------+
|---JpOlzUyBISir2Tn_Yhw|l9be6lRlJhumEM8ruMMJ7g|[Shopping, Health &...|  4.0|          62|      F|     1|
|---JpOlzUyBISir2Tn_Yhw|l9be6lRlJhumEM8ruMMJ7g|[Shopping, Health &...|  4.0|          62|      F|     1|
|---JpOlzUyBISir2Tn_Yhw|l9be6lRlJhumEM8ruMMJ7g|[Shopping, Health &...|  4.0|          62|      F|     1|
|---tH-GjQ06oKXGZ7wvUpw|E3sF2lVp38bmORWVwzx6dw|                [Food]|  4.0|           3|      F|     1|
|--0HEoLnywdYjD2QqaS-VA|YEiPZTWD6OKRGroINFkj7Q|         [Restaurants]|  1.0|           2|      F|     0|
|--0Jc1yU2kgmjIsOk8VEjQ|k9rI