In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import os
import re
from functools import reduce

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1579188846172_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Entry points used by the rest of the notebook. Both calls are
# "get or create", so they pick up whatever the kernel already started.
ss = SparkSession.builder.getOrCreate()  # DataFrame / SQL API
sc = SparkContext.getOrCreate()          # RDD API

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Process business

# Read the raw CSV as plain text lines, requesting 24 partitions.
business_lines = sc.textFile("s3n://usfca-distributedcomputing/yelp_business.csv", 24)

# The first line holds the column names: keep it for later and drop every
# line equal to it from the data.
header = business_lines.first()

# Clean data before splitting: remove each comma that is followed by an odd
# number of double quotes up to the end of the line (i.e. a comma sitting
# inside a quoted field), so that a plain split(',') yields the right fields.
business = (
    business_lines
    .filter(lambda line: line != header)
    .map(lambda line: re.sub(',(?=[^"]*"[^"]*(?:"[^"]*"[^"]*)*$)', '', line))
)

def toDoubleSafe(v):
    """Convert a raw CSV field to ``float`` when it holds a finite number.

    Args:
        v: the field value (a string produced by splitting a CSV line).

    Returns:
        ``float(v)`` when ``v`` parses as a finite number, otherwise
        ``str(v)`` unchanged.
    """
    # Local import: the notebook's import cell does not bring in ``math``.
    import math

    try:
        number = float(v)
    except ValueError:
        return str(v)

    # float() also accepts spellings such as "nan", "inf" or "Infinity"
    # (case-insensitive). In a text column those are ordinary words, so keep
    # them as text instead of letting a non-finite float slip into the column.
    if not math.isfinite(number):
        return str(v)
    return number

# Split every cleaned line into fields, then turn numeric-looking fields
# into floats (everything else stays a string).
business = business.map(lambda x: x.split(','))
business = business.map(lambda row: [toDoubleSafe(x) for x in row])
# Print the sample explicitly: a bare expression in the middle of a cell
# still runs a Spark job, but its result is never displayed.
print(business.take(2))

# Schema is inferred from the data; columns get positional default names.
business_df = ss.createDataFrame(business)

business_df.show(3)

# Replace the default names with the names taken from the CSV header.
old_columns = business_df.schema.names
new_columns = header.split(',')

business_df = business_df.toDF(*new_columns)

business_df.show(3)

business_df.printSchema()

# Filter and keep only restaurants that are opened
restaurants_df = (
    business_df
    .filter(business_df['categories'].contains('Restaurant'))
    .filter(business_df['is_open'] == 1)
)
restaurants_df.show(2)

# Printed for the same reason as the sample above.
print(restaurants_df.count())

# Keep only interesting variables
restaurants_df = restaurants_df.select("business_id", "latitude", "longitude", "stars", "review_count")
restaurants_df.show(3)

# Null check: prints the number of null values per kept column
# (the author expects every count to be 0).
for column_name in ("review_count", "business_id", "longitude", "latitude", "stars"):
    print(restaurants_df.filter(restaurants_df[column_name].isNull()).count())

# Process business attributes

# Car-parking source columns that are folded into a single "Parking" flag.
parking_columns = ["BusinessParking_garage", "BusinessParking_street",
                   "BusinessParking_lot", "BusinessParking_valet"]

# (source column, boolean column to create), in the order the new columns
# are appended to the frame.
flag_columns = [
    ("BusinessAcceptsCreditCards", "Accepts_Credit_Cards"),
    ("WheelchairAccessible", "Wheelchair_Accessible"),
    ("BikeParking", "Bike_Parking"),
    ("Alcohol", "Alcohol_drinks"),
    ("HappyHour", "Happy_Hour"),
    ("OutdoorSeating", "Outdoor_Seating"),
    ("DogsAllowed", "Dogs_Allowed"),
]

# No inferSchema here, so every column is read as a string and compared
# against the literal 'True' below.
attributes_df = ss.read.csv("s3n://usfca-distributedcomputing/yelp_business_attributes.csv", header=True)

attributes_df = attributes_df.select(
    "BusinessAcceptsCreditCards",
    *parking_columns,
    "WheelchairAccessible", "BikeParking", "Alcohol", "HappyHour",
    "OutdoorSeating", "DogsAllowed", "business_id",
)
attributes_df.printSchema()

# Create Parking boolean column if Car parking available:
# OR together the "== 'True'" test of every parking column.
has_parking = reduce(
    lambda left, right: left | right,
    [attributes_df[name] == 'True' for name in parking_columns],
)
attributes_df = attributes_df.withColumn("Parking", has_parking).drop(*parking_columns)

attributes_df.printSchema()

# Create boolean values for all variables
# NOTE(review): every flag is "value == 'True'"; confirm that all of these
# source columns (e.g. Alcohol) really use the literal 'True' in the data.
for source_name, flag_name in flag_columns:
    attributes_df = attributes_df.withColumn(flag_name, attributes_df[source_name] == 'True')

attributes_df = attributes_df.drop(*[source_name for source_name, _ in flag_columns])

attributes_df.printSchema()

# Process Checkin

checkin_df = ss.read.csv("s3n://usfca-distributedcomputing/yelp_checkin.csv", inferSchema=True, header=True)
checkin_df.show(5)
checkin_df.printSchema()

# check null: prints the number of null values in each column
for column_name in ("business_id", "weekday", "hour", "checkins"):
    print(checkin_df.where(checkin_df[column_name].isNull()).count())

# counts of business_id, max 168 (7*24)
id_ct = checkin_df.groupBy("business_id").count().orderBy("count", ascending=False)
id_ct.show(10)

# Row count per weekday (author's observation: evenly distributed)
checkin_df.groupBy("weekday").count().show()

# Persist the check-ins so the SQL below can read them back from S3.
# mode("overwrite") keeps the cell re-runnable: with the default save mode
# the write raises as soon as the table already exists.
(checkin_df.write
    .mode("overwrite")
    .option("path", "s3n://usfca-distributedcomputing/checkin_df")
    .saveAsTable("checkin_df"))

# keep ids with at least 7*8=56 occurrences
checkin_v1 = ss.sql("""
    with checkin as (select * from parquet.`s3n://usfca-distributedcomputing/checkin_df`)
       , id_ct as (select business_id, count(*) as ct from checkin group by business_id having ct >= 56)
    select * from checkin where business_id in (select business_id from id_ct)
""")
print(checkin_v1.count())
checkin_v1.show()

def is_weekend(day):
    """Label a weekday abbreviation as "Weekend" or "Weekday".

    Returns "Weekend" when ``day`` is exactly "Sat" or "Sun", and "Weekday"
    for any other value.
    """
    return "Weekend" if day in ("Sat", "Sun") else "Weekday"
    
# Make the labelling function usable on DataFrame columns.
check_is_weekend = udf(is_weekend)

# flag weekend
weekend_flag = check_is_weekend("weekday")
checkin_v1 = checkin_v1.select("business_id", "hour", "checkins", "weekday", weekend_flag)
# The UDF output column shows up as "is_weekend(weekday)"; give it a short name.
checkin_v1 = checkin_v1.withColumnRenamed("is_weekend(weekday)", "weekend")
checkin_v1.show(3)

# average hourly checkin numbers: one row per business, one column per
# distinct value of "weekend", each holding the mean of "checkins".
checkins_by_business = checkin_v1.groupBy("business_id")
avg_hourly_checkin = checkins_by_business.pivot("weekend").agg(avg("checkins"))
print(avg_hourly_checkin.count())
avg_hourly_checkin.show(5)

# Join the data

# Left_outer join: keep every restaurant, with or without attributes.
joined_df = restaurants_df.join(attributes_df, 'business_id', 'left_outer')

joined_df.show(2)

joined_df.printSchema()

# We had some restaurant from business table not in business_attributes
# Need to treat the null values
joined_df.groupBy(joined_df["Parking"]).count().orderBy("count", ascending=False).show()

# Replace null values by false (most common value)
joined_df = joined_df.na.fill(False)

joined_df.groupBy(joined_df["Parking"]).count().orderBy("count", ascending=False).show()

# Default (inner) join: restaurants without check-in averages are dropped.
joined_df = joined_df.join(avg_hourly_checkin, 'business_id')

# The sentiment files are read without a header, so name the columns here.
sentiment_df = ss.read.csv("s3n://usfca-distributedcomputing/sentiment")
old_cols = sentiment_df.schema.names
new_cols = ['business_id','avg_sentiment']

sentiment_df = sentiment_df.toDF(*new_cols)

# Without a schema every CSV column is read as a string; cast the score so
# it is joined and saved as a number rather than as text.
sentiment_df = sentiment_df.withColumn("avg_sentiment",
                                       sentiment_df["avg_sentiment"].cast("double"))

joined_df = joined_df.join(sentiment_df, "business_id", how="left_outer")
print(f"Nulls (avg_sentiment): {joined_df.where(joined_df.avg_sentiment.isNull()).count()}")

joined_df.printSchema()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Save data

# mode("overwrite") keeps the cell re-runnable: with the default save mode
# the write raises as soon as the 'yelp_data' table already exists.
(joined_df.write
    .mode("overwrite")
    .option("path", "s3n://usfca-distributedcomputing/yelp_data")
    .saveAsTable('yelp_data'))

# Read data back from the saved location as a sanity check.

ss.sql("select * from parquet.`s3n://usfca-distributedcomputing/yelp_data`").show(3)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------+--------------+-----+------------+-------+--------------------+---------------------+------------+--------------+----------+---------------+------------+------------------+------------------+-------------------+
|         business_id|     latitude|     longitude|stars|review_count|Parking|Accepts_Credit_Cards|Wheelchair_Accessible|Bike_Parking|Alcohol_drinks|Happy_Hour|Outdoor_Seating|Dogs_Allowed|           Weekday|           Weekend|      avg_sentiment|
+--------------------+-------------+--------------+-----+------------+-------+--------------------+---------------------+------------+--------------+----------+---------------+------------+------------------+------------------+-------------------+
|-d0Ou1JmEK2j5HR4P...|36.0166853346|-114.958591805|  3.0|        32.0|   true|               false|                false|       false|         false|     false|          false|       false|1.6590909090909092|2.8461538461538463|        0.509015625|
|-sNi7U9