In [16]:
# Yelp business data: read the business JSON dump from HDFS into a DataFrame
business_path = "hdfs:///yelp/yelp/yelp_academic_dataset_business.json"
yelp_business = spark.read.json(business_path)
# Check which states Yelp concentrates on: tally the businesses per state.
# Each single-column Row is used as the key, so collect() returns a list of
# (Row(state=...), count) pairs.
state_ones = yelp_business.select("state").rdd.map(lambda state_row: (state_row, 1))
state_ones.reduceByKey(lambda left, right: left + right).collect()

[(Row(state=u'IL'), 1),
 (Row(state=u'AZ'), 2),
 (Row(state=u'VA'), 2),
 (Row(state=u'ON'), 2),
 (Row(state=u'OR'), 25175),
 (Row(state=u'ABE'), 1),
 (Row(state=u'MN'), 1),
 (Row(state=u'BC'), 17298),
 (Row(state=u'NM'), 1),
 (Row(state=u'NC'), 1),
 (Row(state=u'NY'), 2),
 (Row(state=u'OH'), 11258),
 (Row(state=u'DE'), 1),
 (Row(state=u'DC'), 1),
 (Row(state=u'HI'), 1),
 (Row(state=u'AL'), 1),
 (Row(state=u'KY'), 1),
 (Row(state=u'NH'), 4),
 (Row(state=u'GA'), 18090),
 (Row(state=u'MA'), 36012),
 (Row(state=u'FL'), 21907),
 (Row(state=u'WY'), 1),
 (Row(state=u'CO'), 3198),
 (Row(state=u'CA'), 13),
 (Row(state=u'KS'), 1),
 (Row(state=u'TX'), 24485),
 (Row(state=u'ME'), 1),
 (Row(state=u'MI'), 1),
 (Row(state=u'OK'), 1),
 (Row(state=u'WI'), 1),
 (Row(state=u'WA'), 3121)]

In [17]:
# Restrict the business table to Georgia (state code "GA") and preview a few rows
yelp_business = yelp_business.where(yelp_business.state == "GA")
yelp_business.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|          city|               hours|is_open|     latitude|     longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
| 1046 Memorial Dr SE|[,,,,,,,,, True, ...|PE9uqAjdw0E4-8mjG...|Gyms, Active Life...|       Atlanta|[16:0-19:0, 16:0-...|      1|   33.7470274|   -84.3534244|   Crossfit Terminus|      30316|          14|  4.0|   GA|
|    5510 Memorial Dr|[,,,,,,,,, True,,...|g7CEhqBIpwTg6ERcM...|Oil Change Statio...|Stone Mountain|[8:0-18:30, 8:0-1...|      1|   

In [3]:
# Print the column names and types of the business DataFrame, including the nested `attributes` struct.
yelp_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [18]:
# Number of business rows left after the Georgia filter.
yelp_business.count()

18090

In [25]:
# Yelp review data: read the review JSON dump from HDFS into a DataFrame
review_path = "hdfs:///yelp/yelp/yelp_academic_dataset_review.json"
yelp_review = spark.read.json(review_path)
# Keep only the reviews written about the Georgia businesses in yelp_business.
# A left-semi join filters the reviews by business_id and returns review
# columns only, so "stars" is unambiguously the review rating. A right join
# would additionally emit one row with null user_id/stars for every business
# that has no matching review, and those nulls would leak into the per-user
# averages and the graph edges built later.
georgia_business_ids = yelp_business.select("business_id")
yelp_review = (
    yelp_review
    .join(georgia_business_ids, "business_id", "left_semi")
    .select("business_id", "user_id", "stars")
)
yelp_review.show(5)

+--------------------+--------------------+-----+
|         business_id|             user_id|stars|
+--------------------+--------------------+-----+
|0GBGRkKM6-89K4v_m...|opnMQggysKmjRZ597...|  4.0|
|0GBGRkKM6-89K4v_m...|asDqitqpDj483-Wf6...|  4.0|
|0GBGRkKM6-89K4v_m...|PB1m9DzXeCS7Y74Pe...|  1.0|
|0GBGRkKM6-89K4v_m...|RzRN1--utJLPNTZZy...|  5.0|
|0GBGRkKM6-89K4v_m...|Mk7PE1LREOtfhbD86...|  1.0|
+--------------------+--------------------+-----+
only showing top 5 rows



In [26]:
# Print the schema of the trimmed review DataFrame (business_id, user_id, stars).
yelp_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- stars: double (nullable = true)



In [27]:
# Number of rows in the review DataFrame after joining against the Georgia businesses.
yelp_review.count()

1150884

In [36]:
# Yelp user data: read the user JSON dump from HDFS into a DataFrame
from pyspark.sql.functions import avg
user_path = "hdfs:///yelp/yelp/yelp_academic_dataset_user.json"
yelp_user = spark.read.json(user_path)
# Here I only choose users who have given at least one review on the selected businesses.
# The right join keeps every row of yelp_review; "name" is carried along by the
# select but is dropped again by the aggregation below.
reviews_with_user = yelp_user.join(yelp_review, "user_id", "right").select("user_id", "stars", "name")
# Average star rating each user gave to the chosen businesses, one row per user_id
yelp_user = reviews_with_user.groupBy("user_id").agg(avg("stars").alias("avg_stars"))
yelp_user.show(5)

+--------------------+------------------+
|             user_id|         avg_stars|
+--------------------+------------------+
|--hJsDxzXZURcLxaL...|               5.0|
|-1KKYzibGPyUX-Mwk...|               5.0|
|-3OVrB4JWtQRdc4z9...|               1.0|
|-5KsgNvefcsmh7BL8...|3.6666666666666665|
|-6IVb5e2YX3_xylub...|               4.0|
+--------------------+------------------+
only showing top 5 rows



In [37]:
# Print the schema of the per-user aggregate (user_id, avg_stars).
yelp_user.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- avg_stars: double (nullable = true)



In [38]:
# Number of rows in the per-user aggregate; groupBy("user_id") yields one row per distinct user_id.
yelp_user.count()

345324

## Build the Bipartite Graph

In [None]:
# "U" is the prefix for user id and "B" is the prefix for business id to make the vertex ids unique 
# (i.e. to avoid an user id with the same value of a business id)
businessVertices = yelp_business.rdd.map(lambda r: Row(id="B"+str(r.business_id), vtype="business", avg_stars=r.stars)).toDF()
userVertices = yelp_user.rdd.map(lambda r: Row(id="U"+str(r.user_id), vtype="user", avg_stars=r.avg_stars)).toDF()

reviewEdges = yelp_review.rdd.map(lambda r: Row(src="U"+str(r.user_id),dst="B"+str(r.business_id), stars=r.stars)).toDF()

# build the graph
bipartiteVertices = userVertices.unionAll(businessVertices)
bipartiteGraph = GraphFrame(bipartiteVertices, reviewEdges)