## YELP - ALS recommendation System with Pyspark

Members:

**1. Rolamjaya Hotmartua**

**2. YiChin Tzou**

**3. Zoey Chen**

In [1]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import isnan, when, count, col, avg, first
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, BooleanType, DateType, FloatType
import pandas as pd
from functools import reduce
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.window import Window
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
  .builder \
  .master('yarn') \
  .appName('YelpBigData') \
  .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/06 04:00:30 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/03/06 04:00:30 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/03/06 04:00:30 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/03/06 04:00:31 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
bucket = "msca-bdp-student-gcs/group3_final"
spark.conf.set('temporaryGcsBucket', bucket)

In [4]:
#from pyspark.context import SparkContext
#from graphframes import *

In [5]:
# #create Spark session
# spark = SparkSession.builder.appName('YelpBigDataML').config("spark.jars.packages", "graphframes:graphframes:0.8.1-spark2.4-s_2.11").getOrCreate()

# #change configuration settings on Spark 
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '5g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','8g')])

# #print spark configuration settings
# spark.sparkContext.getConf().getAll()

### 1) Data Exploration

#### a. Read data into Spark   

In [6]:
df_user = spark.read\
    .json("gs://msca-bdp-student-gcs/group3_final/yelp_academic_dataset_user.json" )

                                                                                

In [7]:
df_user.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [8]:
df_review = spark.read\
    .json("gs://msca-bdp-student-gcs/group3_final/yelp_academic_dataset_review.json" )

                                                                                

In [9]:
df_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [10]:
df_business = spark.read\
    .json("gs://msca-bdp-student-gcs/group3_final/yelp_academic_dataset_business.json" )

23/03/06 04:01:13 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [11]:
df_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [12]:
df_checkin = spark.read\
    .json("gs://msca-bdp-student-gcs/group3_final/yelp_academic_dataset_checkin.json" )

                                                                                

In [13]:
df_checkin.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)



In [14]:
df_tip = spark.read\
    .json("gs://msca-bdp-student-gcs/group3_final/yelp_academic_dataset_tip.json" )

                                                                                

In [15]:
df_tip.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- compliment_count: long (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)



In [16]:
df_user.show(5)

+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+------+------------+------+--------------------+-------------------+
|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer| cool|               elite|fans|             friends|funny|  name|review_count|useful|             user_id|      yelping_since|
+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+------+------------+------+--------------------+-------------------+
|         3.91|    

In [17]:
df_review.show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|YjUWPpI6HXG530lwP...|   0|2014-02-05 20:30:30|    0|saUsX_uimxRlCVr67...|  3.0|Family diner. Had...|     0|8g_iMtfSiwikVnbP2...|
|kxX2SOes4o-D3ZQBk...|   1|2015-01-04 00:01:03|    0|AqPFMleE6RsU23_au...|  5.0|Wow!  Yummy, diff...|     1|_7bHUi9Uuf5__HHc_...|
|e4Vwtrqf-wpJfwesg...|   1|2017-01-14 20:54:15|    0|Sx8TMOWLNuJBWer-0...|  4.0|Cute inter

In [18]:
df_business.show(5)

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                null|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{null, null, null...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|       Affton|{8:0-18:30, 0:0-0...|      1| 38.551126|  -90.335695|    

In [19]:
df_checkin.show(5)

+--------------------+--------------------+
|         business_id|                date|
+--------------------+--------------------+
|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...|
|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...|
|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...|
|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...|
|--7jw19RH9JKXgFoh...|2014-04-21 20:42:...|
+--------------------+--------------------+
only showing top 5 rows



In [20]:
df_tip.show(5)

+--------------------+----------------+-------------------+--------------------+--------------------+
|         business_id|compliment_count|               date|                text|             user_id|
+--------------------+----------------+-------------------+--------------------+--------------------+
|3uLgwr0qeCNMjKenH...|               0|2012-05-18 02:17:21|Avengers time wit...|AGNUgVwnZUey3gcPC...|
|QoezRbYQncpRqyrLH...|               0|2013-02-05 18:35:10|They have lots of...|NBN4MgHP9D3cw--Sn...|
|MYoRNLb5chwjQe3c_...|               0|2013-08-18 00:56:08|It's open even wh...|-copOvldyKh1qr-vz...|
|hV-bABTK-glh5wj31...|               0|2017-06-27 23:05:38|Very decent fried...|FjMQVZjSqY8syIO-5...|
|_uN0OudeJ3Zl_tf6n...|               0|2012-10-06 19:43:09|Appetizers.. plat...|ld0AperBXk1h6Ubqm...|
+--------------------+----------------+-------------------+--------------------+--------------------+
only showing top 5 rows



### ALS Recommendation System

In [21]:
# import matplotlib.pyplot as plt
# from pyspark.sql.functions import isnan, when, count, col, avg, first
# import pyspark.sql.functions as F
# from pyspark.sql.types import IntegerType, BooleanType, DateType, FloatType
# import pandas as pd
# from functools import reduce
# # from pyspark.ml.recommendation import ALS
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.sql.window import Window
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#### Check Business Table

In [22]:
df_business.show(5)

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|  latitude|   longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+----------+------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                null|      0|34.4266787|-119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{null, null, null...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|       Affton|{8:0-18:30, 0:0-0...|      1| 38.551126|  -90.335695|    

In [23]:
df_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

#### Flitering Restaurants and Food

In [24]:
df_restaurant_IL = df_business.filter(df_business.state.like("IL"))

In [25]:
df_restaurant_IL.show(5)

+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-------+----------+--------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|            city|               hours|is_open|  latitude|     longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-------+----------+--------------+--------------------+-----------+------------+-----+-----+
|     722 Holyoake Rd|{null, null, null...|LcAozWCMLGjwRboka...|Museums, Kids Act...|    Edwardsville|{9:30-17:30, 10:0...|      1|38.8043945|   -89.9497332|Edwardsville Chil...|      62025|          12|  4.5|   IL|
|        312 Piasa St|{null, null, u'fu...|ljxNT9p0y7YMPx0fc...|Restaurants, Spec...|           Alton|{16:0-22:0, 0:0-0...|      1| 38.8

In [26]:
df_restaurant_IL_food = df_restaurant_IL.filter(df_restaurant_IL.categories.like("%Restaurants%"))

In [27]:
df_restaurant_IL_rf = df_restaurant_IL_food.drop('attributes','int64_field_0', 'categories','review_count', 'address','postal_code','latitude', 'longitude', 'stars', 'is_open')

In [28]:
df_restaurant_IL_rf.show(5)

+--------------------+----------------+--------------------+--------------------+-----+
|         business_id|            city|               hours|                name|state|
+--------------------+----------------+--------------------+--------------------+-----+
|ljxNT9p0y7YMPx0fc...|           Alton|{16:0-22:0, 0:0-0...|Tony's Restaurant...|   IL|
|EuRGgOwJ0g1vTj2R0...|Fairview Heights|{12:0-23:0, 12:0-...|         Crafty Crab|   IL|
|uYBx50Yl_Jc-UYwFz...|    Edwardsville|{11:0-21:0, 0:0-0...|Chew Doin Fried C...|   IL|
|zjQDk4tZyhEroyqtk...|         Lebanon|{11:0-2:0, 11:0-2...|The Cobblestone E...|   IL|
|gI34hM-873fwF1XQ8...|         Godfrey|{11:0-21:0, null,...|Joe’s Pizza and P...|   IL|
+--------------------+----------------+--------------------+--------------------+-----+
only showing top 5 rows



In [29]:
spark = SparkSession \
  .builder \
  .master('yarn') \
  .appName('YelpBigDataML') \
  .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

#### Filtering Reviews

In [30]:
df_review = df_review.drop('cool','funny','useful', 'date', 'review_id', 'compliment_count')


In [31]:
df_review.show(5)

+--------------------+-----+--------------------+--------------------+
|         business_id|stars|                text|             user_id|
+--------------------+-----+--------------------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|  3.0|If you decide to ...|mh_-eMZ6K5RLWhZyI...|
|7ATYjTIgM3jUlt4UM...|  5.0|I've taken a lot ...|OyoGAe7OKpv6SyGZT...|
|YjUWPpI6HXG530lwP...|  3.0|Family diner. Had...|8g_iMtfSiwikVnbP2...|
|kxX2SOes4o-D3ZQBk...|  5.0|Wow!  Yummy, diff...|_7bHUi9Uuf5__HHc_...|
|e4Vwtrqf-wpJfwesg...|  4.0|Cute interior and...|bcjbaE6dDog4jkNY9...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



#### Join the restaurant table & users together, will be able to see the users & their ratings for the restaurant

In [32]:
df_user_review_for_restaurant = df_restaurant_IL_rf.join(df_review, ['business_id'], "inner").drop("text").drop("attributes")

In [33]:
df_user_review_for_restaurant.show(5)

                                                                                

+--------------------+------------+--------------------+--------------------+-----+-----+--------------------+
|         business_id|        city|               hours|                name|state|stars|             user_id|
+--------------------+------------+--------------------+--------------------+-----+-----+--------------------+
|WCVFqK84i2H5EClSo...|Granite City|{7:0-20:0, 7:0-20...|          Itty-Bitty|   IL|  5.0|LWTvg7RlyNLXIf482...|
|xR3inMR2KceU3b9dh...|Granite City|{11:0-23:0, 11:0-...|Lascelles Granite...|   IL|  4.0|1EecTw-Iojid62ySM...|
|_uyLoz0BbrQIVmUEm...|       Alton|                null|   Jimmy The Greek's|   IL|  3.0|c35f8FumQy5045zFb...|
|MVmJuns9ZD9QLGltc...|   Mascoutah|{11:0-20:0, null,...|Triple R's Smokeh...|   IL|  5.0|rqYkeGG9PBAdMRDvT...|
|bQKqeYwaHNal7JFAX...|  Belleville|{10:0-22:0, 10:0-...|         Super Gyros|   IL|  5.0|ADsAwRQC3nvNvo4Qd...|
+--------------------+------------+--------------------+--------------------+-----+-----+--------------------+
o

In [34]:
#clean up a bit, drop the duplicate
df_user_review_for_restaurant_1 = df_user_review_for_restaurant.dropDuplicates()

In [35]:
df_user_review_for_restaurant_1.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- hours: struct (nullable = true)
 |    |-- Friday: string (nullable = true)
 |    |-- Monday: string (nullable = true)
 |    |-- Saturday: string (nullable = true)
 |    |-- Sunday: string (nullable = true)
 |    |-- Thursday: string (nullable = true)
 |    |-- Tuesday: string (nullable = true)
 |    |-- Wednesday: string (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- user_id: string (nullable = true)



#### Converting Business_id into integers (for ALS)

In [36]:
#rename to a shorter table
#The DENSE_RANK window function determines the rank of a value in a group of values, based on the ORDER BY expression in the OVER clause. 

df1 = df_user_review_for_restaurant_1.withColumn("business_id_int", F.dense_rank().over(Window.orderBy(df_user_review_for_restaurant_1.business_id)))

In [37]:
df1.show(5)

23/03/06 04:01:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:22 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:23 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:36 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:37 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+--------------------+--------+--------------------+--------------+-----+-----+--------------------+---------------+
|         business_id|    city|               hours|          name|state|stars|             user_id|business_id_int|
+--------------------+--------+--------------------+--------------+-----+-----+--------------------+---------------+
|-0epFLgYq2C1Jo_W4...|Columbia|{6:0-16:0, 6:0-16...|Our House Cafe|   IL|  1.0|UTgMxoEY9k9ofPibn...|              1|
|-0epFLgYq2C1Jo_W4...|Columbia|{6:0-16:0, 6:0-16...|Our House Cafe|   IL|  4.0|Iviwl-0Fi8cUJmJrN...|              1|
|-0epFLgYq2C1Jo_W4...|Columbia|{6:0-16:0, 6:0-16...|Our House Cafe|   IL|  5.0|3XXlnEnVem1OxDrJi...|              1|
|-0epFLgYq2C1Jo_W4...|Columbia|{6:0-16:0, 6:0-16...|Our House Cafe|   IL|  4.0|xrjG1Hvn-HHH5ndaM...|              1|
|-0epFLgYq2C1Jo_W4...|Columbia|{6:0-16:0, 6:0-16...|Our House Cafe|   IL|  4.0|xdQzGzNu3nIUEvOGP...|              1|
+--------------------+--------+--------------------+------------

                                                                                

#### Converting Users_id into integers (for ALS)

In [38]:
#The DENSE_RANK window function determines the rank of a value in a group of values, based on the ORDER BY expression in the OVER clause. 

df1 = df1.withColumn("user_id_int", F.dense_rank().over(Window.orderBy(df1.user_id)))


In [39]:
df1.show(5)

23/03/06 04:01:37 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:37 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:37 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:38 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:38 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+--------------------+-----------+--------------------+--------------------+-----+-----+--------------------+---------------+-----------+
|         business_id|       city|               hours|                name|state|stars|             user_id|business_id_int|user_id_int|
+--------------------+-----------+--------------------+--------------------+-----+-----+--------------------+---------------+-----------+
|X_V6MOX7gKk94TBUA...|Glen Carbon|{11:0-1:0, null, ...|The Wooden Nickel...|   IL|  1.0|--2eQbUFyRvXKgc-A...|            534|          1|
|3gwwGdwihsmb4TOnU...|      Alton|{5:30-0:0, 5:30-2...|          McDonald's|   IL|  1.0|--FEYzujiiMt7r5QC...|             93|          2|
|OWjMDi-7-jWB4WA91...|    Godfrey|{11:0-21:0, 11:0-...|          Chic N Pig|   IL|  2.0|--FEYzujiiMt7r5QC...|            401|          2|
|S9Yq0v8Re7Bz3slcB...|      Alton|{0:0-0:0, 0:0-0:0...|      Steak ’n Shake|   IL|  1.0|--FEYzujiiMt7r5QC...|            456|          2|
|ipllQQ16wlyHKIkCY...|    Godfrey|

                                                                                

#### Set up a final data frame for ALS model 
#### Leave the table with the converting Business_id & users_id & ratings only

In [40]:
#leaving converting Business_id & users_id only
#thus rename them

df_final = df1.drop('business_id','user_id','name','state','hours','city')
df_final = df_final.withColumnRenamed("business_id_int","business_id")
df_final = df_final.withColumnRenamed("user_id_int","user_id")
df_final.show(5)

23/03/06 04:01:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:01:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+
|stars|business_id|user_id|
+-----+-----------+-------+
|  1.0|        534|      1|
|  1.0|         93|      2|
|  2.0|        401|      2|
|  1.0|        456|      2|
|  1.0|        711|      2|
+-----+-----------+-------+
only showing top 5 rows



                                                                                

In [41]:
#Again, dropping the duplicate
df_final = df_final.dropDuplicates()

In [42]:
df_final.show(5)
df_final.count()

23/03/06 04:02:03 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:03 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:03 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:03 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:14 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+
|stars|business_id|user_id|
+-----+-----------+-------+
|  1.0|        534|      1|
|  1.0|         93|      2|
|  2.0|        401|      2|
|  1.0|        456|      2|
|  1.0|        711|      2|
+-----+-----------+-------+
only showing top 5 rows



23/03/06 04:02:15 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:15 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:27 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:27 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:27 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

35996

#### Creating a temp business table consist of business_id_int, name, city to join later with Predictions

In [43]:
temp_business = df1.drop('stars','user_id','user_id_int','business_id','hours','state')
temp_business = temp_business.dropDuplicates()

In [44]:
temp_business.show(5)

23/03/06 04:02:27 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:28 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:38 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:38 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:38 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+------------+--------------------+---------------+
|        city|                name|business_id_int|
+------------+--------------------+---------------+
|    Columbia|      Our House Cafe|              1|
|    O'Fallon|   1st Street Lounge|              2|
|Collinsville|           New China|              3|
|Collinsville|         Imo's Pizza|              4|
|Collinsville|Mungo's Italian E...|              5|
+------------+--------------------+---------------+
only showing top 5 rows



                                                                                

#### ALS Model Building

In [45]:
#split it to training and testing
training, test = df_final.randomSplit([0.80,0.20])

ALSExplicit = ALS( implicitPrefs=False, userCol="user_id", itemCol="business_id", ratingCol="stars",
          coldStartStrategy="drop")

defaultModel = ALSExplicit.fit(training)

23/03/06 04:02:39 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:39 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:40 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:40 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:02:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

In [46]:
paramMapExplicit = ParamGridBuilder() \
                    .addGrid(ALSExplicit.numUserBlocks, [8,12]) \
                    .addGrid(ALSExplicit.numItemBlocks, [8,12]) \
                    .addGrid(ALSExplicit.rank, [8,12]) \
                    .addGrid(ALSExplicit.maxIter, [5,10]) \
                    .addGrid(ALSExplicit.regParam, [0.1,1]) \
                    .addGrid(ALSExplicit.alpha, [1,2]) \
                    .build()

evaluatorR = RegressionEvaluator(metricName="rmse", labelCol="stars")

In [47]:
CVALSExplicit = CrossValidator(estimator=ALSExplicit,
                            estimatorParamMaps=paramMapExplicit,
                            evaluator=evaluatorR,
                           numFolds=5)

In [48]:
CVModelEXplicit = CVALSExplicit.fit(training)


23/03/06 04:03:10 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:03:10 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:03:10 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:03:10 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:06:12 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

In [49]:
predictions = CVModelEXplicit.bestModel.transform(test)
predictions.show(5)

23/03/06 04:17:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:17:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:17:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:17:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:17:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+-----+-----------+-------+----------+
|stars|business_id|user_id|prediction|
+-----+-----------+-------+----------+
|  1.0|          5|  11600| 3.2316794|
|  1.0|          6|   4159| 1.3960989|
|  1.0|          6|   9054| 1.5683389|
|  1.0|          6|  10723| 2.0660138|
|  1.0|          6|  12993| 1.7078671|
+-----+-----------+-------+----------+
only showing top 5 rows



                                                                                

In [50]:
CVModelEXplicit.bestModel.itemFactors.show(10, truncate = False)


+---+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                             |
+---+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|12 |[0.006014246, 0.7344656, 0.23632042, 0.59196264, -0.96515524, 0.31140926, 0.057777118, -0.07262838, 0.86538, -0.5636263, -0.37278834, -0.8770596]    |
|24 |[-0.053651974, 0.50591654, 0.20033768, 0.354029, -0.4423971, 0.056284502, 0.123376794, -0.07321217, 0.35457623, -0.4943899, -0.26719368, -0.72518426]|
|36 |[0.07841237, 0.7123844, 0.42860404, 0.71118313, -0.72378623, 0.21779601, 0.06527179, -0.26189005, 0.79190636, -0.48677278, -0.5241483, -0.9925142]   |
|48 |[-0.017310388, 0.57444215, 0.24286146, 0.35207152, -0.48881

In [51]:
#join on the temp business table we created earlier 
predictions = temp_business.join(predictions, predictions.business_id==df1.business_id_int, how='inner')

In [52]:
predictions = predictions.na.drop()
predictions.show(50, truncate = False)

23/03/06 04:18:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:18:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:18:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:18:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:18:02 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+----------------+------------------------------------+---------------+-----+-----------+-------+----------+
|city            |name                                |business_id_int|stars|business_id|user_id|prediction|
+----------------+------------------------------------+---------------+-----+-----------+-------+----------+
|Collinsville    |Mungo's Italian Eatery              |5              |1.0  |5          |11600  |3.2316794 |
|O'Fallon        |Buffalo Wild Wings                  |6              |1.0  |6          |4159   |1.3960989 |
|O'Fallon        |Buffalo Wild Wings                  |6              |1.0  |6          |9054   |1.5683389 |
|O'Fallon        |Buffalo Wild Wings                  |6              |1.0  |6          |10723  |2.0660138 |
|O'Fallon        |Buffalo Wild Wings                  |6              |1.0  |6          |12993  |1.7078671 |
|O'Fallon        |Buffalo Wild Wings                  |6              |1.0  |6          |16186  |1.72978   |
|O'Fallon        |S

#### Generate RMSE as evaluator

In [61]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="stars",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error: {}".format(rmse))

23/03/06 04:21:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:21:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:21:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:21:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:21:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

Root-mean-square error: 1.6528341288796768


#### Generate R^2 as evaluator

In [62]:
evaluator = RegressionEvaluator(metricName="r2", labelCol="stars",
                                predictionCol="prediction", throughOrigin=True)
r2 = evaluator.evaluate(predictions)

print("r2 : {}".format(r2))

23/03/06 04:22:08 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:22:08 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:22:08 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:22:08 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/06 04:22:08 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

r2 : 0.8181540899458146


#### Generate top 5 business recommendations for each user


In [54]:
Business_rec_for_user = CVModelEXplicit.bestModel.recommendForAllUsers(10)


In [55]:
Business_rec_for_user.show(5)



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|     12|[{207, 2.5357628}...|
|     22|[{33, 4.382561}, ...|
|     26|[{207, 2.5900137}...|
|     27|[{207, 2.814681},...|
|     31|[{816, 4.735834},...|
+-------+--------------------+
only showing top 5 rows



                                                                                

#### Generate top 5 user recommendations for each business 


In [56]:
User_rec_for_business = CVModelEXplicit.bestModel.recommendForAllItems(10)

In [57]:
User_rec_for_business.show(5)



+-----------+--------------------+
|business_id|     recommendations|
+-----------+--------------------+
|         12|[{1963, 4.8463387...|
|         22|[{14476, 3.115163...|
|         26|[{2764, 4.3978014...|
|         27|[{13190, 2.541835...|
|         28|[{9620, 4.478644}...|
+-----------+--------------------+
only showing top 5 rows



                                                                                