In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
from pyspark.ml import pipeline
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.types import *

In [None]:
# Point Python at a local Spark installation, then pull in the remaining modules.
# NOTE(review): the previous cell already imports pyspark before findspark.init()
# runs — confirm this cell order works on a fresh kernel.
import findspark
findspark.init()
findspark.find()  # return value is discarded here
import itertools
import pyspark
import sys
import time
import json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions  import date_format


# Create (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName("Content Module")
    .getOrCreate()
)

# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's own context and bind the wrapper to this session explicitly,
# so sqlContext.createDataFrame goes through the same session as `spark`.
sqlContext = SQLContext(spark.sparkContext, spark)

In [None]:
# Load the three JSON inputs into Spark DataFrames.
business_df = spark.read.json('business.json')
user_df = spark.read.json('user.json')
review_df = spark.read.json('train_review (1).json')

# Print each schema under its heading; consecutive reports are separated by blank lines.
schema_reports = [
    ('The schema of the business json file is as follows', business_df),
    ('The schema of the user json file is as follows', user_df),
    ('The schema of the review json file is as follows', review_df),
]
for report_index, (heading, frame) in enumerate(schema_reports):
    if report_index > 0:
        print('\n\n')
    print(heading)
    frame.printSchema()

The schema of the business json file is as follows
root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: str

### Spark ALS implementation requires the rating matrix to have the following data types:
* ### ratings_df_schema = StructType(
* ### [StructField('userId', IntegerType()),
* ###  StructField('businessId', IntegerType()),
* ### StructField('rating', DoubleType())])

In [None]:
# Preview the first five (business_id, index) pairs produced by zipWithIndex.
(
    business_df
    .select('business_id')
    .rdd
    .map(lambda row: row[0])
    .zipWithIndex()
    .take(5)
)

In [None]:
# Schema of the lookup table: original string business_id -> integer businessId.
business_id_schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("businessId", IntegerType(), True),
])

# Pair every business_id with its positional index, then wrap the pairs in a DataFrame.
b_idDict = (
    business_df
    .select('business_id')
    .rdd
    .map(lambda row: row[0])
    .zipWithIndex()
)
b_idDataFrame = sqlContext.createDataFrame(b_idDict, business_id_schema)
b_idDataFrame.show(5)

In [None]:
# Number of rows in the business id lookup table.
b_idDataFrame.count()

In [None]:
# Schema of the lookup table: original string user_id -> integer userId.
user_id_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("userId", IntegerType(), True),
])

# Pair every user_id with its positional index.
u_idDict = user_df.select('user_id').rdd.map(lambda x: x[0]).zipWithIndex()

# The table must be built from the user pairs (u_idDict), not the business pairs:
# otherwise the user_id column holds business ids and the later joins on user_id
# cannot match any row.
u_idDataFrame = sqlContext.createDataFrame(u_idDict, user_id_schema)
u_idDataFrame.show(5)

In [None]:
# Attach the integer userId to every user row with an inner join on user_id.
a = user_df.alias("a")
b = u_idDataFrame.alias("b")

same_user = col("a.user_id") == col("b.user_id")

# Keep every original user column (qualified by alias "a") and append b.userId last.
user_columns = [col('a.' + column_name) for column_name in a.columns]
user_columns.append(col('b.userId'))

user_new = a.join(b, same_user, 'inner').select(user_columns)

user_new.select('userId', 'user_id', 'name').show(5, truncate=False)

In [None]:
# Swap the string ids in the reviews for the integer ids, starting with the user id.

# Only the (user, business, stars) triple is kept from the reviews.
review_df = review_df.select('user_id', 'business_id', 'stars')

# Inner-join on user_id so each review row gains its integer userId.
a = review_df.alias("a")
b = user_new.alias("b")

review_columns = [col('a.' + column_name) for column_name in a.columns]
review1_df = (
    a.join(b, col("a.user_id") == col("b.user_id"), 'inner')
     .select(review_columns + [col('b.userId')])
)

review1_df.show(5)

In [None]:
# Inner-join on business_id so each review row also gains its integer businessId.
a = review1_df.alias("a")
b = b_idDataFrame.alias("b")

same_business = col("a.business_id") == col("b.business_id")

# Keep every column of the user-mapped reviews and append b.businessId last.
mapped_review_columns = [col('a.' + column_name) for column_name in a.columns]
mapped_review_columns.append(col('b.businessId'))

final_review_df = a.join(b, same_business, 'inner').select(mapped_review_columns)

final_review_df.show(5, truncate=False)

In [None]:
# Rating triples for ALS: the two integer ids plus the stars cast to float as `rating`.
rating_column = final_review_df.stars.cast('float').alias('rating')
rating_df = final_review_df.select('userId', 'businessId', rating_column)

rating_df.show(5)
print(' Rating matrx no. of rows :', rating_df.count())
rating_df.printSchema()

In [None]:
# Split the ratings with weights 0.8 / 0.2 into train and test; the seed fixes the sampling.
train, test = rating_df.randomSplit([0.8, 0.2], seed=123)

In [None]:
# Base ALS estimator for the grid search. The explicit seed makes the factor
# initialisation repeatable, in line with the seeded ALS fit later in the notebook.
als = ALS(userCol="userId", itemCol="businessId", ratingCol="rating",
          coldStartStrategy="drop", seed=123)

# 3 x 3 grid over the rank and the maximum number of ALS iterations.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 15, 20])
    .addGrid(als.maxIter, [10, 15, 20])
    .build()
)

# One RMSE evaluator serves both the cross-validation and the held-out test set.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=123)
cv_als_model = cv.fit(train)

# Evaluate the winning model by computing the RMSE on the test data.
best_model = cv_als_model.bestModel
als_predictions = best_model.transform(test)
rmse = evaluator.evaluate(als_predictions)
print("Root-mean-square error = " + str(rmse))

# avgMetrics[i] is the mean cross-validated RMSE of param_grid[i]. Lower RMSE is
# better, so the smallest entry identifies the parameter map of the best model.
# np.argmin is used because the wildcard import of pyspark.sql.functions shadows
# the builtin min().
best_index = int(np.argmin(cv_als_model.avgMetrics))
best_params = param_grid[best_index]

# A bare expression in the middle of a cell is not displayed, so print the rank.
# (The recorded run selected rank 20.)
print("Best rank = " + str(best_model.rank))

# Keep the best maxIter as the cell's last expression so it is still displayed.
# (The recorded run selected maxIter 20.)
best_params[als.maxIter]

In [None]:
# Higher rank / maxIter values need huge resources, so tune regParam instead
# (its default value is 0.1).
# NOTE(review): `model_path` is not defined in the visible cells — confirm it is
# set before this cell runs.
alsb = ALS(
    rank=20,
    maxIter=20,
    regParam=0.3,
    userCol="userId",
    itemCol="businessId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=123,
)
alsb_model = alsb.fit(train)

# Score the held-out ratings with RMSE.
alsb_predictions = alsb_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(alsb_predictions)
print("Root-mean-square error = " + str(rmse))

# Save the fitted ALS model, overwriting any earlier save at the same location.
als_save_path = model_path + 'als'
alsb_model.write().overwrite().save(als_save_path)

# Recorded result: root-mean-square error is 1.2579688933524986

In [None]:
# Load a new instance of the saved ALS model from the path it was written to.
# NOTE(review): `model_path` is not defined in the visible cells — confirm it is
# set before this cell runs.
alsn_model = ALSModel.load(model_path + 'als')

In [None]:
# Ask the reloaded model for recommendations for every user; 10 is passed as the
# number of items per user.
userRecoms = alsn_model.recommendForAllUsers(10)

In [None]:
# Add the user_id column, cache the recommendation DataFrame and show a sample row.
# NOTE(review): `user_newid_df` is not defined in the visible cells; `user_new`
# (built earlier) carries both userId and user_id — confirm which frame is intended.
a = userRecoms.alias("a")
b = user_newid_df.alias("b")

recommendation_columns = [col('a.' + column_name) for column_name in a.columns]
all_userRecoms = (
    a.join(b, col("a.userId") == col("b.userId"), 'inner')
     .select(recommendation_columns + [col('b.user_id')])
)

all_userRecoms.cache()
all_userRecoms.show(1, truncate=False)

In [None]:
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

# Flatten this user's recommendation array into one (businessId, rating) row per
# recommended business. explode() stays inside the DataFrame API and addresses the
# array column by name rather than by position. For a user_id with no matching row
# it yields an empty DataFrame, whereas building a DataFrame from an empty RDD
# raises an error.
userFlatRec = (
    all_userRecoms
    .filter(col('user_id') == u_id)
    .select(explode(col('recommendations')).alias('recommendation'))
    .select(col('recommendation.businessId'), col('recommendation.rating'))
)
userFlatRec.show()

In [None]:
# Look up the business details for each flattened recommendation and render the
# selected columns as a pandas table.
# NOTE(review): `business_new_df` is not defined in the visible cells — confirm it
# is created before this cell runs.
a = business_new_df.alias("a")
b = userFlatRec.alias("b")

business_columns = [col('a.' + column_name) for column_name in a.columns]
user_collab_df = (
    a.join(b, col("a.businessId") == col("b.businessId"), 'inner')
     .select(business_columns + [col('b.rating')])
)

user_collab_df.select('business_id', 'business_name', 'rating', 'categories').toPandas()

In [None]:
def getCollabRecom(u_id):
    """Return the stored ALS recommendations for one user, joined with business details.

    Reads the notebook-level frames ``all_userRecoms`` and ``business_new_df``.

    Args:
        u_id: Original string ``user_id`` whose recommendations are wanted.

    Returns:
        A Spark DataFrame with the columns business_id, rating, business_name,
        categories, stars, review_count, latitude and longitude, ordered by
        rating in descending order. It is empty when ``u_id`` matches no row
        of ``all_userRecoms``.
    """
    # Flatten the user's recommendation array into (businessId, rating) rows.
    # explode() addresses the array column by name rather than by position and
    # yields an empty DataFrame for an unknown user, whereas building a
    # DataFrame from an empty RDD raises an error.
    userFlatRec = (
        all_userRecoms
        .filter(col('user_id') == u_id)
        .select(explode(col('recommendations')).alias('recommendation'))
        .select(col('recommendation.businessId'), col('recommendation.rating'))
    )

    a = userFlatRec.alias("a")
    b = business_new_df.alias("b")

    # Inner join on the integer businessId, then keep the predicted rating plus
    # the descriptive business columns.
    return a.join(b, col("a.businessId") == col("b.businessId"), 'inner') \
             .select([col('b.business_id'), col('a.rating'), col('b.business_name'), col('b.categories'),
                      col('b.stars'), col('b.review_count'),
                      col('b.latitude'), col('b.longitude')]) \
             .orderBy("rating", ascending=False)

In [None]:
# Example: fetch the recommendations for one user id and render them as a pandas table.
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'
getCollabRecom(u_id).toPandas()