In [10]:
import numpy as np
import pandas as pd


from pyspark.sql.functions import *
from pyspark.sql.types import *


from pyspark.sql import Row

from operator import add

from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml import Pipeline, PipelineModel

import folium
import html

class MyYelper:
    """Hybrid restaurant recommender built on Spark.

    Combines four sub-recommenders: content-based filtering on Word2Vec
    review vectors, key-word search, collaborative filtering (ALS) and
    friends-network recommendations.

    The methods rely on the notebook-level globals ``spark`` and
    ``sqlContext`` and keep shared state (data frames, business vectors,
    fitted pipeline) as class attributes, so every instance sees the same data.
    """

    # locations of the input parquet files, saved models and generated outputs
    data_path = 'dataset/'
    model_path = 'model/'
    output_path = 'output/'

    # default number of recommendations; overwritten by __init__
    max_recoms = 10

    # centre and zoom level used for every folium map
    _map_center = (43.70011, -79.4163)
    _map_zoom = 12

    def __init__(self, max_recoms):
        """Set the class-wide default number of recommendations.

        Args:
                max_recoms: default maximum no. of returned recommendations

        """
        MyYelper.max_recoms = max_recoms

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _resolveMaxRecoms(self, max_recoms):
        """Return ``max_recoms``, or the class-wide default when it is None.

        Resolving at call time (instead of in the ``def`` line) makes the
        value given to ``__init__`` take effect and avoids referencing
        ``MyYelper`` while the class body is still being executed.
        """
        return MyYelper.max_recoms if max_recoms is None else max_recoms

    def _buildIdMap(self, df, key_col, id_col):
        """Map the first column of ``df`` to a numeric id via zipWithIndex.

        NOTE(review): assumes the first column of ``df`` is ``key_col``
        (the original code indexed ``x[0]``) -- confirm against the parquet schema.
        """
        schema = StructType([StructField(key_col, StringType(), True),
                             StructField(id_col, IntegerType(), True)])
        return sqlContext.createDataFrame(df.rdd.map(lambda x: x[0]).zipWithIndex(), schema)

    def _attachId(self, df, id_map, key_col, id_col):
        """Inner-join ``df`` with ``id_map`` on ``key_col`` and append ``id_col``."""
        a = df.alias("a")
        b = id_map.alias("b")
        return a.join(b, col("a." + key_col) == col("b." + key_col), 'inner') \
                .select([col('a.' + xx) for xx in a.columns] + [col('b.' + id_col)])

    def _reviewsByBusiness(self):
        """Return a DF (business_id, text) with all review texts of a business merged."""
        reviews_text = spark.sql("SELECT business_id, review_text FROM reviews")

        # join the texts with a space so that the last word of one review
        # is not fused with the first word of the next one
        merged_rdd = reviews_text.rdd.map(tuple) \
                                 .reduceByKey(lambda left, right: left + ' ' + right)

        return spark.createDataFrame(merged_rdd) \
                    .withColumnRenamed('_1', 'business_id') \
                    .withColumnRenamed('_2', 'text')

    def _rankBySimilarity(self, query_vec, top_n, skip_ids=()):
        """Return the ``top_n`` (business_id, score) pairs most similar to ``query_vec``.

        The business vectors are already collected on the driver
        (``MyYelper.all_business_vecs``), so ranking is done locally.
        Businesses whose id is in ``skip_ids`` are left out.
        """
        scored = [(b_id, float(self.calcCosineSim(query_vec, vec)))
                  for b_id, vec in MyYelper.all_business_vecs
                  if b_id not in skip_ids]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_n]

    def _scoresToDF(self, pairs):
        """Turn a list of (business_id, score) pairs into a Spark dataframe."""
        schema = StructType([StructField("business_id", StringType(), True),
                             StructField("score", DoubleType(), True)])
        return spark.createDataFrame(pairs, schema)

    def _renderMap(self, df):
        """Draw one folium marker per row of ``df`` and return the map."""
        mp = folium.Map(location=list(MyYelper._map_center), zoom_start=MyYelper._map_zoom)

        for _, r in df.toPandas().iterrows():
            # business names come from the data set, so escape them for the HTML popup
            popup = html.escape(str(r["business_name"])) + '<br>' + 'Stars: ' + str(r.stars) \
                + '<br>' + 'Reviews: ' + str(r.review_count)
            folium.Marker(
                location=[r.latitude, r.longitude],
                popup=popup,
                icon=folium.Icon(color=r.map_marker_colour)).add_to(mp)

        return mp

    def _mapOrFrame(self, df, return_map):
        """Return a folium map of ``df`` when ``return_map`` is truthy, else ``df``."""
        return self._renderMap(df) if return_map else df

    # ------------------------------------------------------------------
    # data loading
    # ------------------------------------------------------------------

    def loadInputData(self):
        """Load the input data, add the numeric ids needed by ALS and
        register the SQL views ``businesses``, ``users``, ``reviews``, ``friends``.

        Args:

        Returns:

        """
        business_df = spark.read.parquet(MyYelper.data_path + 'business-small.parquet')
        user_df = spark.read.parquet(MyYelper.data_path + 'user-small.parquet')
        review_df = spark.read.parquet(MyYelper.data_path + 'review-small.parquet')
        friend_df = spark.read.parquet(MyYelper.data_path + 'friend-small.parquet')

        # numeric user ids for ALS
        user_newid_df = self._buildIdMap(user_df, "user_id", "userId")

        MyYelper.user_newid_df = user_newid_df
        MyYelper.user_newid_df.cache()
        print('no of users: ', MyYelper.user_newid_df.count())

        # add the userId column to the users
        user_df = self._attachId(user_df, user_newid_df, "user_id", "userId")

        # numeric business ids for ALS
        business_newid_df = self._buildIdMap(business_df, "business_id", "businessId")

        # add the businessId column to the businesses
        business_df = self._attachId(business_df, business_newid_df, "business_id", "businessId")
        MyYelper.business_df = business_df
        MyYelper.business_df.cache()
        print('no of businesses: ', MyYelper.business_df.count())

        # map the reviews to the numeric user ids and business ids
        review_df = self._attachId(review_df, user_newid_df, "user_id", "userId")
        review_df = self._attachId(review_df, business_newid_df, "business_id", "businessId")

        # SQL views
        business_df.createOrReplaceTempView("businesses")
        user_df.createOrReplaceTempView("users")
        review_df.createOrReplaceTempView("reviews")
        friend_df.createOrReplaceTempView("friends")

    # ------------------------------------------------------------------
    # text pipeline
    # ------------------------------------------------------------------

    def processReviewText(self):
        """Fit the text pipeline on the review texts and save the fitted
        model for later use.

        Args:

        Returns:

        """
        reviews_by_business_df = self._reviewsByBusiness()

        # pipeline
        regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text', outputCol='token')
        stopWordsRemover = StopWordsRemover(inputCol='token', outputCol='nostopwrd')
        countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
        iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
        word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='nostopwrd', outputCol='word_vec', seed=123)
        vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
        pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec,
                                    vectorAssembler])

        # fit
        pipeline_mdl = pipeline.fit(reviews_by_business_df)

        # save
        pipeline_mdl.write().overwrite().save(MyYelper.model_path + 'pipe_txt')

    def transformReviewText(self):
        """Load the fitted text pipeline, build the business vectors from the
        review texts, keep them in ``MyYelper.all_business_vecs`` and save them.

        Args:

        Returns:

        """
        pipeline_mdl = PipelineModel.load(MyYelper.model_path + 'pipe_txt')

        # transform
        reviews_by_business_trf_df = pipeline_mdl.transform(self._reviewsByBusiness())
        business_vecs = reviews_by_business_trf_df.select('business_id', 'word_vec')

        # word_vec as a list of (business_id, vector) tuples on the driver
        MyYelper.all_business_vecs = business_vecs.rdd.map(lambda x: (x[0], x[1])).collect()

        # save
        business_vecs.write.mode('overwrite').parquet(MyYelper.output_path + 'business_vecs.parquet')

    def loadVecsAndModels(self):
        """Load the previously saved business vectors and the fitted text
        processing pipeline (used in the key words search).

        Args:

        Returns:

        """
        # load previously saved business vecs
        business_vecs = sqlContext.read.parquet(MyYelper.output_path + 'business_vecs.parquet')

        # assign word_vec to a class list variable
        MyYelper.all_business_vecs = business_vecs \
            .select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

        # load trained text processing pipeline - will be used for key words search featurization
        MyYelper.pipeline_mdl = PipelineModel.load(MyYelper.model_path + 'pipe_txt')

    # ------------------------------------------------------------------
    # shared building blocks
    # ------------------------------------------------------------------

    def calcCosineSim(self, vec1, vec2):
        """Compute the cosine similarity between two vectors.

        Args:
                vec1: first vector
                vec2: second vector

        Returns: cosine similarity score (number); 0.0 when either vector
                 has zero length, instead of NaN from a division by zero

        """
        norm1 = np.sqrt(np.dot(vec1, vec1))
        norm2 = np.sqrt(np.dot(vec2, vec2))

        # a zero vector has no direction, e.g. a text without any known word
        if norm1 == 0 or norm2 == 0:
            return 0.0

        return np.dot(vec1, vec2) / norm1 / norm2

    def getBusinessDetails(self, in_bus):
        """Get business details like name, categories, latitude, longitude etc.

        Args:
                in_bus: dataframe with a ``business_id`` column

        Returns: dataframe with the columns of ``in_bus`` plus the business details

        """
        a = in_bus.alias("a")
        b = MyYelper.business_df.alias("b")

        return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
                .select([col('a.' + xx) for xx in a.columns] +
                        [col('b.business_name'), col('b.categories'),
                         col('b.stars'), col('b.review_count'),
                         col('b.latitude'), col('b.longitude')])

    # ------------------------------------------------------------------
    # recommenders
    # ------------------------------------------------------------------

    def getContentRecoms(self, u_id, max_recoms=None, return_map=True):
        """Create a user profile and get top-N content-based filtering
        recommendations based on similarity to unseen restaurants.

        Args:
                u_id: user id
                max_recoms: maximum no. of returned recommendations
                            (None = the class-wide default)
                return_map: if True, return a folium map, otherwise return a dataframe

        Returns: folium map or dataframe

        """
        max_recoms = self._resolveMaxRecoms(max_recoms)

        # no. of similar businesses kept per seed restaurant
        sim_bus_limit = 10

        # filter through the DataFrame API so that u_id is never spliced into SQL text
        user_reviews = spark.table("reviews").filter(col("user_id") == u_id)

        # everything the user has reviewed is excluded from the recommendations
        reviewed_ids = {row.business_id
                        for row in user_reviews.select("business_id").distinct().collect()}

        # restaurants previously rated 3+ by the user
        liked_df = user_reviews.filter(col("stars") >= 3.0).select("business_id").distinct()

        # from these get a random sample of up to 5 seed restaurants; the sample is
        # collected exactly once because it is re-drawn on every evaluation
        seed_rows = liked_df.sample(False, 0.5).limit(5).collect()
        if not seed_rows:
            # the random sample can be empty for users with few reviews
            seed_rows = liked_df.limit(5).collect()
        seed_ids = [row.business_id for row in seed_rows]

        vec_by_id = dict(MyYelper.all_business_vecs)

        # best score per candidate business over all seeds
        best_scores = {}
        for seed_id in seed_ids:
            seed_vec = vec_by_id.get(seed_id)
            if seed_vec is None:
                # no vector available for this business
                continue
            for b_id, score in self._rankBySimilarity(seed_vec, sim_bus_limit, skip_ids=reviewed_ids):
                if b_id not in best_scores or score > best_scores[b_id]:
                    best_scores[b_id] = score

        top_pairs = sorted(best_scores.items(), key=lambda pair: pair[1], reverse=True)[:max_recoms]

        # order after the join so that a later limit() keeps the best rows
        df = self.getBusinessDetails(self._scoresToDF(top_pairs)) \
                 .orderBy("score", ascending=False)

        df = df.withColumn('map_marker_colour', lit('green')) \
               .withColumn('recom_type', lit('Content'))

        return self._mapOrFrame(df, return_map)

    def getKeyWordsRecoms(self, key_words, max_recoms=None, return_map=True):
        """Transform the search text to a vector and get the top-N businesses
        whose review vectors are most similar to it.

        Args:
                key_words: key words or search text entered by user
                max_recoms: maximum no. of returned recommendations
                            (None = the class-wide default)
                return_map: if True, return a folium map, otherwise return a dataframe

        Returns: folium map or dataframe

        """
        max_recoms = self._resolveMaxRecoms(max_recoms)

        input_words_df = spark.createDataFrame([(0, key_words)], ['business_id', 'text'])

        # transform the key words to vectors
        input_words_df = MyYelper.pipeline_mdl.transform(input_words_df)

        # select the word2vec vector of the search text
        input_key_words_vec = input_words_df.select('word_vec').collect()[0][0]

        # top-N most similar businesses
        top_pairs = self._rankBySimilarity(input_key_words_vec, max_recoms)

        # order after the join so that a later limit() keeps the best rows
        df = self.getBusinessDetails(self._scoresToDF(top_pairs)) \
                 .orderBy("score", ascending=False)

        df = df.withColumn('map_marker_colour', lit('red')) \
               .withColumn('recom_type', lit('Key Words'))

        return self._mapOrFrame(df, return_map)

    def trainALS(self):
        """Train and save the ALS model. Used for collaborative filtering.

        Args:

        Returns:

        """
        rating_df = spark.sql("SELECT userId, businessId, stars FROM reviews")
        rating_df = rating_df.select('userId', 'businessId', rating_df.stars.cast('float').alias('rating'))

        (train, test) = rating_df.randomSplit([0.8, 0.2], seed=123)

        alsb = ALS(rank=20, maxIter=20, regParam=0.3, userCol="userId", itemCol="businessId",
                   ratingCol="rating", coldStartStrategy="drop", seed=123)

        alsb_model = alsb.fit(train)
        alsb_predictions = alsb_model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
        rmse = evaluator.evaluate(alsb_predictions)
        print("Root-mean-square error = " + str(rmse))

        # save the ALS model
        alsb_model.write().overwrite().save(MyYelper.model_path + 'als')

    def createALSrecoms(self, max_recoms=None):
        """Load the ALS model and prepare the collaborative filtering
        recommendations for all users (kept in ``MyYelper.all_userRecoms``).

        Args:
            max_recoms: maximum no. of returned recommendations per user
                        (None = the class-wide default)

        Returns:

        """
        max_recoms = self._resolveMaxRecoms(max_recoms)

        # load a new instance of the saved ALS model
        alsn_model = ALSModel.load(MyYelper.model_path + 'als')

        userRecoms = alsn_model.recommendForAllUsers(max_recoms)

        # add the original string user_id next to the numeric userId
        a = userRecoms.alias("a")
        b = MyYelper.user_newid_df.alias("b")

        all_userRecoms = a.join(b, col("a.userId") == col("b.userId"), 'inner') \
                          .select([col('a.' + xx) for xx in a.columns] + [col('b.user_id')])

        MyYelper.all_userRecoms = all_userRecoms
        MyYelper.all_userRecoms.cache()
        print('no of collaborative recoms: ', MyYelper.all_userRecoms.count())

    def getCollabrRecoms(self, u_id, return_map=True):
        """Get top-N collaborative filtering recommendations for a specific user.

        Args:
                u_id: user id
                return_map: if True, return a folium map, otherwise return a dataframe

        Returns: folium map or dataframe (empty when the model has no
                 recommendations for the user)

        """
        # explode the array of (businessId, rating) structs; unlike building a
        # dataframe from an RDD this also works when the user has no recommendations
        userFlatRec = MyYelper.all_userRecoms \
            .filter(col('user_id') == u_id) \
            .select(explode(col('recommendations')).alias('rec')) \
            .select(col('rec.businessId').alias('businessId'), col('rec.rating').alias('rating'))

        a = userFlatRec.alias("a")
        b = MyYelper.business_df.alias("b")

        df = a.join(b, col("a.businessId") == col("b.businessId"), 'inner') \
              .select([col('b.business_id'), col('a.rating'), col('b.business_name'), col('b.categories'),
                       col('b.stars'), col('b.review_count'),
                       col('b.latitude'), col('b.longitude')]) \
              .orderBy("rating", ascending=False)

        df = df.withColumnRenamed('rating', 'score') \
               .withColumn('map_marker_colour', lit('blue')) \
               .withColumn('recom_type', lit('Collaborative'))

        return self._mapOrFrame(df, return_map)

    def getFriendsRecoms(self, u_id, max_recoms=None, return_map=True):
        """Get top-N friends recommendations for a specific user: businesses
        rated 4+ by the user's friends that the user has not reviewed yet,
        scored by the number of such reviews.

        Args:
                u_id: user id
                max_recoms: maximum no. of returned recommendations
                            (None = the class-wide default)
                return_map: if True, return a folium map, otherwise return a dataframe

        Returns: folium map or dataframe

        """
        max_recoms = self._resolveMaxRecoms(max_recoms)

        # built with the DataFrame API so that u_id is never spliced into SQL text
        reviews = spark.table("reviews")

        # friends of the user that are part of the users data set
        f = spark.table("friends").alias("f")
        u = spark.table("users").alias("u")
        friend_ids = f.filter(col("f.user_id") == u_id) \
                      .join(u, col("f.friend_id") == col("u.user_id"), 'inner') \
                      .select(col("f.friend_id").alias("friend_id"))

        # businesses the user has already reviewed
        own_business_ids = reviews.filter(col("user_id") == u_id) \
                                  .select(col("business_id").alias("own_business_id"))

        # semi join = "user_id in (friends)", anti join = "business_id not in (own)"
        friend_recoms_df = reviews.filter(col("stars") >= 4) \
            .join(friend_ids, col("user_id") == col("friend_id"), 'left_semi') \
            .join(own_business_ids, col("business_id") == col("own_business_id"), 'left_anti') \
            .groupBy("business_id").count() \
            .withColumnRenamed("count", "score") \
            .orderBy(col("score").desc()) \
            .limit(100)

        # random subset of the 100 most popular candidates
        a = friend_recoms_df.sample(False, 0.5).limit(max_recoms)

        df = self.getBusinessDetails(a).orderBy("score", ascending=False)

        df = df.withColumn('map_marker_colour', lit('orange')) \
               .withColumn('recom_type', lit('Friend'))

        return self._mapOrFrame(df, return_map)

    def getHybridRecoms(self, u_id, contentMax=4, collabrMax=4, friendsMax=2, return_map=True):
        """Get hybrid recommendations for a specific user by selecting top-N
        from each sub-recommender.

        Args:
                u_id: user id
                contentMax: maximum no. of returned content-based filtering recommendations
                collabrMax: maximum no. of returned collaborative filtering recommendations
                friendsMax: maximum no. of returned friends network recommendations
                return_map: if True, return a folium map, otherwise return a dataframe

        Returns: folium map or dataframe

        """
        contentRecoms = self.getContentRecoms(u_id, return_map=False)
        collabrRecoms = self.getCollabrRecoms(u_id, return_map=False)
        friendsRecoms = self.getFriendsRecoms(u_id, return_map=False)

        # the three frames share the same column order, so a positional union is safe
        df = contentRecoms.limit(contentMax) \
                          .union(collabrRecoms.limit(collabrMax)) \
                          .union(friendsRecoms.limit(friendsMax))

        return self._mapOrFrame(df, return_map)

In [11]:
# initialize the hybrid engine, load the input data, saved ML models and businesses vectors, and prepare collaborative recommendations

myyelper = MyYelper(10)
myyelper.loadInputData()
myyelper.loadVecsAndModels()
myyelper.createALSrecoms()

no of users:  66424
no of businesses:  6750
no of collaborative recoms:  59131


In [12]:
# test key-word search recommendations - return map (return_map defaults to True)

key_words = 'cheese burger and steak'

myyelper.getKeyWordsRecoms(key_words)


In [13]:
# test content-based filtering recommendations for a specific user - return map

uid = 'Wc5L6iuvSNF5WGBlqIO8nw'

myyelper.getContentRecoms(uid)


In [14]:
# test collaborative filtering recommendations for a specific user - return map

uid = 'Wc5L6iuvSNF5WGBlqIO8nw'

myyelper.getCollabrRecoms(uid)

In [15]:
# test collaborative recommendations for a specific user - return dataframe

uid = 'Wc5L6iuvSNF5WGBlqIO8nw'

dfcol = myyelper.getCollabrRecoms(uid, return_map=False)
dfcol.toPandas()

Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude,map_marker_colour,recom_type
0,LcIgUlWaJJwtOfPoPWCmBg,5.566549,Souppe Shoppe,"[Restaurants, Street Vendors, Food, Soup, Food...",5.0,4,43.651425,-79.404123,blue,Collaborative
1,mpDcuUs6dB5uBsYVKDWCNQ,5.435555,Druxy's Famous Deli,"[Restaurants, Sandwiches, Delis, Breakfast & B...",4.0,4,43.648235,-79.379525,blue,Collaborative
2,otsjAjxf0PNQ99xcmuj_LA,5.164353,Sushi Making For the Soul,"[Restaurants, Japanese]",4.5,3,43.656233,-79.392319,blue,Collaborative
3,v_OLzcpFA7vgVp30vxv2uQ,5.145496,Silver Spoon,"[Restaurants, American (New), Canadian (New)]",5.0,4,43.650883,-79.450832,blue,Collaborative
4,vAz5pelrjwkpMDo_OHCDAg,5.115032,Kuya Willie's Kainan,"[Breakfast & Brunch, Filipino, Restaurants]",3.5,3,43.759288,-79.310866,blue,Collaborative
5,LIjlU7K-0SPXPtYFQiXamQ,5.078496,Magic Oven,"[Food Stands, Sandwiches, Restaurants, Indian]",5.0,3,43.652294,-79.405521,blue,Collaborative
6,3CSypkv-tJsf-CzXc3qoXQ,5.049433,Village Meat Products & Deli,"[Specialty Food, Meat Shops, Restaurants, Food...",5.0,5,43.6513,-79.450729,blue,Collaborative
7,9GLN1xfck07CKfNfejKCwg,5.047345,T-Sushi,"[Food, Restaurants, Sushi Bars, Food Delivery ...",5.0,13,43.644745,-79.390892,blue,Collaborative
8,VBMJjX1rPuwVvzTApRJZAA,4.998397,Hot Pot Restaurant,"[Caribbean, Restaurants, Food]",5.0,12,43.697467,-79.441352,blue,Collaborative
9,1VAsBosvx02jpvIUxiKvmg,4.979173,The Dumpling Shop,"[Restaurants, Specialty Food, Chinese, Dim Sum...",4.5,11,43.767971,-79.401363,blue,Collaborative


In [16]:
# test friends network recommendations for a specific user - return map


uid = 'Wc5L6iuvSNF5WGBlqIO8nw'

myyelper.getFriendsRecoms(uid)

In [17]:
# test hybrid recommendations for a specific user

uid = 'Wc5L6iuvSNF5WGBlqIO8nw'

myyelper.getHybridRecoms(uid)