In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import *


from pyspark.sql import Row

from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

import folium
import html

### Data Loading

In [3]:
# Business DataFrame: read the parquet file and print the columns it provides.
business_df = spark.read.format('parquet').load('business-small.parquet')

business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [4]:
# User DataFrame: read the parquet file and print the columns it provides.
# (A stray `d` in front of the path literal made this cell a SyntaxError.)
user_df = spark.read.parquet('user-small.parquet')

user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [5]:
# Review DataFrame: read the parquet file and print the columns it provides.
review_df = spark.read.format('parquet').load('review-small.parquet')

review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



#### Spark ALS ожидает на входе данные со следующей схемой:

```
ratings_df_schema = StructType(
[StructField('userId', IntegerType()),
 StructField('businessId', IntegerType()),
 StructField('rating', DoubleType())]
)
```

Сделаем маппинг строковых user_id и business_id на целочисленные идентификаторы userId и businessId.

In [9]:
# Build the userId column required by the ratings structure:
# pair the first column of every user row (user_id) with its zipWithIndex position.
user_id_map_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("userId", IntegerType(), True),
])
indexed_user_ids = user_df.rdd.map(lambda row: row[0]).zipWithIndex()
user_newid_df = sqlContext.createDataFrame(indexed_user_ids, user_id_map_schema)

user_newid_df.show(2)

+--------------------+------+
|             user_id|userId|
+--------------------+------+
|om5ZiponkpRqUNa3p...|     0|
|Wc5L6iuvSNF5WGBlq...|     1|
+--------------------+------+
only showing top 2 rows



In [10]:
# Add the integer userId to the existing user data.
a = user_df.alias("a")
b = user_newid_df.alias("b")

# Keep every original user column and append the mapped userId.
user_columns = [col('a.' + name) for name in a.columns]
user_new_df = (
    a.join(b, col("a.user_id") == col("b.user_id"), 'inner')
     .select(user_columns + [col('b.userId')])
)

user_new_df.select('userId', 'user_id', 'user_name').show(2)

+------+--------------------+---------+
|userId|             user_id|user_name|
+------+--------------------+---------+
|     0|om5ZiponkpRqUNa3p...|   Andrea|
|     1|Wc5L6iuvSNF5WGBlq...|     Risa|
+------+--------------------+---------+
only showing top 2 rows



In [11]:
# Build the businessId mapping: pair the first column of every business row
# (business_id) with its zipWithIndex position.
# The original wrapped the call in an extra "(" that was never closed, so the
# cell did not parse; the redundant outer parenthesis is removed.
business_newid_df = sqlContext.createDataFrame(
    business_df.rdd.map(lambda x: x[0]).zipWithIndex(),
    StructType([StructField("business_id", StringType(), True),
                StructField("businessId", IntegerType(), True)])
)

business_newid_df.show(2)

+--------------------+----------+
|         business_id|businessId|
+--------------------+----------+
|qim0lD112TkDhm8Zy...|         0|
|Wf5C8Amv_SlhoYE3_...|         1|
+--------------------+----------+
only showing top 2 rows



In [12]:
# Repeat the same step for businesses: add the integer businessId to the business data.
a = business_df.alias("a")
b = business_newid_df.alias("b")

# Keep every original business column and append the mapped businessId.
business_columns = [col('a.' + name) for name in a.columns]
business_new_df = (
    a.join(b, col("a.business_id") == col("b.business_id"), 'inner')
     .select(business_columns + [col('b.businessId')])
)

business_new_df.select('businessId', 'business_id', 'business_name').show(2)

+----------+--------------------+--------------------+
|businessId|         business_id|       business_name|
+----------+--------------------+--------------------+
|         0|qim0lD112TkDhm8Zy...|McCarthy's Irish Pub|
|         1|Wf5C8Amv_SlhoYE3_...|         Oishi Sushi|
+----------+--------------------+--------------------+
only showing top 2 rows



In [13]:
# Map userId and businessId onto the review DataFrame.

# Keep only the two string identifiers and the star rating.
review_df = review_df.select('user_id', 'business_id', 'stars')

# Step 1: append userId through an inner join with the user id mapping.
a = review_df.alias("a")
b = user_newid_df.alias("b")

review_columns = [col('a.' + name) for name in a.columns]
review_userId_df = (
    a.join(b, col("a.user_id") == col("b.user_id"), 'inner')
     .select(review_columns + [col('b.userId')])
)

# Step 2: append businessId through an inner join with the business id mapping.
a = review_userId_df.alias("a")
b = business_newid_df.alias("b")

review_user_columns = [col('a.' + name) for name in a.columns]
review_userId_businessId_df = (
    a.join(b, col("a.business_id") == col("b.business_id"), 'inner')
     .select(review_user_columns + [col('b.businessId')])
)

review_userId_businessId_df.show(2)

+--------------------+--------------------+-----+------+----------+
|             user_id|         business_id|stars|userId|businessId|
+--------------------+--------------------+-----+------+----------+
|u642WP1g6Z3oRA9qd...|1RFIVcZYV77tGIwVV...|    5| 23561|       872|
|CGmWH1Nwx1hbasHqo...|1RFIVcZYV77tGIwVV...|    4|  6268|       872|
+--------------------+--------------------+-----+------+----------+
only showing top 2 rows



### Collaborative Filtering

In [16]:
# Create the DataFrame for ALS: the two integer ids plus stars cast to a float `rating`.
rating_col = review_userId_businessId_df.stars.cast('float').alias('rating')
rating_df = review_userId_businessId_df.select('userId', 'businessId', rating_col)

rating_df.show(2)
print(' Rating matrx no. of rows :', rating_df.count())
rating_df.printSchema()

+------+----------+------+
|userId|businessId|rating|
+------+----------+------+
| 23561|       872|   5.0|
|  6268|       872|   4.0|
+------+----------+------+
only showing top 2 rows

 Rating matrx no. of rows : 276887
root
 |-- userId: integer (nullable = true)
 |-- businessId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [15]:
# Preview the first five ratings as a pandas table.
rating_preview = rating_df.limit(5)
rating_preview.toPandas()

Unnamed: 0,userId,businessId,rating
0,23561,872,5.0
1,6268,872,4.0
2,8646,872,4.0
3,531,4253,3.0
4,2217,4253,5.0


In [15]:
# Train/test split: weights 0.8 / 0.2 with a fixed seed.
train, test = rating_df.randomSplit(weights=[0.8, 0.2], seed=123)

In [None]:
# Cross-validated grid search for ALS over rank and maxIter.

# The estimator is seeded so the search is repeatable between runs
# (the original left ALS unseeded while seeding only the CrossValidator).
als = ALS(userCol="userId", itemCol="businessId", ratingCol="rating",
          coldStartStrategy="drop", seed=123)

# 3 x 3 grid: every combination of rank and maxIter below is fitted.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 15, 20])
    .addGrid(als.maxIter, [10, 15, 20])
    .build()
)

# One RMSE evaluator is shared by the cross-validation and the final test score.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=123)
cv_als_model = cv.fit(train)

best_model = cv_als_model.bestModel

# RMSE of the best model on the held-out test split.
als_predictions = best_model.transform(test)
rmse = evaluator.evaluate(als_predictions)
print("Root-mean-square error = " + str(rmse))

# Report the winning hyper-parameters.
# avgMetrics[i] is the mean cross-validation metric for param_grid[i]; the metric
# is RMSE, so the smallest value marks the selected combination. This replaces a
# bare `best_model.rank` expression (never displayed mid-cell) and a lookup
# through the private `_java_obj` handle.
best_params = param_grid[int(np.argmin(cv_als_model.avgMetrics))]
print("Best rank = " + str(best_params[als.rank]))
print("Best maxIter = " + str(best_params[als.maxIter]))


In [None]:
# Model tuning.
# A large rank and a large maxIter value require a lot of resources and time,
# so start by tuning regParam (default value = 0.1).

alsb = ALS(
    rank=20,
    maxIter=20,
    regParam=0.3,
    userCol="userId",
    itemCol="businessId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=123,
)
alsb_model = alsb.fit(train)

# Score the fitted model on the test split with RMSE.
alsb_predictions = alsb_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(alsb_predictions)
print("Root-mean-square error = " + str(rmse))

# Save the ALS model, overwriting anything already stored at this path.
alsb_model.write().overwrite().save('als')



In [17]:
# Load the ALS model from the path the training cell saved it to ('als').
# The original prepended `model_path`, which is not defined in the visible
# notebook and did not match the save path, so the load failed on a fresh kernel.
# NOTE(review): if models are meant to live under a dedicated directory, define
# that prefix once and use it for both save and load.
alsn_model = ALSModel.load('als')

In [18]:
# Generate the top 10 recommendations for every user.
userRecoms = alsn_model.recommendForAllUsers(numItems=10)


In [19]:
# Attach the original string user_id to each row of recommendations.
a = userRecoms.alias("a")
b = user_newid_df.alias("b")

# Keep every recommendation column and append the mapped user_id.
recom_columns = [col('a.' + name) for name in a.columns]
all_userRecoms = (
    a.join(b, col("a.userId") == col("b.userId"), 'inner')
     .select(recom_columns + [col('b.user_id')])
)

# Mark the joined result for caching, then show one full (untruncated) row.
all_userRecoms.cache()
all_userRecoms.show(1, truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|userId|recommendations                                                                                                                                                             |user_id               |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|148   |[[4592,4.6922364], [2432,4.614214], [401,4.4767885], [6596,4.439251], [2848,4.3852043], [810,4.359094], [4267,4.340767], [1410,4.307626], [4390,4.2542963], [4408,4.245825]]|_K9sKlA4fVkWI4hyGSpoPA|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Try the recommendations out for a single user.
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

# Take this user's row, flatten its second column (the recommendations array)
# into one record per recommended business, and turn that back into a DataFrame.
user_rows = all_userRecoms.filter(col('user_id') == u_id)
flat_recommendations = user_rows.rdd.flatMap(lambda row: row[1])
userFlatRec = sqlContext.createDataFrame(flat_recommendations)
userFlatRec.show()

+----------+------------------+
|businessId|            rating|
+----------+------------------+
|      2432| 4.570364475250244|
|      4592| 4.527902126312256|
|      2668| 4.490123271942139|
|       401| 4.438344955444336|
|       810| 4.414823055267334|
|      2028|4.4012932777404785|
|      3237| 4.399106025695801|
|      4133| 4.392766952514648|
|      4408| 4.377004623413086|
|      5664| 4.362335205078125|
+----------+------------------+



In [21]:
# Look at the recommendations together with the business details.
a = business_new_df.alias("a")
b = userFlatRec.alias("b")

# Keep every business column and append the predicted rating.
business_detail_columns = [col('a.' + name) for name in a.columns]
user_collab_df = (
    a.join(b, col("a.businessId") == col("b.businessId"), 'inner')
     .select(business_detail_columns + [col('b.rating')])
)

user_collab_df.select('business_id', 'business_name', 'rating', 'categories').toPandas()

Unnamed: 0,business_id,business_name,rating,categories
0,2H5EaBEreDzzP7sPmD_oDQ,Vila Verde,4.362335,"[Restaurants, Event Planning & Services, Portu..."
1,mpDcuUs6dB5uBsYVKDWCNQ,Druxy's Famous Deli,4.527902,"[Restaurants, Sandwiches, Delis, Breakfast & B..."
2,1VAsBosvx02jpvIUxiKvmg,The Dumpling Shop,4.490123,"[Restaurants, Specialty Food, Chinese, Dim Sum..."
3,XKa5R1lJSvNrbo8InhNliQ,Toronto Star Food Building,4.399106,"[Food, Fast Food, Restaurants]"
4,LcIgUlWaJJwtOfPoPWCmBg,Souppe Shoppe,4.570364,"[Restaurants, Street Vendors, Food, Soup, Food..."
5,vAz5pelrjwkpMDo_OHCDAg,Kuya Willie's Kainan,4.414823,"[Breakfast & Brunch, Filipino, Restaurants]"
6,9GLN1xfck07CKfNfejKCwg,T-Sushi,4.438345,"[Food, Restaurants, Sushi Bars, Food Delivery ..."
7,y9yeMK6N0UINVECI3Ijz3Q,Hot Dog Stand,4.401293,"[Hot Dogs, Restaurants]"
8,LIjlU7K-0SPXPtYFQiXamQ,Magic Oven,4.392767,"[Food Stands, Sandwiches, Restaurants, Indian]"
9,fxRcHzovnRyWh_WMdQoNOQ,Taj Restaurant,4.377005,"[Restaurants, Russian, Mediterranean]"


In [23]:
def showInMap(df, center=(43.70011, -79.4163), zoom_start=12):
    """Plot every business in ``df`` as a blue marker on a folium map.

    Parameters
    ----------
    df : Spark DataFrame
        Must provide the columns ``latitude``, ``longitude``, ``business_name``,
        ``stars`` and ``review_count``. It is collected with ``toPandas()``,
        so it should be small.
    center : (float, float), optional
        Latitude/longitude the map is initially centred on. The default keeps
        the previously hard-coded location.
    zoom_start : int, optional
        Initial zoom level; the default keeps the previously hard-coded value.

    Returns
    -------
    folium.Map
        The map with one marker per plottable row.
    """
    mp = folium.Map(location=list(center), zoom_start=zoom_start)

    for _, r in df.toPandas().iterrows():
        # A row without coordinates cannot be placed on the map; skip it
        # instead of letting the marker construction fail for the whole map.
        if pd.isna(r.latitude) or pd.isna(r.longitude):
            continue

        # business_name is nullable; html.escape() only accepts strings.
        name = '' if pd.isna(r["business_name"]) else str(r["business_name"])
        popup = html.escape(name) + '<br>' + 'Stars: ' + str(r.stars) + '<br>' + 'Reviews: ' + str(r.review_count)

        folium.Marker(
            location=[r.latitude, r.longitude],
            popup=popup,
            icon=folium.Icon(color='blue'),
        ).add_to(mp)

    return mp

In [24]:
def getCollabRecom(u_id):
    """Return the stored recommendations for one user, joined with business details.

    Parameters
    ----------
    u_id : str
        Original string ``user_id`` to look up in ``all_userRecoms``.

    Returns
    -------
    Spark DataFrame
        Columns ``business_id``, ``rating``, ``business_name``, ``categories``,
        ``stars``, ``review_count``, ``latitude``, ``longitude``, ordered by
        ``rating`` descending. Empty when ``u_id`` has no recommendations.
    """
    from pyspark.sql.functions import explode

    # Flatten the recommendations array on the DataFrame side. The previous
    # RDD round trip (flatMap + createDataFrame) raised for an unknown user,
    # because a schema cannot be inferred from an empty RDD.
    userFlatRec = (
        all_userRecoms.filter(col('user_id') == u_id)
        .select(explode(col('recommendations')).alias('rec'))
        .select(col('rec.businessId').alias('businessId'),
                # Cast keeps `rating` a double, as the Row-based inference produced before.
                col('rec.rating').cast('double').alias('rating'))
    )

    a = userFlatRec.alias("a")
    b = business_new_df.alias("b")

    return a.join(b, col("a.businessId") == col("b.businessId"), 'inner') \
             .select([col('b.business_id'), col('a.rating'), col('b.business_name'), col('b.categories'),
                      col('b.stars'), col('b.review_count'),
                      col('b.latitude'), col('b.longitude')]) \
             .orderBy("rating", ascending=False)
    

In [25]:
# Recommendations for a sample user, shown as a pandas table.
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

collab_recs = getCollabRecom(u_id)
collab_recs.toPandas()

Unnamed: 0,business_id,rating,business_name,categories,stars,review_count,latitude,longitude
0,LcIgUlWaJJwtOfPoPWCmBg,4.570364,Souppe Shoppe,"[Restaurants, Street Vendors, Food, Soup, Food...",5.0,4,43.651425,-79.404123
1,mpDcuUs6dB5uBsYVKDWCNQ,4.527902,Druxy's Famous Deli,"[Restaurants, Sandwiches, Delis, Breakfast & B...",4.0,4,43.648235,-79.379525
2,1VAsBosvx02jpvIUxiKvmg,4.490123,The Dumpling Shop,"[Restaurants, Specialty Food, Chinese, Dim Sum...",4.5,11,43.767971,-79.401363
3,9GLN1xfck07CKfNfejKCwg,4.438345,T-Sushi,"[Food, Restaurants, Sushi Bars, Food Delivery ...",5.0,13,43.644745,-79.390892
4,vAz5pelrjwkpMDo_OHCDAg,4.414823,Kuya Willie's Kainan,"[Breakfast & Brunch, Filipino, Restaurants]",3.5,3,43.759288,-79.310866
5,y9yeMK6N0UINVECI3Ijz3Q,4.401293,Hot Dog Stand,"[Hot Dogs, Restaurants]",4.0,3,43.681236,-79.377222
6,XKa5R1lJSvNrbo8InhNliQ,4.399106,Toronto Star Food Building,"[Food, Fast Food, Restaurants]",4.5,3,43.632265,-79.420313
7,LIjlU7K-0SPXPtYFQiXamQ,4.392767,Magic Oven,"[Food Stands, Sandwiches, Restaurants, Indian]",5.0,3,43.652294,-79.405521
8,fxRcHzovnRyWh_WMdQoNOQ,4.377005,Taj Restaurant,"[Restaurants, Russian, Mediterranean]",5.0,4,43.696764,-79.446227
9,2H5EaBEreDzzP7sPmD_oDQ,4.362335,Vila Verde,"[Restaurants, Event Planning & Services, Portu...",4.0,4,43.651243,-79.410631


In [26]:
# Recommendations for the same sample user, drawn on the map.
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'
collab_recs_for_map = getCollabRecom(u_id)
showInMap(collab_recs_for_map)