## 使用Spark Ml中的Als算法对MovieLens推荐

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkSessionMoive").master("local[3]").getOrCreate()


In [2]:
# 获取SparkContext 实例对象
sc = spark.sparkContext
sc

### 读取数据

In [15]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
# 指定Schema信息
rating_schema = StructType([StructField("userId", IntegerType(), True),
                           StructField("movieId", IntegerType(), True),
                           StructField("rating", DoubleType(), True),
                           StructField("timestamp", StringType(), True)])

In [16]:
raw_rating_df = spark.read.schema(rating_schema).csv("ml-100k/u.data", sep='\t')

In [17]:
raw_rating_df.count()

100000

In [18]:
raw_rating_df.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|   3.0|881250949|
|   186|    302|   3.0|891717742|
|    22|    377|   1.0|878887116|
|   244|     51|   2.0|880606923|
|   166|    346|   1.0|886397596|
|   298|    474|   4.0|884182806|
|   115|    265|   2.0|881171488|
|   253|    465|   5.0|891628467|
|   305|    451|   3.0|886324817|
|     6|     86|   3.0|883603013|
+------+-------+------+---------+
only showing top 10 rows



In [19]:
raw_rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)



### 使用ALS算法训练模型

In [20]:
# 导入模块
from pyspark.ml.recommendation import ALS

In [22]:
# 构建ALS实例对象
"""
算法中默认值：
rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
                         implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item",
                         ratingCol="rating", nonnegative=False, checkpointInterval=10,
                         intermediateStorageLevel="MEMORY_AND_DISK",
                         finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan"
"""
als = ALS()

In [23]:
# 设置 模型学习器ALS使用DataFrame中那几列数据进行训练
als.setUserCol('userId')
als.setItemCol('movieId')
als.setRatingCol('rating')

ALS_df55b858bfd9

In [24]:
# 设置ALS模型学习器中超参数
"""
ALS 算法来说：
    - 特征数: rank
    - 迭代次数：iterations
"""
als.setRank(10)
als.setMaxIter(15)
als.setImplicitPrefs(False)

ALS_df55b858bfd9

In [26]:
# 使用数据集应用到模型学习器，得到模型（转换器）
als_model = als.fit(raw_rating_df)

In [27]:
print(type(als_model))

<class 'pyspark.ml.recommendation.ALSModel'>


In [28]:
# 
from pyspark.ml.recommendation import ALSModel

In [29]:
# 获取ALSModel中物品因子矩阵
item_factors = als_model.itemFactors
print(type(item_factors))

<class 'pyspark.sql.dataframe.DataFrame'>


In [30]:
# 获取ALSModel中用户因子矩阵
user_factors = als_model.userFactors
print(type(user_factors))

<class 'pyspark.sql.dataframe.DataFrame'>


In [31]:
user_factors.show(10,False)

+---+------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                      |
+---+------------------------------------------------------------------------------------------------------------------------------+
|10 |[-0.6225715, -0.8438259, 0.53652626, -0.87513846, -0.5097637, -1.4014138, 0.16817763, -0.10034704, -0.3019759, -0.7426551]    |
|20 |[-0.8465669, -1.3644813, 1.1620916, -0.3416207, -0.07504084, -0.38107634, 0.07485855, -0.31202236, 0.51366293, -0.5041428]    |
|30 |[-0.6043329, -0.97526544, 0.40774328, -1.1256114, -0.70766014, -1.0653867, -0.10305666, -0.075621665, -0.55529684, -0.7953676]|
|40 |[-0.5652873, -0.825111, 0.36425748, -0.9720523, -0.9000827, -0.7889517, -0.4281875, 0.012759478, 0.60283214, -0.087252714]    |
|50 |[-0.7998391, -0.36401668, -0.34848952, -0.3899193, -1.25849, -1.

### 使用模型（转换器）预测与推荐

In [32]:
# 针对某个用户对某个物品的预测评分:  196 -> 242
test_df = spark.createDataFrame([(196, 242)], ["userId", "movieId"])

predict_df = als_model.transform(test_df)

In [33]:
predict_df.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   196|    242|  3.769358|
+------+-------+----------+



In [34]:
# 使用模型学习器进行推荐
# max number of recommendations for each user
rmd_items = als_model.recommendForAllUsers(5)

In [35]:
rmd_items.show(10, False)

+------+-------------------------------------------------------------------------------------------+
|userId|recommendations                                                                            |
+------+-------------------------------------------------------------------------------------------+
|471   |[[394, 4.932488], [989, 4.916443], [1217, 4.7986455], [353, 4.7336373], [342, 4.71447]]    |
|463   |[[1240, 4.446023], [850, 4.302981], [958, 4.286317], [408, 4.2617993], [114, 4.23605]]     |
|833   |[[1368, 4.810341], [1512, 4.6749277], [320, 4.5731263], [1597, 4.414792], [641, 4.328008]] |
|496   |[[1589, 4.4463654], [75, 4.350847], [1388, 4.152881], [42, 4.144475], [838, 4.133501]]     |
|148   |[[793, 5.023424], [1367, 5.0215263], [169, 4.9850044], [408, 4.9648523], [50, 4.9319296]]  |
|540   |[[1449, 4.9776034], [1642, 4.811192], [1398, 4.742763], [1122, 4.6948037], [1467, 4.61261]]|
|392   |[[1463, 5.634221], [1643, 5.34472], [1449, 5.0971856], [483, 4.9610443], [1398, 4.9

In [36]:
# 使用模型学习器为物品推荐用户（推荐4用户）
als_model.recommendForAllItems(4).show(10, False)

+-------+------------------------------------------------------------------------+
|movieId|recommendations                                                         |
+-------+------------------------------------------------------------------------+
|1580   |[[688, 1.0908505], [589, 1.0816689], [152, 1.0683241], [38, 1.0672272]] |
|471    |[[688, 5.0383673], [507, 4.9686756], [628, 4.8877163], [849, 4.783285]] |
|1591   |[[34, 5.0194407], [204, 4.9763966], [519, 4.9293365], [440, 4.849055]]  |
|1342   |[[928, 4.0866957], [810, 3.892243], [427, 3.8740363], [662, 3.8226976]] |
|463    |[[810, 5.1099505], [157, 4.81986], [270, 4.766249], [770, 4.7517185]]   |
|833    |[[887, 5.040162], [507, 4.6668177], [427, 4.60401], [137, 4.575993]]    |
|1645   |[[628, 5.682767], [928, 5.551585], [507, 5.4509673], [4, 5.439583]]     |
|496    |[[688, 5.6025324], [810, 5.4708586], [152, 5.269155], [732, 5.1957483]] |
|148    |[[127, 5.0616274], [507, 4.958015], [849, 4.704055], [907, 4.6265583]]  |
|108

### 模型保存与加载

In [37]:
# 保存 ALSModel
als_model.save('./als-ml-model')

In [38]:
from pyspark.ml.recommendation import ALSModel
# 加载模型
load_als_model = ALSModel.load('./als-ml-model')

In [39]:
load_als_model\
    .transform(spark.createDataFrame([(196, 242)], ["userId", "movieId"]))\
    .show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   196|    242|  3.769358|
+------+-------+----------+



### 模型评估

In [40]:
# 导入模块
from pyspark.ml.evaluation import RegressionEvaluator

In [42]:
# 使用加载得到的模型进行预测
predict_rating_df = load_als_model.transform(raw_rating_df)

In [43]:
predict_rating_df.show(10, False)

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|196   |242    |3.0   |881250949|3.769358  |
|186   |302    |3.0   |891717742|3.164205  |
|22    |377    |1.0   |878887116|1.1317086 |
|244   |51     |2.0   |880606923|3.354837  |
|166   |346    |1.0   |886397596|2.0713725 |
|298   |474    |4.0   |884182806|4.062204  |
|115   |265    |2.0   |881171488|3.208923  |
|253   |465    |5.0   |891628467|3.7699301 |
|305   |451    |3.0   |886324817|2.9533622 |
|6     |86     |3.0   |883603013|3.507125  |
+------+-------+------+---------+----------+
only showing top 10 rows



In [44]:
evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction')

In [47]:
"""
    metricName = Param(Params._dummy(), "metricName",
                       metric name in evaluation - one of:
                       rmse - root mean squared error (default)
                       mse - mean squared error
                       r2 - r^2 metric
                       mae - mean absolute error.,
                       typeConverter=TypeConverters.toString)
"""
evaluator.evaluate(predict_rating_df, {evaluator.metricName: 'rmse'})

0.7665513159768549

In [48]:
evaluator.evaluate(predict_rating_df, {evaluator.metricName: 'r2'})

0.5362735701331607

In [49]:
spark.stop()