## 使用 Spark MLlib 的 ALS 算法对 MovieLens 数据集进行电影推荐

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session that runs against the "local[3]" master.
spark = (
    SparkSession.builder
    .appName("SparkSessionMoive")
    .master("local[3]")
    .getOrCreate()
)


In [4]:
# Get the SparkContext instance that backs the session
sc = spark.sparkContext
# Bare expression so the notebook displays the context
sc

### 读取数据

In [11]:
# Read the ratings file into an RDD with one element per text line
raw_rating_rdd = sc.textFile("ml-100k/u.data")

In [13]:
# Count the lines in the ratings file
raw_rating_rdd.count()

100000

In [14]:
# Show the first raw line; its fields are tab-separated
raw_rating_rdd.first()

'196\t242\t3\t881250949'

### 提取每行的前三个字段（用户、电影、评分），构建Rating对象

In [15]:
# Split each tab-separated line and keep only its first three fields
rating_rdd = raw_rating_rdd.map(lambda record: record.split('\t')[0:3])

In [16]:
# Inspect the first split record (still a list of strings at this point)
rating_rdd.first()

['196', '242', '3']

In [17]:
from pyspark.mllib.recommendation import Rating,ALS,MatrixFactorizationModel

In [18]:
# Convert the three string fields into a typed Rating(user, product, rating)
rating_datas = rating_rdd.map(
    lambda fields: Rating(
        user=int(fields[0]),
        product=int(fields[1]),
        rating=float(fields[2]),
    )
)

In [19]:
# Inspect the first Rating record
rating_datas.first()

Rating(user=196, product=242, rating=3.0)

In [20]:
# How many distinct movies appear in the ratings?
(
    rating_datas
    .map(lambda rating: rating.product)
    .distinct()
    .count()
)

1682

In [21]:
# How many distinct users appear in the ratings?
(
    rating_datas
    .map(lambda rating: rating.user)
    .distinct()
    .count()
)


943

In [22]:
# Train an ALS model on the explicit ratings
# (rank 10, 10 iterations, regularization 0.01).
alsModel = ALS.train(rating_datas, rank=10, iterations=10, lambda_=0.01)


### 因子矩阵

In [23]:
# User factor matrix: (user id, feature array) pairs learned by the model
user_feature_matrix = alsModel.userFeatures()

In [24]:
# Preview the first 10 user factor rows
user_feature_matrix.take(10)

[(3,
  array('d', [0.5679934620857239, 1.1584008932113647, -0.01744152419269085, -0.2927170395851135, 1.2023059129714966, -0.8768662214279175, -0.5001089572906494, 0.23603278398513794, -0.5952816605567932, -0.044205863028764725])),
 (6,
  array('d', [0.24791546165943146, 0.37334519624710083, 0.3552496135234833, -0.954612672328949, 1.1089383363723755, 0.24255892634391785, 0.345123291015625, -0.44978687167167664, -0.03453666716814041, 0.7479900121688843])),
 (9,
  array('d', [0.07044503092765808, -0.45361030101776123, 0.27649033069610596, 0.7094711065292358, 1.623976707458496, -0.9232061505317688, -0.06899941712617874, -0.6374589800834656, -1.3819851875305176, 1.4251552820205688])),
 (12,
  array('d', [-0.4170413911342621, 0.1435934454202652, -0.08222322165966034, 0.37153246998786926, 2.1562912464141846, 0.4075556993484497, -0.4384736120700836, -0.7426102757453918, 0.1304176300764084, 0.6345940232276917])),
 (15,
  array('d', [-1.7315791845321655, 0.8038485050201416, 0.45321497321128845,

In [25]:
# Item (movie) factor matrix: (product id, feature array) pairs
items_feature_matrix = alsModel.productFeatures()
# Preview the first 10 item factor rows
items_feature_matrix.take(10)

[(3,
  array('d', [1.7671175003051758, 0.6949833631515503, -0.37843751907348633, 0.8160123229026794, 1.1652936935424805, -0.44613760709762573, 0.9945740103721619, -0.9841098785400391, -0.1395951509475708, 0.8973357081413269])),
 (6,
  array('d', [1.0796066522598267, 0.8354930281639099, -0.7928211688995361, -0.08586278557777405, 2.7411115169525146, 0.7135393023490906, -1.3845781087875366, -2.487330675125122, -0.03515879437327385, -0.24230197072029114])),
 (9,
  array('d', [0.5038389563560486, 0.7315208911895752, 0.6264982223510742, -0.6882349848747253, 1.8037089109420776, -0.6926113367080688, -0.32757997512817383, -0.8761651515960693, -8.400730439461768e-05, 0.7704496383666992])),
 (12,
  array('d', [0.1346028447151184, 0.0978146642446518, 0.031234920024871826, -0.6149269342422485, 2.060476303100586, -0.6492595076560974, 0.5750464797019958, -0.8115350008010864, 0.34140002727508545, 1.2026755809783936])),
 (15,
  array('d', [-0.42976143956184387, 0.5683112144470215, 0.3047786056995392, 0

### 预测电影的评分

In [27]:
# Suppose we want user 196's rating for movie 242.
# The bare print() only emits a blank line; it is kept so the cell's
# output stays the same.
print()




In [28]:
# Predicted rating of user 196 for product (movie) 242
predictRating = alsModel.predict(196,242)

In [29]:
# Display the predicted score
predictRating

3.7579470665607837

### 为用户推荐10个电影

In [30]:
# Top 10 products (movies) recommended for user 196
alsModel.recommendProducts(196,10)

[Rating(user=196, product=697, rating=8.848426459912666),
 Rating(user=196, product=1195, rating=8.411406532002296),
 Rating(user=196, product=1159, rating=8.366264062379887),
 Rating(user=196, product=1164, rating=8.311640815652936),
 Rating(user=196, product=1185, rating=8.296127731458164),
 Rating(user=196, product=320, rating=8.229154024591214),
 Rating(user=196, product=962, rating=8.17995963621281),
 Rating(user=196, product=1313, rating=8.06875255927036),
 Rating(user=196, product=1470, rating=7.956241414325695),
 Rating(user=196, product=6, rating=7.9173682522716184)]

 ### 为电影推荐10个用户


In [31]:
# Top 10 users recommended for product (movie) 242
rmdUsers = alsModel.recommendUsers(242,10)

In [32]:
# Display the recommended users
rmdUsers

[Rating(user=98, product=242, rating=7.518467465424361),
 Rating(user=362, product=242, rating=6.705834625655482),
 Rating(user=928, product=242, rating=5.702171610993095),
 Rating(user=242, product=242, rating=5.5542468273521015),
 Rating(user=547, product=242, rating=5.525047844693219),
 Rating(user=212, product=242, rating=5.451140004742371),
 Rating(user=765, product=242, rating=5.324843534669615),
 Rating(user=563, product=242, rating=5.285712673350488),
 Rating(user=675, product=242, rating=5.271586266837323),
 Rating(user=443, product=242, rating=5.271164127631113)]

### 找到最佳模型

In [33]:
# How do we find the best model?
#   a. Model evaluation: compute the RMSE.
#   b. Model optimization:
#      i.  data
#      ii. hyper-parameters -- search for the best model,
#          e.g. with k-fold cross-validation
# The bare print() only emits a blank line; it is kept so the cell's
# output stays the same.

print()


