## 使用 Spark MLlib 的 ALS 算法对 MovieLens 数据集进行电影推荐

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, using the local[3] master.
spark = (
    SparkSession.builder
    .appName("SparkSessionMoive")
    .master("local[3]")
    .getOrCreate()
)


In [2]:
# Get the SparkContext instance that belongs to the session.
sc = spark.sparkContext
# Display the context as the cell output.
sc

### 读取数据

In [3]:
# Load the ratings file as an RDD of text lines (path is relative to the working directory).
raw_rating_rdd = sc.textFile("ml-100k/u.data")

In [4]:
# Number of raw lines, i.e. rating records, in the file.
raw_rating_rdd.count()

100000

In [5]:
# Peek at the first raw line to see the field layout (tab-separated).
raw_rating_rdd.first()

'196\t242\t3\t881250949'

### 提取每行的前三个字段（用户、电影、评分），构建 Rating 对象

In [6]:
# Split every tab-separated line and keep only its first three fields.
rating_rdd = raw_rating_rdd.map(
    lambda raw_line: raw_line.split('\t')[0:3]
)

In [7]:
# Check the first parsed record: a list of three strings.
rating_rdd.first()

['196', '242', '3']

In [8]:
from pyspark.mllib.recommendation import Rating,ALS,MatrixFactorizationModel

In [9]:
# Turn the three string fields into a typed Rating record (int user, int product, float rating).
rating_datas = rating_rdd.map(
    lambda fields: Rating(
        user=int(fields[0]),
        product=int(fields[1]),
        rating=float(fields[2]),
    )
)

In [10]:
# Check the first typed Rating record.
rating_datas.first()

Rating(user=196, product=242, rating=3.0)

In [11]:
# How many distinct movies appear in the ratings (the Rating.product field).
rating_datas.map(lambda rating: rating.product).distinct().count()

1682

In [12]:
# How many distinct users appear in the ratings (the Rating.user field).
rating_datas.map(lambda rating: rating.user).distinct().count()


943

In [24]:
# Split the ratings roughly 80/20 into training and test sets.
# A fixed seed keeps the split identical across kernel restarts, so the counts
# and any metric computed on these sets can be reproduced.
training_rating, test_rating = rating_datas.randomSplit([0.8, 0.2], seed=42)
print(training_rating.count())
print(test_rating.count())

80037
19963


In [25]:
# Train an ALS model on the training ratings: rank 10, 10 iterations, lambda 0.01.
# The seed fixes ALS's random starting point so a rerun yields the same model
# (and the same predictions and recommendations) for the same training data.
alsModel = ALS.train(training_rating, 10, iterations=10, lambda_=0.01, seed=42)


### 因子矩阵

In [26]:
# User factor matrix: one (user id, feature array) entry per user.
user_feature_matrix = alsModel.userFeatures()

In [27]:
# Show ten (user id, feature array) entries.
user_feature_matrix.take(10)

[(3,
  array('d', [0.26273101568222046, -0.7966914772987366, 0.5379903316497803, 0.6863815784454346, 1.4810526371002197, 1.0480250120162964, -1.002285122871399, 0.8649678230285645, 1.0076770782470703, -0.409518301486969])),
 (6,
  array('d', [-0.20214079320430756, -0.6967804431915283, -0.5829652547836304, 0.7341719269752502, 1.1057270765304565, 0.40645018219947815, -0.15274442732334137, 0.045949287712574005, -0.36013802886009216, 0.5093827247619629])),
 (9,
  array('d', [-1.3699228763580322, -0.43036577105522156, -0.36677050590515137, 0.16159939765930176, 0.7613000273704529, 1.4941519498825073, 0.15905465185642242, 1.0225193500518799, 1.525156021118164, 0.7727831602096558])),
 (12,
  array('d', [-0.1039961501955986, -0.17539279162883759, -1.7739858627319336, -0.1035904511809349, 0.024163661524653435, 1.2499287128448486, 0.6960067749023438, 0.4165094792842865, -0.522911012172699, 0.43696433305740356])),
 (15,
  array('d', [1.1873477697372437, 0.60807865858078, -0.35449495911598206, 0.05

In [28]:
# Product (movie) factor matrix: one (product id, feature array) entry per movie.
items_feature_matrix = alsModel.productFeatures()
# Show ten entries.
items_feature_matrix.take(10)

[(3,
  array('d', [0.639057457447052, -0.7139704823493958, -1.2860432863235474, 0.7057095170021057, 0.5718122720718384, 0.8859282732009888, 0.48903515934944153, 0.8280932903289795, -0.17675898969173431, -1.5686227083206177])),
 (6,
  array('d', [-0.8623830080032349, -0.34811490774154663, -0.9198046922683716, -0.5609152913093567, 3.1571204662323, -0.27579477429389954, -0.28691333532333374, 1.3063271045684814, 0.3578945994377136, -0.38141077756881714])),
 (9,
  array('d', [-0.11969849467277527, -0.5340414643287659, -1.6247047185897827, 0.7910805344581604, 1.5078996419906616, 1.4247907400131226, -0.5407723784446716, 0.12178927659988403, -0.30500853061676025, -0.35088956356048584])),
 (12,
  array('d', [0.4556500315666199, -0.28397276997566223, -1.9089922904968262, 1.0745095014572144, 1.4100193977355957, 1.2386577129364014, 0.12190335988998413, -0.14721651375293732, 0.1453382968902588, 0.05340651422739029])),
 (15,
  array('d', [0.25318643450737, -0.009844603948295116, -1.1379681825637817,

### 预测电影的评分

In [29]:
"""
    Example: predict the rating that user 196 would give to movie 242
"""
print()




In [30]:
# Predicted rating of user 196 for product (movie) 242.
predictRating = alsModel.predict(196,242)

In [31]:
# Display the predicted rating.
predictRating

4.522511606950817

### 推荐用户10个电影

In [32]:
# Ask the model for 10 product (movie) recommendations for user 196.
alsModel.recommendProducts(196,10)

[Rating(user=196, product=1185, rating=9.417061526508956),
 Rating(user=196, product=1311, rating=9.016676700102195),
 Rating(user=196, product=816, rating=8.83813082631952),
 Rating(user=196, product=543, rating=7.974469014315799),
 Rating(user=196, product=1280, rating=7.761011476194967),
 Rating(user=196, product=838, rating=7.656773616855159),
 Rating(user=196, product=1540, rating=7.354337688399311),
 Rating(user=196, product=1062, rating=7.123444979002269),
 Rating(user=196, product=451, rating=7.107345796576136),
 Rating(user=196, product=632, rating=6.980526582874688)]

### 为电影推荐10个用户


In [33]:
# Ask the model for 10 users to whom product (movie) 242 should be recommended.
rmdUsers = alsModel.recommendUsers(242,10)

In [34]:
# Display the recommended users.
rmdUsers

[Rating(user=316, product=242, rating=6.047563993365451),
 Rating(user=278, product=242, rating=5.987279922615861),
 Rating(user=917, product=242, rating=5.80848228533929),
 Rating(user=861, product=242, rating=5.733157484681389),
 Rating(user=928, product=242, rating=5.717461606644183),
 Rating(user=79, product=242, rating=5.707868825712748),
 Rating(user=310, product=242, rating=5.65363877918395),
 Rating(user=174, product=242, rating=5.638155996221412),
 Rating(user=879, product=242, rating=5.6082500619073015),
 Rating(user=859, product=242, rating=5.607247615789483)]

### 找到最佳模型

In [23]:
"""
    How do we find the best model?
    - a. Model evaluation
         compute the RMSE
    - b. Model optimization
         - i.  data
         - ii. hyperparameters: search for the best model
         cross-validation
         k-folds
"""

print()




### ALS 模型评估（使用与回归算法类似的评估指标）

In [35]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import DenseVector


In [37]:
# Predict a rating for every (user, product) pair that occurs in the test set.
predict_rdd = alsModel.predictAll(
    test_rating.map(lambda rating: (rating.user, rating.product))
)

In [38]:
# Inspect a few predictions; each one is a Rating carrying the predicted score.
predict_rdd.take(5)

[Rating(user=138, product=147, rating=3.2054993354563885),
 Rating(user=303, product=147, rating=3.115569319724756),
 Rating(user=291, product=147, rating=3.893184151324582),
 Rating(user=543, product=147, rating=3.5730742535959985),
 Rating(user=621, product=147, rating=4.156794691637347)]

In [40]:
# Key both the actual and the predicted ratings by (user, product) and join them,
# giving ((user, product), (actual rating, predicted rating)) records.
predict_actual = (
    test_rating
    .map(lambda rating: ((rating.user, rating.product), rating.rating))
    .join(
        predict_rdd.map(lambda rating: ((rating.user, rating.product), rating.rating))
    )
)
                                                        
        

In [41]:
# Inspect a few joined records: ((user, product), (actual rating, predicted rating)).
predict_actual.take(5)

[((196, 242), (3.0, 4.522511606950817)),
 ((157, 274), (4.0, 3.053131513650943)),
 ((160, 174), (5.0, 4.080812895491672)),
 ((298, 317), (4.0, 3.519803787242259)),
 ((293, 471), (3.0, 2.625111266705221))]

In [42]:
# RegressionMetrics takes (prediction, observation) pairs, whereas predict_actual
# holds ((user, product), (actual, predicted)). Swap the pair so the two values
# arrive in the documented order. MSE/RMSE do not depend on the order, but other
# metrics on the same object do.
metrics = RegressionMetrics(
    predict_actual.map(lambda keyed: (keyed[1][1], keyed[1][0]))
)
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)


MSE = 1.225593707817282
RMSE = 1.1070653584216616


## ALS超参数调整

In [44]:
def alsModelEvaluate(model,testing_rdd):
    """Evaluate a trained recommendation model on a test set and return its RMSE.

    Args:
        model: object with a ``predictAll(pairs)`` method that takes an RDD of
            (user, product) pairs and returns records indexable as
            (user, product, predicted rating).
        testing_rdd: RDD of records indexable as (user, product, rating).

    Returns:
        The root mean squared error between predicted and actual ratings.
        The value is also printed as ``RMSE = <value>``.

    Only (user, product) pairs present on both sides of the join contribute
    to the metric.
    """
    # Predicted ratings for every (user, product) pair in the test set, keyed by that pair.
    predicted = model.predictAll(
        testing_rdd.map(lambda r: (r[0], r[1]))
    ).map(lambda r: ((r[0], r[1]), r[2]))
    # Actual ratings, keyed the same way.
    observed = testing_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    # RegressionMetrics takes (prediction, observation) pairs, so the predictions
    # go on the left-hand side of the join.
    prediction_observation = predicted.join(observed).map(lambda kv: kv[1])
    metrics = RegressionMetrics(prediction_observation)
    rmse = metrics.rootMeanSquaredError
    print("RMSE = %s" % rmse)
    # Return the root mean squared error (not the plain MSE).
    return rmse

In [45]:
def train_model_evaluate(training_rdd,testing_rdd,rank,iterations,lambda_,seed=None):
    """Train one ALS model with the given hyperparameters and score it.

    Args:
        training_rdd: ratings RDD passed to ``ALS.train``.
        testing_rdd: ratings RDD passed to ``alsModelEvaluate``.
        rank: value passed to ``ALS.train`` as ``rank``.
        iterations: value passed to ``ALS.train`` as ``iterations``.
        lambda_: value passed to ``ALS.train`` as ``lambda_``.
        seed: optional random seed forwarded to ``ALS.train``. The default
            ``None`` matches calling ``ALS.train`` without a seed; pass an
            int to make a hyperparameter search repeatable.

    Returns:
        Tuple ``(model, rmse_value, rank, iterations, lambda_)``.
    """
    # Keyword arguments make it explicit which hyperparameter goes where.
    model = ALS.train(
        training_rdd,
        rank,
        iterations=iterations,
        lambda_=lambda_,
        seed=seed,
    )
    rmse_value = alsModelEvaluate(model, testing_rdd)
    return (model, rmse_value, rank, iterations, lambda_)

In [47]:
# Grid search: train and evaluate one model per (rank, iterations, lambda) combination.
# Rank varies slowest and lambda fastest.
mertic_list = [
    train_model_evaluate(training_rating, test_rating, rank_value, iteration_count, reg_param)
    for rank_value in (10, 20)
    for iteration_count in (10, 20)
    for reg_param in (0.001, 0.01, 0.1)
]

RMSE = 1.2895417477562912
RMSE = 1.0882235322451779
RMSE = 0.9221961995824908
RMSE = 1.3255015463862032
RMSE = 1.1029445026972444
RMSE = 0.9234782229535952
RMSE = 1.4889757493960436
RMSE = 1.1841195192060103
RMSE = 0.9253320419403732
RMSE = 1.5808020761640527
RMSE = 1.205907168633097
RMSE = 0.9226740745335592


In [57]:
# Order the results by RMSE (tuple position 1), smallest error first.
sortedlist = sorted(mertic_list, key=lambda entry: entry[1])

In [63]:
# Unpack the entry with the lowest RMSE: (model, rmse, rank, iterations, lambda_).
model,rmse_value,rank,iterations,lambda_ = sortedlist[0]
# Display the winning entry.
sortedlist[0]

(<pyspark.mllib.recommendation.MatrixFactorizationModel at 0x7fd8407b8518>,
 0.9221961995824908,
 10,
 10,
 0.1)

### 模型保存

In [61]:
# Keep the model unpacked from the best grid-search entry and save it under 'als-model'.
# NOTE(review): re-running this cell while 'als-model' already exists presumably fails,
# since Spark savers normally refuse to overwrite — confirm, and clear the path if so.
best_model = model 
best_model.save(sc,'als-model')

### 模型加载

In [62]:
# Load the saved model back from the 'als-model' path.
load_model = MatrixFactorizationModel.load(sc,'als-model')