In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from collections import defaultdict


In [2]:

pd_ratings= pd.read_csv('../data/movies/ratings.csv')
pd_ratings=pd_ratings.drop('timestamp', axis=1)


In [3]:
spark = SparkSession.builder.getOrCreate()
spark_ratings= spark.createDataFrame(pd_ratings) 
train, test = spark_ratings.randomSplit([0.8, 0.2], seed=42)

In [69]:
maxIter_list = np.linspace(10,100, num=10)
maxIter_list

array([ 10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])

In [70]:
from collections import defaultdict
rmse_dict= defaultdict(lambda: 'No_value')

rank_list = np.linspace(1,20, num=20)
maxIter_list = np.linspace(10,100, num=10)
regParam_list=np.linspace(.05,1, num=20)
numblock_list= np.linspace(5, 50, num=10)
seed = 42
cold_list='drop'
checkpoint_list=[10,20,50,100,1000, 10000]

ALS(self, rank=10, maxIter=10,
    regParam=0.1, numUserBlocks=10, 
    numItemBlocks=10, 
    implicitPrefs=false, 
    alpha=1.0,
    userCol="user", 
    itemCol="item",
    seed=None, ratingCol="rating",
    nonnegative=false,
    checkpointInterval=10, 
    intermediateStorageLevel="MEMORY_AND_DISK",
    finalStorageLevel="MEMORY_AND_DISK", 
    coldStartStrategy="nan")

In [None]:
for rank in rank_list:
    for maxIter in maxIter_list:
        for regParam in regParam_list:
            for numblock in numblock_list:
                for checkpoint in checkpoint_list:
                    factor_model = ALS(itemCol='movieId',userCol='userId',ratingCol='rating',nonnegative=True,coldStartStrategy='drop',
                                       rank=rank, maxIter=maxIter, numUserBlocks=numblock, numItemBlocks=numblock,
                                       checkpointInterval=checkpoint,regParam= regParam)
                    ratings=factor_model.fit(train)
                    predict=ratings.transform(test)
                    rmse = evaluator.evaluate(predict)
                    print(rmse)
                    rmse_dict[rmse]=f'rank={rank},maxIter={maxIter},RegParam={regParam}, rank={rank},numblocks={numblock},interval={checkpoint}'

0.902558768118035
0.902558768118035
0.902558768118035
0.9025587681180351
0.9025587681180351
0.902558768118035
0.9034250492781775
0.9034250492781775
0.9034250492781774
0.9034250492781774
0.9034250492781775
0.9034250492781774
0.9030386539953867
0.9030386539953867
0.9030386539953867
0.9030386539953867
0.9030386539953869
0.9030386539953867
0.9041627433466949
0.9041627433466949
0.904162743346695
0.9041627433466949
0.904162743346695
0.9041627433466949
0.9039977163254037
0.9039977163254037
0.9039977163254037
0.9039977163254037
0.9039977163254037
0.9039977163254037
0.9031992186810075
0.9031992186810075
0.9031992186810075
0.9031992186810075
0.9031992186810075
0.9031992186810075
0.9038830387170398
0.9038830387170398
0.9038830387170398
0.9038830387170398
0.9038830387170399
0.9038830387170398
0.9038492262707636
0.9038492262707636
0.9038492262707636
0.9038492262707636
0.9038492262707635
0.9038492262707636
0.9034117625818329
0.903411762581833
0.9034117625818329
0.9034117625818329
0.903411762581833
0

In [56]:
factor_model = ALS(itemCol='movieId',userCol='userId',ratingCol='rating',nonnegative=True,
                                       rank=20, maxIter=11, numUserBlocks=15, numItemBlocks=15,
                                       checkpointInterval=20,regParam= .2)
ratings=factor_model.fit(train)
predict=ratings.transform(test)
rmse = evaluator.evaluate(predict)
rmse_dict[rmse]=ratings.extractParamMap()

In [53]:
ratings=factor_model.fit(train)
predict=ratings.transform(test)

In [54]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predict)
print(rmse)
print(ratings.extractParamMap())

0.9030821184249294
{Param(parent='ALS_e30d4a146a06', name='coldStartStrategy', doc='strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: nan,drop.'): 'drop', Param(parent='ALS_e30d4a146a06', name='itemCol', doc='column name for item ids. Ids must be within the integer value range.'): 'movieId', Param(parent='ALS_e30d4a146a06', name='predictionCol', doc='prediction column name'): 'prediction', Param(parent='ALS_e30d4a146a06', name='userCol', doc='column name for user ids. Ids must be within the integer value range.'): 'userId'}


In [None]:
{Param(parent='ALS_e30d4a146a06', name='coldStartStrategy', 
       doc='strategy for dealing with unknown or new users/items at prediction time. 
       This may be useful in cross-validation or production scenarios, for handling user/item ids 
       the model has not seen in the training data. Supported values: nan,drop.'): 'drop', 
       Param(parent='ALS_e30d4a146a06', name='itemCol', doc='column name for item ids. 
             Ids must be within the integer value range.'): 'movieId',
             Param(parent='ALS_e30d4a146a06', name='predictionCol', doc='prediction column name'):
             'prediction', 
             Param(parent='ALS_e30d4a146a06', name='userCol', 
                   doc='column name for user ids. Ids must be within the integer value range.'): 
 'userId'}