# PySpark MovieLens Recommendation by ALS

In [1]:
import urllib
import zipfile

import pandas as pd
import pyspark.sql.functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql import DataFrame as SDF
from pyspark.sql import Row, SparkSession

## Utility関数定義

In [2]:
def download(url: str, dist: str) -> None:
    b = urllib.request.urlopen(url).read()
    with open(dist, 'wb') as f:
        f.write(b)


def extract(src: str, dist: str) -> None:
    with zipfile.ZipFile(src) as z:
        z.extractall(dist)

In [3]:
def parse_ratings(path: str) -> SDF:
    lines = spark.read.text(path).rdd
    parts = lines.map(lambda row: row.value.split("::"))
    rdd = (
        parts
        .map(
            lambda p: Row(
                userId=int(p[0]),
                movieId=int(p[1]),
                rating=float(p[2]),
                timestamp=int(p[3])))
    )
    return spark.createDataFrame(rdd)


def parse_movies(path: str) -> SDF:
    lines = spark.read.text(path).rdd
    parts = lines.map(lambda row: row.value.split("::"))
    rdd = (
        parts
        .map(
            lambda p: Row(
                movieId=int(p[0]),
                title=str(p[1]),
                genre=str(p[2])))
    )
    return spark.createDataFrame(rdd)


def pd_parse_ratings(path: str) -> SDF:
    return spark.createDataFrame(
        pd.read_csv(
            path,
            engine='python',
            sep='::',
            header=None,
            names=['userId', 'movieId', 'rating', 'timestamp']))


def pd_parse_movies(path: str) -> SDF:
    return spark.createDataFrame(
        pd.read_csv(
            path,
            engine='python',
            sep='::',
            header=None,
            names=['movieId', 'title', 'genre']))

In [4]:
spark = (
    SparkSession
    .builder
    .master('k8s://https://kubernetes.default.svc.cluster.local:443')
    .appName('spark_on_k8s')
    .config('spark.kubernetes.container.image', 'kanchishimono/pyspark-worker:latest')
    .config('spark.kubernetes.pyspark.pythonVersion', 3)
    .config('spark.executor.instances', 2)
    .config('spark.kubernetes.namespace', 'notebook')
    .config('spark.port.maxRetries', 3)
    .config('spark.history.ui.port', True)
    .config('spark.ui.enabled', True)
    .config('spark.ui.port', 4040)
    .config('spark.driver.host', 'ofdg4da.notebook.svc.cluster.local')
    .config('spark.driver.port', 29413)
    .config('spark.executor.memory', '3G')
    .config('spark.executor.cores', 1)
    .config('spark.default.parallelism', 10)
    .config('spark.sql.shuffle.partitions', 10)
    .config('spark.eventLog.compress', True)
    .config('spark.eventLog.enabled', True)
    .config('spark.eventLog.dir', 'file:///tmp/spark-events')
    .getOrCreate()
)


In [5]:
spark

## MovieLensデータセット
### ダウンロード

In [6]:
URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
download(URL, '/home/work/ml-1m.zip')
extract('/home/work/ml-1m.zip', '/home/work/')

### 読み込み

In [7]:
ratings_df = pd_parse_ratings('/home/work/ml-1m/ratings.dat').repartition(10)
movies_df = pd_parse_movies('/home/work/ml-1m/movies.dat').repartition(10)

In [8]:
ratings_df.toPandas()

Unnamed: 0,userId,movieId,rating,timestamp
0,366,1952,4,976310072
1,369,3614,2,976310418
2,274,1288,5,976594612
3,293,46,3,979676160
4,589,2918,4,975919291
...,...,...,...,...
1000204,5734,595,4,980888801
1000205,5505,3578,5,959732710
1000206,5496,477,4,959809817
1000207,5576,2088,4,959291916


In [9]:
movies_df.toPandas()

Unnamed: 0,movieId,title,genre
0,385,"Man of No Importance, A (1994)",Drama
1,67,Two Bits (1995),Drama
2,225,Disclosure (1994),Drama|Thriller
3,328,Tales From the Crypt Presents: Demon Knight (1...,Horror
4,113,Before and After (1996),Drama|Mystery
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3775,Make Mine Music (1946),Animation|Children's|Musical
3880,3693,"Toxic Avenger, The (1985)",Comedy|Horror
3881,3850,Whatever Happened to Aunt Alice? (1969),Crime|Thriller


In [10]:
train_df, test_df = ratings_df.randomSplit([0.6, 0.4], seed=12345)
train_df.persist()
movies_df.persist()

DataFrame[movieId: bigint, title: string, genre: string]

## レコメンドモデル (ALS) 定義
### パラメーター

|パラメータ名|説明|
|:--|:--|
|userCol|ユーザーIDが記録されているカラム名|
|itemCol|アイテムIDが記録されているカラム名|
|ratingCol|ユーザーのアイテムに対する評価値が記録されているカラム名。レビュー値のように明示的なものと、アクセス回数など暗黙的なものどちらかを使用する。|
|coldStartStrategy|訓練データに含まれない未知のユーザーやアイテムの取り扱い方法。'nan'は未知のIDに対する推論値をnanで返す。'drop'は未知のIDが含まれる行を落とす。|
|numUserBlocks|ユーザーの潜在因子行列のDataFrameパーティション数|
|numItemBlocks|アイテムの潜在因子行列のDataFrameパーティション数|
|implicitPrefs|ratingColに暗黙的な値を使用するか。Trueの場合、暗黙的評価値用の計算式が内部でしようされる。|
|nonnegative|ratingColに含まれる値が非負値か。|
|maxIter|収束計算の繰り返し回数|
|rank|ユーザー、アイテムの潜在因子行列の次元数|
|alpha|implicitPrefs=Trueの時のみ有効。暗黙的評価値の信頼度の高さを示す値。|
|regParam|正則化パラメータ|


In [11]:
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    numUserBlocks=2,
    numItemBlocks=2,
    implicitPrefs=False,
    nonnegative=True)

## レコメンドモデル計算
### グリッドサーチパラメータ定義

In [12]:
param_grid = (
    ParamGridBuilder()
    .addGrid(als.maxIter, [5, 10, 15])
    .addGrid(als.rank, [10, 15, 20])
    # alpha enabled only when implicitPrefs is True
    # .addGrid(als.alpha, [1.0, 10.0, 100.0])
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.5, 1.0])
    .build()
)

### 評価指標選択 (RMSE)

In [13]:
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction')

### グリッドサーチ実行

In [14]:
tsv = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    collectSubModels=True)

In [15]:
models = tsv.fit(train_df)

In [16]:
best_model = models.bestModel

## モデル評価

In [17]:
prediction = best_model.transform(test_df)
rmse = evaluator.evaluate(prediction)
print('RMSE: {:.3f}'.format(rmse))

RMSE: 0.872


In [18]:
metric_param_sets = [(m, p) for m, p in zip(models.validationMetrics, param_grid)]

## レコメンド結果確認
### 確認対象ユーザー選択

In [19]:
users = (
    ratings_df
    .select('userId')
    .distinct()
    .filter(F.col('userId') == 55)
)

In [20]:
users.toPandas()

Unnamed: 0,userId
0,55


### 確認対象ユーザー評価履歴確認

In [21]:
(
    ratings_df
    .join(users, ['userId'], 'inner')
    .join(movies_df, ['movieId'], 'inner')
    .orderBy('userId', F.desc('rating'))
    .toPandas()
)

Unnamed: 0,movieId,userId,rating,timestamp,title,genre
0,110,55,5,977943155,Braveheart (1995),Action|Drama|War
1,318,55,5,977942882,"Shawshank Redemption, The (1994)",Drama
2,356,55,5,977948435,Forrest Gump (1994),Comedy|Romance|War
3,589,55,5,977948346,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
4,3114,55,5,977943112,Toy Story 2 (1999),Animation|Children's|Comedy
5,2761,55,5,977948346,"Iron Giant, The (1999)",Animation|Children's
6,527,55,5,977942911,Schindler's List (1993),Drama|War
7,1704,55,5,977943112,Good Will Hunting (1997),Drama
8,457,55,5,977948394,"Fugitive, The (1993)",Action|Thriller
9,2762,55,5,977943181,"Sixth Sense, The (1999)",Thriller


### レコメンド結果確認

In [23]:
(
    best_model
    .recommendForUserSubset(users, 10)
    .withColumn('temp', F.explode('recommendations'))
    .select(
        'userId',
        F.col('temp').getItem('movieId').alias('movieId'),
        F.col('temp').getItem('rating').alias('rating')
    )
    .join(movies_df, ['movieId'], 'inner')
    .orderBy(F.desc('rating'))
    .show(truncate=False)
)

+-------+------+---------+-----------------------------------------+-------------------------------+
|movieId|userId|rating   |title                                    |genre                          |
+-------+------+---------+-----------------------------------------+-------------------------------+
|572    |55    |5.606524 |Foreign Student (1994)                   |Drama                          |
|260    |55    |4.8981614|Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Fantasy|Sci-Fi|
|1198   |55    |4.8812084|Raiders of the Lost Ark (1981)           |Action|Adventure               |
|110    |55    |4.863939 |Braveheart (1995)                        |Action|Drama|War               |
|2834   |55    |4.8356743|Very Thought of You, The (1998)          |Comedy|Romance                 |
|2762   |55    |4.809533 |Sixth Sense, The (1999)                  |Thriller                       |
|318    |55    |4.7950053|Shawshank Redemption, The (1994)         |Drama                  

In [24]:
spark.stop()