### Check Spark Environment

In [1]:
spark

In [2]:
sqlContext

<pyspark.sql.context.SQLContext at 0x7fb68ac71898>

In [3]:
if 'sc' not in locals():
    from pyspark.context import SparkContext
    from pyspark.sql.context import SQLContext
    from pyspark.sql.session import SparkSession
    
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    spark = SparkSession(sc)

### Load Data

In [51]:
movies = spark.read.load('./data/ml-small/movies.csv',format='csv',header = True)
ratings = spark.read.load('./data/ml-small/ratings.csv',format='csv',header = True,inferSchema=True)

In [5]:
movies.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
ratings = ratings.select('userId','movieId','rating')
ratings.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows



### Feature Extraction

In [8]:
ratings = ratings.withColumn('userId',ratings['userId'].cast('int'))
ratings = ratings.withColumn('movieId',ratings['movieId'].cast('int'))
ratings = ratings.withColumn('rating',ratings['rating'].cast('float'))
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



### Train Model

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [10]:
pred = model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')
print('RMSE:',evaluator.evaluate(pred))

RMSE: 1.078033413416398


In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

(train,test) = ratings.randomSplit([0.8,0.2])

als2 = ALS(userCol='userId',itemCol='movieId', ratingCol='rating',coldStartStrategy='drop', nonnegative=True)
param_grid = ParamGridBuilder()\
            .addGrid(als.rank,[4,8,12])\
            .addGrid(als.regParam,[0.1, 0.15, 0.2])\
            .build()

evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')

tvs = TrainValidationSplit(estimator=als2,estimatorParamMaps=param_grid,evaluator=evaluator)

In [16]:
model = tvs.fit(train)

In [17]:
best_model = model.bestModel
pred = best_model.transform(test)
print('RMSE:',evaluator.evaluate(pred))

RMSE: 0.8801732351850021


In [42]:
recs = best_model.recommendForAllUsers(10)

In [74]:
import pandas as pd

def display_recommendation(recs,userId):
    user_recs = recs.filter(recs.userId == userId)
    user_recs = user_recs.select("recommendations.movieId",'recommendations.rating')
    movies_ = user_recs.select("movieId").toPandas().iloc[0,0]
    ratings_ = user_recs.select("rating").toPandas().iloc[0,0]
    rating_mat = pd.DataFrame(movies_,columns=['movieId'])    
    for i in range(0, len(movies_)): 
        movies_[i] = str(movies_[i])
    disp_df = movies.filter(movies.movieId.isin(movies_)).toPandas()
    disp_df['ratings'] = ratings_
    return disp_df
#     rating_mat_ps = sqlContext.createDataFrame(rating_mat)
#     return ratings_mat_ps  

In [77]:
display_recommendation(recs,165).head(10)

Unnamed: 0,movieId,title,genres,ratings
0,3379,On the Beach (1959),Drama,4.825528
1,26528,Anne of Green Gables (1985),Children|Drama,4.725781
2,27156,Neon Genesis Evangelion: The End of Evangelion...,Action|Animation|Drama|Fantasy|Sci-Fi,4.601559
3,27611,Battlestar Galactica (2003),Drama|Sci-Fi|War,4.541858
4,33649,Saving Face (2004),Comedy|Drama|Romance,4.505105
5,69524,Raiders of the Lost Ark: The Adaptation (1989),Action|Adventure|Thriller,4.498843
6,74282,Anne of Green Gables: The Sequel (a.k.a. Anne ...,Children|Drama|Romance,4.473551
7,93988,North & South (2004),Drama|Romance,4.433724
8,171495,Cosmos,(no genres listed),4.429231
9,183897,Isle of Dogs (2018),Animation|Comedy,4.427635
