In [40]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import helper as h
import pymongo

from pyspark import SparkContext

from pyspark.sql import Row
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [2]:
top_mv = 10000
client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['movie_dataset']
top_movies = db['top_movies'].find({'$limit': top_mv})
top_movies = [ m for m in top_movies]

In [4]:
sc = SparkContext("local", "spark session ratings")
spark = (SparkSession.builder
            .master("local")
            .appName("spark session ratings")
            .enableHiveSupport()
            .getOrCreate()
        )

In [10]:
lines = (sc.textFile("ratings.csv")
         .filter(lambda s: not s.startswith("userId")) # to ignore header
        )
parts = lines.map(lambda row: row.split(","))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), 
                                     movieId=int(p[1]),
                                     rating=float(p[2])/5, # => [0,1]
                                     timestamp=int(p[3])
                                    )
                      )#.filter(lambda r: r.movieId in top_movies)
ratings = spark.createDataFrame(ratingsRDD)

In [11]:
(training, test) = ratings.randomSplit([0.8, 0.2], 42)

In [12]:
TRAIN = not False

In [13]:
if TRAIN:
    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    model.write().overwrite().save("als.model")
    #https://stackoverflow.com/a/53931249

    #SPARK CORROMPE IL CSV -> se hadoop non sta girando. ha un po' di senso ma non troppo
else:
    model = ALSModel.load("als.model")

In [14]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test).na.fill(0)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.16459797123073772


### how to export/use the model?
    model.itemFactors.collect()
    model.userFactors.collect()

In [43]:
user_id = 249376
predictions.filter(predictions.userId==user_id).sort(F.col("prediction")).first()

Row(movieId=148, rating=0.7, timestamp=1247893752, userId=249376, prediction=0.5018646121025085)