In [1]:
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('exam').getOrCreate()



In [3]:
path='/home/lplab/Desktop/ratings.csv'
df=spark.read.csv(path,header=True,inferSchema=True)
df.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      1|   4.0|1225734739|
|     1|    110|   4.0|1225865086|
|     1|    158|   4.0|1225733503|
|     1|    260|   4.5|1225735204|
|     1|    356|   5.0|1225735119|
+------+-------+------+----------+
only showing top 5 rows



### convert to date time and extract year

In [4]:
from pyspark.sql.functions import from_unixtime, year
df_datetime = df.withColumn("datetime", from_unixtime("timestamp"))
df_with_year = df_datetime.withColumn("year", year("datetime"))
df_with_year.show()

+------+-------+------+----------+-------------------+----+
|userId|movieId|rating| timestamp|           datetime|year|
+------+-------+------+----------+-------------------+----+
|     1|      1|   4.0|1225734739|2008-11-03 23:22:19|2008|
|     1|    110|   4.0|1225865086|2008-11-05 11:34:46|2008|
|     1|    158|   4.0|1225733503|2008-11-03 23:01:43|2008|
|     1|    260|   4.5|1225735204|2008-11-03 23:30:04|2008|
|     1|    356|   5.0|1225735119|2008-11-03 23:28:39|2008|
|     1|    381|   3.5|1225734105|2008-11-03 23:11:45|2008|
|     1|    596|   4.0|1225733524|2008-11-03 23:02:04|2008|
|     1|   1036|   5.0|1225735626|2008-11-03 23:37:06|2008|
|     1|   1049|   3.0|1225734079|2008-11-03 23:11:19|2008|
|     1|   1066|   4.0|1225736961|2008-11-03 23:59:21|2008|
|     1|   1196|   3.5|1225735441|2008-11-03 23:34:01|2008|
|     1|   1200|   3.5|1225735861|2008-11-03 23:41:01|2008|
|     1|   1210|   4.5|1225735210|2008-11-03 23:30:10|2008|
|     1|   1214|   4.0|1225736426|2008-1

In [5]:
df.cache()

DataFrame[userId: int, movieId: int, rating: double, timestamp: int]

In [6]:
data=df.select("userId","movieId","rating")
data.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|    110|   4.0|
|     1|    158|   4.0|
|     1|    260|   4.5|
|     1|    356|   5.0|
+------+-------+------+
only showing top 5 rows



### APPLYING ALS ALGORITHM

In [7]:
from pyspark.ml.recommendation import ALS
train,test=data.randomSplit([0.8,0.2])
als=ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating')

In [8]:
model=als.fit(train)

In [9]:
predictions=model.transform(test)
predictions.show(5)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 14532|   1379|   1.0| 3.0757012|
| 14532|   1422|   2.0| 3.0128603|
| 14532|   1552|   2.0| 3.3113978|
| 14532|   1619|   3.0| 3.3606613|
| 14532|   1653|   3.0| 3.7480521|
+------+-------+------+----------+
only showing top 5 rows



In [11]:
ratingList = predictions.select("rating")
predictionList = predictions.select("prediction")

In [14]:
ratingslist = ratingList.select("rating").rdd.flatMap(lambda x: x).collect()

In [16]:
predictionslist = predictionList.select("prediction").rdd.flatMap(lambda x: x).collect()

In [21]:
ratingslist

[2.0,
 1.0,
 3.0,
 3.0,
 3.0,
 2.0,
 3.0,
 1.0,
 1.0,
 3.0,
 2.0,
 4.0,
 2.0,
 1.0,
 1.0,
 3.0,
 4.0,
 4.0,
 4.0,
 3.0,
 1.0,
 1.0,
 3.0,
 1.0,
 1.0,
 1.0,
 1.0,
 4.0,
 4.0,
 3.0,
 2.0,
 4.0,
 3.0,
 4.0,
 1.0,
 5.0,
 3.0,
 3.0,
 3.0,
 2.0,
 4.0,
 1.0,
 3.0,
 2.0,
 3.0,
 1.0,
 3.0,
 4.0,
 1.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 2.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 1.0,
 3.0,
 1.0,
 2.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 2.0,
 3.0,
 4.0,
 3.0,
 3.0,
 4.0,
 3.0,
 1.0,
 4.0,
 2.0,
 4.0,
 2.0,
 1.0,
 3.0,
 4.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 2.0,
 2.0,
 5.0,
 3.5,
 5.0,
 5.0,
 4.5,
 3.5,
 4.0,
 5.0,
 5.0,
 3.5,
 2.0,
 4.0,
 4.0,
 3.5,
 4.0,
 4.0,
 4.0,
 4.0,
 5.0,
 5.0,
 3.5,
 5.0,
 3.0,
 5.0,
 4.5,
 3.0,
 4.0,
 3.0,
 4.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 4.0,
 3.0,
 4.0,
 2.5,
 5.0,
 4.0,
 4.0,
 4.0,
 4.5,
 3.5,
 4.0,
 3.5,
 4.0,
 4.5,
 5.0,
 3.0,
 4.5,
 4.0,
 4.5,
 3.5,
 2.0,
 4.0,
 4.5,
 1.0,
 3.5,
 3.5,
 3.5,
 2.0,
 1.0,
 3.0,
 3.0

In [23]:
predictionslist

[3.4911749362945557,
 2.50203800201416,
 3.591320753097534,
 3.892322063446045,
 3.75050950050354,
 3.6040470600128174,
 3.36264967918396,
 3.226267099380493,
 2.5177817344665527,
 3.4393904209136963,
 3.7773847579956055,
 3.1701362133026123,
 2.882908821105957,
 2.8734560012817383,
 3.292367458343506,
 2.522827386856079,
 2.766845226287842,
 3.2267556190490723,
 2.933884382247925,
 4.471673488616943,
 3.712298631668091,
 3.823511838912964,
 2.9237194061279297,
 2.9562432765960693,
 3.528773069381714,
 3.404426097869873,
 3.6355416774749756,
 1.2131178379058838,
 3.2749061584472656,
 3.4725489616394043,
 3.388288974761963,
 3.297170400619507,
 3.8988351821899414,
 4.123162746429443,
 3.701953172683716,
 4.055303573608398,
 1.9111037254333496,
 3.4597840309143066,
 3.245729446411133,
 3.473728895187378,
 4.165623664855957,
 4.223512649536133,
 4.294568061828613,
 3.465301752090454,
 3.448894739151001,
 4.192730903625488,
 2.9875094890594482,
 3.0465970039367676,
 3.8227059841156006,
 3.

In [25]:
predictionslist = [0 if math.isnan(value) else value for value in predictionslist]
ratingslist = [0 if math.isnan(value) else value for value in ratingslist]

In [26]:
import math
squared_errors = [(actual - predicted) ** 2 for actual, predicted in zip(ratingslist, predictionslist)]
mse = sum(squared_errors) / len(ratingslist)
rmse = math.sqrt(mse)
rmse

1.2621341934667303