In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [4]:
sc = SparkContext(master="local", appName="New Spark Context")
spark = SparkSession(sc)

In [6]:
df = spark.read.csv("movielens_ratings.csv", header=True, inferSchema=True)
df.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
+-------+------+------+
only showing top 5 rows



In [8]:
users = df.select('userId').distinct().count()
movies = df.select('movieId').distinct().count()
count = df.count()
print(count, users, movies)

1501 30 100


In [9]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

als = ALS(maxIter=20,           # Number of iterations
          regParam=0.1,        # Regularization parameter beta
          rank=10,              # Number of features
          numItemBlocks=10,     # Number partitioned to parallelize computation
          alpha=0.01,            # Learning rate
          userCol='userId',     
          itemCol='movieId',
          ratingCol='rating')
model = als.fit(train_df)

In [32]:
import time
tic = time.time()
predictions = model.transform(test_df)
predictions.show(5)
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('RMSE: {:.4f}'.format(rmse))

evaluator = RegressionEvaluator(metricName='r2',
                                labelCol='rating',
                                predictionCol='prediction')
r2 = evaluator.evaluate(predictions)
print('R2: {:.4f}'.format(r2))
toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    26|0.38315165|
|     31|   1.0|     5| 1.2039762|
|     31|   1.0|     4| 1.8550174|
|     31|   2.0|    25|  2.952228|
|     31|   1.0|    18|0.33214805|
+-------+------+------+----------+
only showing top 5 rows

RMSE: 0.9701
R2: 0.2720
Total time: 9.96 seconds


In [26]:
import time
tic = time.time()

predictions = model.transform(test_df)
predictions.show(5)

from pyspark.mllib.evaluation import RegressionMetrics
prediction_and_target = predictions.select(['prediction', 'rating'])
metrics = RegressionMetrics(prediction_and_target.rdd)

print('RMSE: {:.4f}'.format(metrics.rootMeanSquaredError))
print('R2: {:.4f}'.format(metrics.r2))

toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26|  0.1638145|
|     31|   1.0|     5|-0.40658382|
|     31|   1.0|     4|  1.4658239|
|     31|   2.0|    25|  1.1692389|
|     31|   1.0|    18|  2.1397219|
+-------+------+------+-----------+
only showing top 5 rows

RMSE: 1.3998
R2: -0.5159
Total time: 16.04 seconds


In [35]:
user_recom = model.recommendForAllUsers(10)
user_recom.show(5, False)

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                         |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[[92, 4.241976], [81, 4.0281463], [89, 3.712594], [2, 3.5089815], [49, 3.420334], [40, 3.3806953], [4, 2.9267924], [82, 2.9190497], [23, 2.809626], [42, 2.7146363]]    |
|26    |[[22, 4.781753], [94, 4.6731215], [30, 4.568377], [23, 4.420328], [88, 4.313788], [32, 4.259039], [51, 4.242293], [7, 4.136209], [68, 3.8850794], [54, 3.694302]]       |
|27    |[[18, 3.1866446], [8, 2.8229787], [27, 2.6729748], [39, 2.664792], [34, 2.6246254], [83, 2.6103039], [

In [37]:
userId = 27
test_df.filter(test_df['userId'] == userId).sort('rating', ascending = False).show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|     80|   3.0|    27|
|     51|   3.0|    27|
|      0|   1.0|    27|
|     20|   1.0|    27|
|     92|   1.0|    27|
|     28|   1.0|    27|
+-------+------+------+



In [42]:
user_recom.filter(user_recom['userId'] == userId).show(truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                       |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|27    |[[18, 3.1866446], [8, 2.8229787], [27, 2.6729748], [39, 2.664792], [34, 2.6246254], [83, 2.6103039], [19, 2.572713], [46, 2.5691895], [32, 2.529967], [66, 2.4631078]]|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [43]:
df = spark.createDataFrame([
                            (0, [1, 2, 5]),
                            (1, [1, 2, 3, 5]),
                            (2, [1, 2])
                            ], ['id', 'items'])
df.show()

+---+------------+
| id|       items|
+---+------------+
|  0|   [1, 2, 5]|
|  1|[1, 2, 3, 5]|
|  2|      [1, 2]|
+---+------------+



In [45]:
from pyspark.ml.fpm import FPGrowth

fpgrowth = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
model = fpgrowth.fit(df)
predictions = model.transform(df)
predictions.show()

+---+------------+----------+
| id|       items|prediction|
+---+------------+----------+
|  0|   [1, 2, 5]|        []|
|  1|[1, 2, 3, 5]|        []|
|  2|      [1, 2]|       [5]|
+---+------------+----------+



In [46]:
model.freqItemsets.show()

+---------+----+
|    items|freq|
+---------+----+
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
|      [2]|   3|
|   [2, 1]|   3|
|      [1]|   3|
+---------+----+

