# Pre-computing

## Necessary Imports

In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
! ls -la
! ls -la ./spark-warehouse

total 3289952
drwxrwxr-x 8 big big       4096 mai 24 01:44  .
drwxrwxr-x 5 big big       4096 mai 24 01:43  ..
drwxr-xr-x 5 big big       4096 mai 18 19:56  ALSmodel
-rwxrw-rw- 1 big big 3366981927 mai 16 21:13  amazon_item_ratings.csv
drwxr-xr-x 2 big big       4096 mai 23 19:51  data.parquet
-rwxrw-rw- 1 big big      43496 mai 23 19:55 'Data Preparation.ipynb'
-rw-rw-r-- 1 big big    1799324 mai 24 01:44  derby.log
drwxrwxr-x 2 big big       4096 mai 23 19:45  .ipynb_checkpoints
drwxr-xr-x 2 big big      12288 mai 23 00:39  items.parquet
drwxrwxr-x 5 big big       4096 mai 24 01:44  metastore_db
-rw-rw-r-- 1 big big      17007 mai 24 01:44  Pre-Computing.ipynb
-rw-rw-r-- 1 big big       9347 mai 23 04:52  Results.ipynb
drwxr-xr-x 6 big big       4096 mai 23 00:39  spark-warehouse
total 120
drwxr-xr-x 6 big big  4096 mai 23 00:39 .
drwxrwxr-x 8 big big  4096 mai 24 01:44 ..
drwxr-xr-x 2 big big 12288 mai 23 00:39 datatable
drwxr-xr-x 2 big big 45056 mai 18 20:03 itemsrecommendationtab

In [4]:
print(spark.catalog.listDatabases())

[Database(name='default', description='Default Hive database', locationUri='file:/home/big/Desktop/Aulas/Projeto/spark-warehouse')]


## Load Table to Dataframe

In [5]:
df_data = spark.sql("SELECT * FROM datatable")

Another way to load

In [6]:
#df_data = spark.read.load("spark-warehouse/datatable")
#df_data.show()

## Split dataset

In [8]:
dftrain, dftest = df_data.randomSplit([0.8,0.2], 42)

dftrain.cache()

[dftrain.count(),dftest.count(),df_data.count()]

[66716, 16410, 83126]

In [9]:
dftrain.printSchema()

root
 |-- Reviewer: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- ReviewerID: double (nullable = true)
 |-- ItemID: double (nullable = true)



## Create or load an existing Model

In [25]:
def getALSModel():
    als = ALS(maxIter=5,
          regParam=0.01,
          userCol="ReviewerID",
          itemCol="ItemID",
          ratingCol="Rating",
          coldStartStrategy="drop")

    return als.fit(dftrain),als

In [26]:
try:
  # Load ALS Model if it exists
    modelpath = "ALSmodel"
    model = ALSModel.load(modelpath)
except:
    print("No model exists\nCreating Model...")
    model,als = getALSModel()
    print("Model created!")
    print("Saving Model...!")
    modelpath = "ALSmodel"
    model.write().overwrite().save(modelpath)
    print("Model Saved!")
else:
    print("Model Found and Loaded!")

No model exists
Creating Model...
Model created!
Saving Model...!
Model Saved!


In [19]:
dftrain.printSchema()

root
 |-- Reviewer: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- ReviewerID: double (nullable = true)
 |-- ItemID: double (nullable = true)



## Evaluate Predictions

Get predictions and evaluate them using RegressionEvaluator with a RMSE metric. The closer to 0 the better

In [27]:
predictions = model.transform(dftest)
evaluator = RegressionEvaluator(metricName="rmse",
                               labelCol="Rating",
                               predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square-error = " + str(rmse))

Root-mean-square-error = 4.756455341859426


In [28]:
! ls -la ./ALSmodel

total 20
drwxr-xr-x 5 big big 4096 mai 24 01:54 .
drwxrwxr-x 8 big big 4096 mai 24 01:54 ..
drwxr-xr-x 2 big big 4096 mai 24 01:54 itemFactors
drwxr-xr-x 2 big big 4096 mai 24 01:54 metadata
drwxr-xr-x 2 big big 4096 mai 24 01:54 userFactors


In [29]:
users = df_data.select(als.getUserCol()).distinct()
items = df_data.select(als.getItemCol()).distinct()

In [30]:
users.show()
items.show()
[users.count(),items.count()]

+----------+
|ReviewerID|
+----------+
|   13918.0|
|   13607.0|
|   50815.0|
|   10681.0|
|   55682.0|
|   76173.0|
|   30931.0|
|   11757.0|
|   39221.0|
|   59269.0|
|   45583.0|
|   22274.0|
|   30867.0|
|   67987.0|
|   55288.0|
|   67557.0|
|   36870.0|
|   16561.0|
|   31666.0|
|   36797.0|
+----------+
only showing top 20 rows

+-------+
| ItemID|
+-------+
|67866.0|
|67934.0|
|67987.0|
|68112.0|
|68215.0|
|68245.0|
|68353.0|
|68581.0|
|68656.0|
|68733.0|
|68790.0|
|69030.0|
|69071.0|
|69101.0|
|69133.0|
|69457.0|
|69860.0|
|69870.0|
|69959.0|
|69989.0|
+-------+
only showing top 20 rows



[81170, 73716]

## Compute top 5 recommendations for all Users and all Items

Distinct users and items selected above could be used to calculate recommendations for a subset since this is a long process

In [31]:
items_to_recommend = 5
users_recommendations = model.recommendForAllUsers(items_to_recommend)
#users_recommendations = model.recommendForUserSubset(itemSubset,items_to_recommend)

users_to_recommend = 5
items_recommendations = model.recommendForAllItems(users_to_recommend)
#items_recommendations = model.recommendForItemSubset(userSubset,users_to_recommend)


In [32]:
users_recommendations.show()
items_recommendations.show()

+----------+--------------------+
|ReviewerID|     recommendations|
+----------+--------------------+
|       148|[[325, 10.866751]...|
|       463|[[22, 8.955448], ...|
|       496|[[155, 8.835874],...|
|       833|[[38, 8.195966], ...|
|      1088|[[50, 8.263234], ...|
|      1238|[[34, 8.558122], ...|
|      1342|[[177, 7.800251],...|
|      1580|[[178, 8.667134],...|
|      1591|[[182, 8.4626045]...|
|      1645|[[167, 9.281074],...|
|      1829|[[678, 9.029999],...|
|      1959|[[170, 8.941368],...|
|      2142|[[119, 8.957814],...|
|      2366|[[234, 8.134719],...|
|      2659|[[921, 7.755391],...|
|      2866|[[95, 0.9999194],...|
|      3175|[[46, 8.425614], ...|
|      3794|[[177, 8.767233],...|
|      3997|[[178, 6.6124516]...|
|      4101|[[58, 8.519209], ...|
+----------+--------------------+
only showing top 20 rows

+------+--------------------+
|ItemID|     recommendations|
+------+--------------------+
|   148|[[65011, 12.75674...|
|   463|[[29283, 7.788496...|
|   471|

## Save User and Items recommendations as persistent tables

In [33]:
users_recommendations.write.mode("overwrite").saveAsTable("UserRecommendationTable")
items_recommendations.write.mode("overwrite").saveAsTable("ItemsRecommendationTable")