# Pre-computing

## Necessary Imports

In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [3]:
# Memory setting passed to both spark.executor.memory and spark.driver.memory.
MAX_MEMORY = "8g"

# Value used for spark.sql.warehouse.dir (relative to the working directory).
warehouse_location = 'spark-warehouse'

# Build (or reuse) the Hive-enabled session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Foo")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.driver.memory", MAX_MEMORY)
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# List the working directory, then the Spark warehouse directory inside it.
! ls -la
! ls -la ./spark-warehouse

total 679332
drwxrwxr-x 7 big big      4096 mai 18 19:16  .
drwxrwxr-x 3 big big      4096 mai 18 15:59  ..
drwxr-xr-x 5 big big      4096 mai 18 17:31  ALSmodel
-rwxrw-rw- 1 big big 689932433 mai 18 15:22  amazon_item_ratings.csv
-rw-rw-r-- 1 big big     35683 mai 18 18:55 'Data Preparation.ipynb'
-rw-rw-r-- 1 big big   5595908 mai 18 19:16  derby.log
drwxrwxr-x 2 big big      4096 mai 18 17:51  .ipynb_checkpoints
drwxr-xr-x 2 big big      4096 mai 18 17:31  items.parquet
drwxrwxr-x 5 big big      4096 mai 18 19:16  metastore_db
-rw-rw-r-- 1 big big     29695 mai 18 19:16  Pre-Computing.ipynb
drwxr-xr-x 5 big big      4096 mai 18 17:32  spark-warehouse
total 124
drwxr-xr-x 5 big big  4096 mai 18 17:32 .
drwxrwxr-x 7 big big  4096 mai 18 19:16 ..
drwxr-xr-x 2 big big 53248 mai 18 17:32 itemsrecommendationtable
drwxr-xr-x 2 big big  4096 mai 18 17:31 itemstable
drwxr-xr-x 2 big big 53248 mai 18 17:32 userrecommendationtable


In [5]:
# Show the databases registered in the Spark catalog.
print(spark.catalog.listDatabases())

[Database(name='default', description='Default Hive database', locationUri='file:/home/big/Desktop/Aulas/Projeto/spark-warehouse')]


## Load Table to Dataframe

In [9]:
# Load every column of the `datatable` table into a DataFrame.
df_data = spark.sql("SELECT * FROM datatable")

Another way to load the same table is to read its files directly from the warehouse directory:

In [None]:
#df_data = spark.read.load("spark-warehouse/datatable")
#df_data.show()

In [10]:
# Preview the first 10 rows without truncating column values.
df_data.show(10,truncate=False)

+---+----------+----------+----------+
|_c2|_c3       |Item_Index|User_Index|
+---+----------+----------+----------+
|5.0|1362268800|3562.0    |9001.0    |
|4.0|1389398400|12294.0   |1960.0    |
|4.0|1362960000|1132.0    |185.0     |
|5.0|1405468800|14353.0   |9987.0    |
|5.0|1398470400|6896.0    |6297.0    |
|5.0|1271116800|2943.0    |3256.0    |
|5.0|1363392000|813.0     |14429.0   |
|5.0|1380067200|2351.0    |11613.0   |
|3.0|1385683200|320.0     |3944.0    |
|3.0|1246406400|6591.0    |1345.0    |
+---+----------+----------+----------+
only showing top 10 rows



In [12]:
# 80/20 train/test split with a fixed seed.
dftrain, dftest = df_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Mark the training split for caching; it is reused by the cells below.
dftrain.cache()

# Row counts, in order: train, test, full data set.
[frame.count() for frame in (dftrain, dftest, df_data)]

[13289, 3171, 16460]

In [None]:
# Print the column names and types of the training split.
dftrain.printSchema()

In [32]:
def getALSModel(train_df=None, user_col="ReviewerID", item_col="ItemID",
                rating_col="Rating"):
    """Fit an ALS recommender and return the fitted model.

    Parameters
    ----------
    train_df : DataFrame, optional
        Ratings to train on. When omitted, the notebook-level ``dftrain``
        is used, which keeps a bare ``getALSModel()`` call working.
    user_col, item_col, rating_col : str
        Names of the user, item and rating columns in ``train_df``.

    Returns
    -------
    The fitted model returned by ``ALS.fit``.
    """
    # NOTE(review): the schema printed in this notebook lists the columns as
    # User_Index / Item_Index / _c2, not ReviewerID / ItemID / Rating.
    # Confirm which names the current `datatable` really has, and pass them
    # explicitly if the defaults do not match.
    if train_df is None:
        train_df = dftrain

    als = ALS(maxIter=5,
              regParam=0.01,
              userCol=user_col,
              itemCol=item_col,  # the original literal had no closing quote (SyntaxError)
              ratingCol=rating_col,
              # Drop rows whose prediction is NaN (users/items unseen in training).
              coldStartStrategy="drop")

    return als.fit(train_df)

In [33]:
# Reuse a previously saved ALS model when it can be loaded; otherwise train a
# new one and persist it so later runs can skip training.
modelpath = "ALSmodel"

try:
    model = ALSModel.load(modelpath)
except Exception as load_error:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate. The error type is printed because a load
    # can fail for reasons other than a missing directory.
    print("Could not load model from '" + modelpath + "' ("
          + type(load_error).__name__ + ")")
    print("No model exists\nCreating Model...")
    model = getALSModel()
    print("Model created!")
    print("Saving Model...!")
    model.write().overwrite().save(modelpath)
    print("Model Saved!")
else:
    print("Model Found and Loaded!")

Model Found and Loaded!


In [27]:
# Print the column names and types of the training split.
dftrain.printSchema()

root
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- Item_Index: double (nullable = true)
 |-- User_Index: double (nullable = true)



In [28]:
# Score the held-out split with the model and report RMSE.
# NOTE(review): the schema printed above names the rating column `_c2`;
# confirm that a `Rating` column really exists in `dftest`.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)
predictions = model.transform(dftest)
rmse = evaluator.evaluate(predictions)

print(f"Root-mean-square-error = {rmse}")

Root-mean-square-error = 0.2112763291584686


In [29]:
# List the contents of the saved model directory.
! ls -la ./ALSmodel

total 20
drwxr-xr-x 5 big big 4096 mai 18 19:56 .
drwxrwxr-x 7 big big 4096 mai 18 19:59 ..
drwxr-xr-x 2 big big 4096 mai 18 17:31 itemFactors
drwxr-xr-x 2 big big 4096 mai 18 17:31 metadata
drwxr-xr-x 2 big big 4096 mai 18 17:31 userFactors


In [34]:
#users = df_itemtable.select(als.getUserCol()).distinct()
#items = df_itemtable.select(als.getItemCol()).distinct()

In [36]:
#users.show()
#items.show()
#[users.count(),items.count()]

In [35]:
# Length of each recommendation list: items per user, and users per item.
items_to_recommend = 5
users_to_recommend = 5

# Top items for every user, then top users for every item.
users_recommendations = model.recommendForAllUsers(numItems=items_to_recommend)
items_recommendations = model.recommendForAllItems(numUsers=users_to_recommend)

In [37]:
# Preview the per-user and per-item recommendation frames.
users_recommendations.show()
items_recommendations.show()

+----------+--------------------+
|User_Index|     recommendations|
+----------+--------------------+
|      3175|[[1, 5.932409], [...|
|      3997|[[2, 4.9202595], ...|
|      4935|[[268, 5.380145],...|
|      9427|[[256, 5.6797237]...|
|      9465|[[415, 5.9525847]...|
|     10817|[[10209, 4.411932...|
|     11317|[[0, 6.2972293], ...|
|     12027|[[56, 5.485071], ...|
|     13285|[[32, 9.136136], ...|
|     15447|[[27, 5.2653947],...|
|     15727|[[92, 3.9992356],...|
|     15957|[[265, 5.8108306]...|
|       148|[[448, 5.706791],...|
|      1088|[[60, 5.864487], ...|
|      1238|[[363, 3.9992745]...|
|      3794|[[20, 4.3474], [1...|
|      3918|[[207, 5.9743066]...|
|      5518|[[424, 6.5925517]...|
|      6654|[[394, 5.543542],...|
|      7554|[[418, 5.799959],...|
+----------+--------------------+
only showing top 20 rows

+----------+--------------------+
|Item_Index|     recommendations|
+----------+--------------------+
|       148|[[14406, 3.997264...|
|       471|[[6374, 5.

In [38]:
# Persist both recommendation frames as tables, replacing any earlier copy.
users_recommendations.write.saveAsTable("UserRecommendationTable", mode="overwrite")
items_recommendations.write.saveAsTable("ItemsRecommendationTable", mode="overwrite")