In [None]:
#LINK TO AMAZON DATASET: https://nijianmo.github.io/amazon/index.html#sample-metadata

In [None]:
import numpy as np
import pandas as pd
from random import randint

In [None]:
#Convert ReviewerID to unique integers
#new_reviewerID = random.sample(range(0, 4607047), 4607047)

#new_reviewerID = spark.createDataFrame(new_reviewerID, IntegerType()).collect()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, DoubleType, IntegerType
from pyspark.sql import functions as F

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder



# Attach to the already-running SparkSession, or start one if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Turn on eager evaluation so a bare DataFrame expression renders in the notebook.
spark.conf.set(key='spark.sql.repl.eagerEval.enabled', value=True)

In [None]:
# Load the raw ratings CSV. No schema or header option is given, so Spark names the
# columns _c0, _c1, ... (the rename cell below relies on those names).
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = spark.read.format('csv').load('/home/luca/Downloads/ratings_Movies_and_TV.csv')

In [None]:
# Display the id of the RDD backing `df` (debugging aid).
df.rdd.id()

In [None]:
# Inspect the schema Spark assigned to the freshly loaded CSV.
df.schema

In [None]:
# Look at the last 10 rows of the raw data.
df.tail(10)

In [None]:
# Give the positional CSV columns (_c0.._c3) meaningful names.
# See: https://stackoverflow.com/questions/34077353/how-to-change-dataframe-column-names-in-pyspark

df = df.select(
    F.col("_c0").alias("ReviewerID"),
    F.col("_c1").alias("ProductID"),
    F.col("_c2").alias("Rating"),
    F.col("_c3").alias("unixReviewTime"),
)

In [None]:
# Preview the renamed frame; the columns of interest are ReviewerID and ProductID.

df.show()

In [None]:
# Check the column types before casting.
df.dtypes

In [None]:
# ALS requires integer user/item ids. Casting the raw id strings straight to
# IntegerType produces NULL ids (see the "Value null was not numeric" ALS error
# recorded further down in this notebook) -- presumably because the Amazon ids are
# alphanumeric. Map every distinct id string to an integer index instead, so each
# reviewer/product keeps a stable, distinct id.
for id_col in ("ReviewerID", "ProductID"):
    idx_col = id_col + "_idx"
    # handleInvalid="skip" drops rows whose id is NULL: a rating that cannot be
    # attributed to a reviewer or product is unusable for ALS.
    indexer = StringIndexer(inputCol=id_col, outputCol=idx_col, handleInvalid="skip")
    df = indexer.fit(df).transform(df)
    # StringIndexer emits doubles; overwrite the string column in place (keeps the
    # column order) with the integer index and discard the helper column.
    df = df.withColumn(id_col, df[idx_col].cast(IntegerType())).drop(idx_col)

# Keep ratings as doubles so no fractional part is truncated (they are cast to
# DoubleType again before modelling anyway).
df = df.withColumn("Rating", df["Rating"].cast(DoubleType()))
df = df.withColumn("unixReviewTime", df["unixReviewTime"].cast(IntegerType()))

In [None]:
# Preview the frame after casting; the columns of interest are ReviewerID and ProductID.

df.show()

In [None]:
# Replace any missing ReviewerID with a random whole number in [0, 4607047].
# Adapted from: https://stackoverflow.com/questions/44153575/fill-na-with-random-numbers-in-pyspark
#
# The result is kept as a lazy Spark DataFrame: pulling every row onto the driver with
# collect() and re-uploading it with createDataFrame() is unnecessary and can exhaust
# driver memory on a dataset of this size. rand() is seeded so the fill is repeatable.
# NOTE(review): a random fill assigns each affected row to an arbitrary reviewer --
# confirm that this is really the intended treatment for missing ids.
new_df = df.withColumn(
    'ReviewerID',
    F.coalesce(F.col('ReviewerID'), F.round(F.rand(seed=42) * 4607047)),
)

In [None]:
# Preview the frame after filling missing reviewer ids.
new_df.show()

In [None]:
# Drop the notebook's reference to the intermediate frame; only new_df is used from here on.
del df

In [None]:
# Check the column types of new_df before the final casts.
new_df.dtypes

In [None]:
# Final column types for modelling: integer ids and timestamp, double rating.
target_types = {
    "ReviewerID": IntegerType(),
    "ProductID": IntegerType(),
    "Rating": DoubleType(),
    "unixReviewTime": IntegerType(),
}
for column_name, spark_type in target_types.items():
    new_df = new_df.withColumn(column_name, new_df[column_name].cast(spark_type))

In [None]:
# Confirm the casts took effect.
new_df.dtypes

In [None]:
# Preview the frame that will be split into training and test sets.
new_df.show()

In [None]:
# Id values handed to ALS must fit Spark's IntegerType range, -2147483648 to 2147483647:
# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
# Count the rows to see how large the dataset is.

new_df.count()

In [None]:
# 80/20 train/test split. The seed makes the split identical on every run, so
# results obtained further down can be reproduced after a kernel restart.
training, test = new_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# First attempt: a quick ALS fit with hand-picked hyper-parameters.
first_try_params = dict(
    maxIter=5,
    regParam=0.01,
    userCol="ReviewerID",
    itemCol="ProductID",
    ratingCol="Rating",
    coldStartStrategy="nan",
)
als = ALS(**first_try_params)
model = als.fit(training)

In [None]:
"""ERROR: java.lang.IllegalArgumentException: 
ALS only supports values in Integer range for columns ReviewerID and ProductID.
Value null was not numeric."""

# Look for the NULL ids the error above complains about, one id column at a time.
for id_col in ("ReviewerID", "ProductID"):
    training.filter(f"{id_col} is NULL").show()

In [None]:
# Discard every training row that contains a NULL, then re-run the NULL checks,
# which should now show no rows.
training = training.na.drop()
for id_col in ("ReviewerID", "ProductID"):
    training.filter(f"{id_col} is NULL").show()

In [None]:
# Base ALS estimator for the hyper-parameter search; rank, maxIter and regParam are
# left at their defaults here and supplied by the parameter grid.
als = (
    ALS()
    .setUserCol("ReviewerID")
    .setItemCol("ProductID")
    .setRatingCol("Rating")
    .setColdStartStrategy("drop")
)

In [None]:
# Search grid: 3 ranks x 3 iteration counts x 3 regularisation strengths = 27 candidates.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [12, 13, 14])
    .addGrid(als.maxIter, [18, 19, 20])
    .addGrid(als.regParam, [0.17, 0.18, 0.19])
    .build()
)

In [None]:
# RMSE between the true rating and the ALS prediction.
# labelCol must name the rating column exactly as it appears in the DataFrame
# ("Rating", the same name passed to ALS as ratingCol); the previous lowercase
# "rating" did not match the actual column name.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)

In [None]:
# Train/validation split tuner: fits `als` for every grid entry and scores each
# candidate with `evaluator`.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

In [None]:
# Run the hyper-parameter search on the training set (one ALS fit per grid entry).
model = tvs.fit(training)

In [None]:
# Keep the model that the search selected as best.
best_model = model.bestModel