<a href="https://colab.research.google.com/github/LinLin-LL/product_recommendation/blob/run_in_kaggle/ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
from google.colab import files
files.upload()

In [4]:
!ls

ratings.csv  sample_data


In [5]:
spark = SparkSession \
    .builder \
    .appName("ratings") \
    .getOrCreate()

In [6]:
df_rating = spark.read.csv('ratings.csv', header=True, inferSchema=True)
#df_rating.printSchema()

In [7]:
users = df_rating.select('user').distinct()
users = users.coalesce(1)
users = users.withColumn(
"userIntId", monotonically_increasing_id()).persist()
#users.show()

In [8]:
items = df_rating.select('item').distinct()
items = items.coalesce(1)
items = items.withColumn(
"itemIntId", monotonically_increasing_id()).persist()
#items.show()

In [9]:
ratings_w_int_ids = df_rating.join(
users, "user", "left").join(items, "item", "left")
#ratings_w_int_ids.show()

In [10]:
ratings_data = ratings_w_int_ids.select(
                                        col("userIntId").alias("userId"),
                                        col("itemIntId").alias("itemId"),
                                        col("rating"))
#ratings_data.show()

In [11]:
ratings_data.count()

70

In [12]:
# Split the ratings dataframe into training and test data
(training_data, test_data) = ratings_data.randomSplit([0.8, 0.2], seed=42)

## Fit a base model

In [13]:
als = ALS(userCol="userId", itemCol="itemId", ratingCol="rating", 
         rank = 3, maxIter = 10, regParam =0.1,
          coldStartStrategy="drop", nonnegative =True, implicitPrefs = False)

In [14]:
# Fit the mdoel to the training_data
model = als.fit(training_data)

In [15]:
# Generate predictions on test_data
predictions = model.transform(test_data)

In [16]:
# Tell Spark how to evaluate predictions
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
predictionCol="prediction")

In [19]:
# Obtain and print RMSE
rmse = evaluator.evaluate(predictions)
print('RMSE: ', rmse)

RMSE:  3.084377655027626


## Tune parameters and CV

In [45]:
# Build generic ALS model without hyperparameters
als = ALS(userCol="userId", itemCol="itemId", ratingCol="rating", 
          coldStartStrategy="drop", nonnegative = True,
          implicitPrefs = False)

# Creates a ParamGridBuilder
param_grid = ParamGridBuilder().addGrid(als.rank, [3,4,5]).addGrid(als.maxIter, [2,3,4]).addGrid(als.regParam, [.05, .1, .15]).build()

# Complete the evaluator code
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

cv = CrossValidator(estimator = als,
                    estimatorParamMaps = param_grid,
                    evaluator = evaluator,
                    numFolds = 3)



In [46]:
# Run the cv on the training data
model = cv.fit(training_data)

In [47]:
# Extract best combination of values from cross validation
best_model = model.bestModel

In [48]:
# Generate test set predictions and evaluate using RMSE
predictions = best_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
# Print evaluation metrics and model parameters
print ("**Best Model**")
print ("RMSE =", rmse)
print (" Rank: ", best_model.rank)
print (" MaxIter: ", best_model._java_obj.parent().getMaxIter())
print (" RegParam: ", best_model._java_obj.parent().getRegParam())

**Best Model**
RMSE = 2.722178651695816
 Rank:  4
 MaxIter:  3
 RegParam:  0.1


In [49]:
# Generate predictions on the test_data
test_predictions = best_model.transform(test_data)
test_predictions.collect()

[Row(userId=44, itemId=12, rating=5, prediction=4.929352283477783),
 Row(userId=34, itemId=9, rating=5, prediction=1.1509063243865967)]