## Part 0: Environment Setup and Data Loading

### Part 0.1: Install Packages and Import Libraries

In [None]:
# JVM and Spark download
!apt-get install openjdk-8-jdk-headless -qq > /dev/null  
!wget -q https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz
# Enviornment configuration
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.3.0-bin-hadoop3'
# Find Spark
!pip install -q findspark
import findspark
findspark.init()
findspark.find()
# Create a Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
from pyspark.sql.functions import col
spark

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

### Part 0.2: Load Data

In [3]:
# Mount Google Drive Folder
# The dataset paths used when loading the CSV files live under
# /content/drive, so the drive must be mounted before they are read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the movie catalogue and the user ratings from CSV files on the
# mounted drive; the first line of each file is treated as the header.
path_movies = '/content/drive/MyDrive/Projects/Datasets/Movie/Netflix_Dataset_Movie.csv'
path_ratings = '/content/drive/MyDrive/Projects/Datasets/Movie/Netflix_Dataset_Rating.csv'
df_movies = spark.read.format('csv').option('header', True).load(path_movies)
df_ratings = spark.read.format('csv').option('header', True).load(path_ratings)

### Part 0.3: View the Raw Data

In [5]:
# Preview the first 10 rows of the movie table
df_movies.show(n=10)

+--------+----+--------------------+
|Movie_ID|Year|                Name|
+--------+----+--------------------+
|       1|2003|     Dinosaur Planet|
|       2|2004|Isle of Man TT 20...|
|       3|1997|           Character|
|       4|1994|Paula Abdul's Get...|
|       5|2004|The Rise and Fall...|
|       6|1997|                Sick|
|       7|1992|               8 Man|
|       8|2004|What the #$*! Do ...|
|       9|1991|Class of Nuke 'Em...|
|      10|2001|             Fighter|
+--------+----+--------------------+
only showing top 10 rows



In [6]:
# Compare the total row count with the number of distinct movie IDs
# (equal counts mean no Movie_ID appears twice).
n_movie_rows = df_movies.count()
print('Number of rows in movie dataset: {}'.format(n_movie_rows))
n_distinct_movies = df_movies.select('Movie_ID').distinct().count()
print('Number of distinct movies in movie dataset: {}'.format(n_distinct_movies))

Number of rows in movie dataset: 17770
Number of distinct movies in movie dataset: 17770


In [7]:
# Preview the first 10 rows of the rating table
df_ratings.show(n=10)

+-------+------+--------+
|User_ID|Rating|Movie_ID|
+-------+------+--------+
| 712664|     5|       3|
|1331154|     4|       3|
|2632461|     3|       3|
|  44937|     5|       3|
| 656399|     4|       3|
| 439011|     1|       3|
|1644750|     3|       3|
|2031561|     4|       3|
| 616720|     4|       3|
|2467008|     4|       3|
+-------+------+--------+
only showing top 10 rows



In [8]:
# Compare the total row count with the number of distinct (movie, user)
# pairs (equal counts mean no user rated the same movie twice).
n_rating_rows = df_ratings.count()
print('Number of rows in the rating dataset: {}'.format(n_rating_rows))
n_distinct_pairs = df_ratings.select('Movie_ID', 'User_ID').distinct().count()
print('Number of distinct movie-user pairs in rating dataset: {}'.format(n_distinct_pairs))

Number of rows in the rating dataset: 17337458
Number of distinct movie-user pairs in rating dataset: 17337458


## Part 2: Model Training and Evaluation

### Part 2.1: Data Processing

In [9]:
from pyspark.sql.types import IntegerType, FloatType

# Cast columns to the numeric types used downstream:
#   User_ID, Movie_ID, Year -> integer
#   Rating                  -> float
df_movies = (
    df_movies
    .withColumn("Movie_ID", col("Movie_ID").cast(IntegerType()))
    .withColumn("Year", col("Year").cast(IntegerType()))
)
df_movies.printSchema()

df_ratings = (
    df_ratings
    .withColumn("User_ID", col("User_ID").cast(IntegerType()))
    .withColumn("Movie_ID", col("Movie_ID").cast(IntegerType()))
    .withColumn("Rating", col("Rating").cast(FloatType()))
)
df_ratings.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Movie_ID: integer (nullable = true)



In [10]:
# Randomly split the ratings 80% / 20% into training and test sets
# (fixed seed so the split is repeatable).
train, test = df_ratings.randomSplit(weights=[0.8, 0.2], seed=1)

### Part 2.2: Model Training and Hyperparameter Tuning

In [11]:
from pyspark.ml.recommendation import ALS

# Baseline ALS estimator; rank, maxIter, regParam and alpha given here are
# starting values that the grid search may override.
model_als = ALS(
    userCol="User_ID",
    itemCol="Movie_ID",
    ratingCol="Rating",
    rank=10,
    maxIter=5,
    regParam=1e-2,
    alpha=1e-2,
    # Drop rows whose prediction is NaN so evaluation metrics stay defined.
    coldStartStrategy="drop",
    seed=1,
)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Find optimal hyperparameters by grid search cross validation.
# `alpha` is intentionally NOT part of the grid: ALS only uses it when
# implicitPrefs=True, and `model_als` is an explicit-feedback model (the
# default). Sweeping it would triple the number of fits (81 instead of 27
# combinations, each trained on 3 folds) while producing identical models.
params = (
    ParamGridBuilder()
    .addGrid(model_als.rank, [5, 10, 15])
    .addGrid(model_als.maxIter, [5, 10, 15])
    .addGrid(model_als.regParam, [1e-2, 1e-1, 1])
    .build()
)
# RMSE evaluator (lower is better)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Rating",
    metricName="rmse"
)
# 3-fold cross validation over every combination in `params`
cv = CrossValidator(
    estimator=model_als,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=3,
    seed=1
)
# Model fitting (long-running: 27 combinations x 3 folds on the training set)
cv_model = cv.fit(train)

In [None]:
# Select the grid entry with the lowest average cross-validated RMSE
# and print each tuned hyperparameter with its chosen value.
best_index = np.argmin(cv_model.avgMetrics)
best_params = cv_model.getEstimatorParamMaps()[best_index]
print('Best Set of Model Parameters')
for hyperparam, setting in best_params.items():
    print('{}: {}'.format(hyperparam.name, setting))

In [None]:
# Training RMSE of the cross-validated model.
# No retraining happens in this cell: `cv_model.transform` scores `train`
# with the best model selected by the grid search above
# (NOTE(review): per the PySpark docs, CrossValidator refits the best
# parameter set on the full training data at the end of `fit` - confirm).
pred_train = cv_model.transform(train)
rmse_train = evaluator.evaluate(pred_train)
print("Training Root Mean Square Error = {}".format(rmse_train))

### Part 2.3: Model Testing

In [None]:
# Test RMSE
# Score the held-out 20% split with the cross-validated model.
# The ALS estimator was built with coldStartStrategy="drop", so rows that
# would get a NaN prediction are dropped and do not contribute to the RMSE.
pred_test = cv_model.transform(test)
rmse_test = evaluator.evaluate(pred_test)
print("Test Root Mean Square Error = {}".format(rmse_test))

In [None]:
# Preview the first 10 test-set predictions next to the true ratings
pred_test.show(n=10)

### Part 2.4: Refit the Model to the Entire Dataset

In [None]:
# Get best model from cross validation result
best_model = cv_model.bestModel
# Param map with the lowest average cross-validated RMSE. Using the public
# CrossValidatorModel API avoids reaching into the private
# `best_model._java_obj.parent()` handle to recover the hyperparameters.
best_param_map = cv_model.getEstimatorParamMaps()[np.argmin(cv_model.avgMetrics)]
# Create a new ALS estimator with the optimal parameter setting: a copy of
# `model_als` (same columns, cold-start strategy and seed) with the tuned
# hyperparameters applied on top.
new_model_als = model_als.copy(best_param_map)
# Train the new model using the entire dataset (train + test)
fully_trained_model = new_model_als.fit(df_ratings)

### Part 2.5: Save the Model

In [None]:
# Save the model
model_path = '/content/drive/MyDrive/Projects/Movie_Recommendation_System/model_als/'
# Plain `save(path)` raises if the path already exists, which makes this cell
# fail on every re-run of the notebook; overwrite the previous export instead.
fully_trained_model.write().overwrite().save(model_path)