## Part 0: Environment Setup and Data Loading

### Part 0.1: Install Packages and Import Libraries

In [1]:
# JVM and Spark download
# Install a headless Java 8 JDK quietly; the command's stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null  
# Download the Spark 3.3.0 (Hadoop 3 build) archive and unpack it in the current directory.
# NOTE(review): the download URL is pinned to one release on one mirror; confirm it still resolves.
!wget -q https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz
# Environment configuration
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# NOTE(review): assumes the archive above was unpacked under /content (the usual Colab working directory) - confirm.
os.environ["SPARK_HOME"] = '/content/spark-3.3.0-bin-hadoop3'
# Find Spark
!pip install -q findspark
import findspark
findspark.init()
# The value returned by find() is not used here.
findspark.find()
# Create a Spark session
from pyspark.sql import SparkSession
# "local[*]" runs Spark locally inside this notebook's machine rather than on a cluster.
spark = SparkSession.builder.master("local[*]").getOrCreate()
from pyspark.sql.functions import col
# Last expression of the cell: displays the session summary.
spark

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

### Part 0.2: Load Data

In [3]:
# Mount Google Drive Folder
# The dataset and model paths used later in this notebook are all located under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load datasets
# Both files are CSVs whose first line holds the column names.
path_movies = '/content/drive/MyDrive/Projects/Datasets/Movie/Netflix_Dataset_Movie.csv'
path_ratings = '/content/drive/MyDrive/Projects/Datasets/Movie/Netflix_Dataset_Rating.csv'
# Reader options shared by the two loads
csv_options = {'format': 'csv', 'header': True}
df_movies = spark.read.load(path_movies, **csv_options)
df_ratings = spark.read.load(path_ratings, **csv_options)

### Part 0.3: View the Raw Data

In [5]:
# Show movies
# Print the first 10 rows of the movie dataset (columns: Movie_ID, Year, Name).
df_movies.show(10)

+--------+----+--------------------+
|Movie_ID|Year|                Name|
+--------+----+--------------------+
|       1|2003|     Dinosaur Planet|
|       2|2004|Isle of Man TT 20...|
|       3|1997|           Character|
|       4|1994|Paula Abdul's Get...|
|       5|2004|The Rise and Fall...|
|       6|1997|                Sick|
|       7|1992|               8 Man|
|       8|2004|What the #$*! Do ...|
|       9|1991|Class of Nuke 'Em...|
|      10|2001|             Fighter|
+--------+----+--------------------+
only showing top 10 rows



In [6]:
# Number of movies
# If the two counts printed below are equal, every row carries a unique Movie_ID.
print('Number of rows in movie dataset: {}'.format(df_movies.count()))
print('Number of distinct movies in movie dataset: {}'.format(df_movies.select('Movie_ID').distinct().count()))

Number of rows in movie dataset: 17770
Number of distinct movies in movie dataset: 17770


In [7]:
# Show ratings
# Print the first 10 rows of the rating dataset (columns: User_ID, Rating, Movie_ID).
df_ratings.show(10)

+-------+------+--------+
|User_ID|Rating|Movie_ID|
+-------+------+--------+
| 712664|     5|       3|
|1331154|     4|       3|
|2632461|     3|       3|
|  44937|     5|       3|
| 656399|     4|       3|
| 439011|     1|       3|
|1644750|     3|       3|
|2031561|     4|       3|
| 616720|     4|       3|
|2467008|     4|       3|
+-------+------+--------+
only showing top 10 rows



In [8]:
# Number of ratings
# If the two counts printed below are equal, no user rated the same movie more than once.
print('Number of rows in the rating dataset: {}'.format(df_ratings.count()))
print('Number of distinct movie-user pairs in rating dataset: {}'.format(df_ratings.select('Movie_ID', 'User_ID').distinct().count()))

Number of rows in the rating dataset: 17337458
Number of distinct movie-user pairs in rating dataset: 17337458


### Part 0.4: Data Type Conversion

In [9]:
from pyspark.sql.types import IntegerType, FloatType

# Data type conversion
# The CSV loader was given no schema, so the columns are converted here:
#   - movie dataset:  Movie_ID, Year    -> Integer
#   - rating dataset: User_ID, Movie_ID -> Integer, Rating -> Float
df_movies = df_movies \
    .withColumn("Movie_ID", col("Movie_ID").cast(IntegerType())) \
    .withColumn("Year", col("Year").cast(IntegerType()))
df_movies.printSchema()

df_ratings = df_ratings \
    .withColumn("User_ID", col("User_ID").cast(IntegerType())) \
    .withColumn("Movie_ID", col("Movie_ID").cast(IntegerType())) \
    .withColumn("Rating", col("Rating").cast(FloatType()))
df_ratings.printSchema()

root
 |-- Movie_ID: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Name: string (nullable = true)

root
 |-- User_ID: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Movie_ID: integer (nullable = true)



## Part 3: Model Applications

### Part 3.0: Load Model

In [10]:
from pyspark.ml.recommendation import ALS, ALSModel

# Load a saved ALS model from Google Drive.
# NOTE(review): presumably this is the model trained and saved in a part of the notebook not shown here - confirm.
model_path = '/content/drive/MyDrive/Projects/Movie_Recommendation_System/model/'
model = ALSModel.load(model_path)

### Part 3.1: Find Similar Movies of a Movie

- The ALS model decomposes the rating matrix into a user feature matrix and a movie feature matrix. 

<p align="center">
<img align="center" src="https://drive.google.com/uc?export=view&id=1KtMMnXW4qbADzotLPG5MovVVW4ojegvR" height="300"/>
</p>

- By utilizing the movie features derived from the model, the similarity of different movies can be measured.
- There are several ways to determine how different a movie is from another, including the Euclidean distance and the cosine similarity of the movie feature vectors. The latter one is implemented below.

In [11]:
# The rank of the ALS model is the number of latent features per movie
num_features = model.rank
# Column layout of the flattened table: the movie id followed by one column per feature
feature_columns = ['Movie_ID'] + ['Feature{}'.format(i) for i in range(num_features)]
# Flatten each (id, feature list) row of the item-factor table into a single flat tuple
movie_features = model.itemFactors.rdd \
    .map(lambda row: (row[0], *row[1])) \
    .toDF(feature_columns)
# Register the table so that it can also be queried with Spark SQL
movie_features.createOrReplaceTempView('movie_features')
# Preview the first rows
movie_features.toPandas().head(5)

Unnamed: 0,Movie_ID,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,Feature11,Feature12,Feature13,Feature14
0,32,-0.04289,0.517238,0.619658,0.905652,0.224716,0.597838,0.110501,-0.524302,-0.076709,-0.399856,0.791513,-0.756893,-0.137386,0.270466,0.886297
1,52,0.841939,0.451016,0.536448,0.11468,0.464422,-0.087567,-0.308616,-1.05833,-0.108652,-0.36496,0.496399,-0.250053,0.066643,-0.044197,0.460208
2,122,0.501497,0.116985,-0.053062,0.488991,0.617323,0.430436,-0.204783,-0.716217,0.104865,-0.358799,0.460777,-0.328718,0.057209,0.233919,0.086399
3,152,0.456784,0.629339,0.662633,0.625439,0.191476,0.440126,-0.188744,-0.928863,0.042489,-0.366424,0.542688,-0.491919,0.043069,0.072368,0.432372
4,232,0.330226,0.388321,0.325928,0.629805,0.13261,0.258383,-0.157101,-0.722242,0.233883,-0.256399,0.713322,-0.55612,-0.059002,0.227679,0.102587


In [12]:
# Use cosine similarity to find similar movies of some movie.
def find_similar_movies(movie_id, num_movies):
    '''
    Find similar movies of a movie. Movies' similarity is measured by the cosine
    similarity between the movie feature vectors.

    The function reads two notebook-level DataFrames: `movie_features` (one row
    per movie: Movie_ID followed by the feature columns) and `df_movies`
    (Movie_ID, Year, Name).

    Parameters:
        movie_id (int): id of the movie to investigate
        num_movies (int): the number of similar movies to return

    Returns:
        result (Pandas DataFrame): the most similar movies to the given movie
            (columns Movie_ID, Year, Name, Similarity), sorted by decreasing
            similarity. None is returned, after printing a message, when the
            movie has no feature vector.
    '''
    # Retrieve the feature vector of this movie. The lookup goes through the
    # DataFrame API so that movie_id is never pasted into SQL text.
    matches = movie_features.filter(movie_features.Movie_ID == movie_id).take(1)
    # If movie is not found, print failure message.
    if not matches:
        print('Movie ID {} not found in dataset.'.format(movie_id))
        return None

    # Cosine similarity is given by
    #     cos<v1, v2> = v1 · v2 / ||v1|| ||v2||
    # in which v1, v2 are the feature vectors of the two movies to compare.
    # v1: feature vector of the movie with ID movie_id (first field is the id)
    v1 = tuple(matches[0][1:])
    # v1_norm: the norm of v1
    v1_norm = sum(v1_i * v1_i for v1_i in v1) ** 0.5

    def cosine_similarity(row):
        # Map a movie_features row to (movie id, cosine similarity with v1).
        v2 = row[1:]
        v2_norm = sum(v2_i * v2_i for v2_i in v2) ** 0.5
        norm_product = v1_norm * v2_norm
        # A zero vector has no direction; report 0.0 instead of dividing by zero.
        if norm_product == 0:
            return (row[0], 0.0)
        inner_product = sum(v1_i * v2_i for v1_i, v2_i in zip(v1, v2))
        return (row[0], inner_product / norm_product)

    # Compute cosine similarities between all other movies and this movie,
    # then keep only the num_movies highest ones. Ordering and limiting on the
    # DataFrame makes the top-N selection explicit.
    similarities = movie_features.rdd \
        .filter(lambda row: row[0] != movie_id) \
        .map(cosine_similarity) \
        .toDF(['Movie_ID', 'Similarity']) \
        .orderBy('Similarity', ascending=False) \
        .limit(num_movies)

    # Retrieve the most similar movies and get their info: movie id, year, name
    result = df_movies.join(
        similarities,
        df_movies.Movie_ID == similarities.Movie_ID
    ) \
    .select(df_movies['Movie_ID'], 'Year', 'Name', 'Similarity') \
    .orderBy('Similarity', ascending=False) \
    .toPandas()

    return result

In [13]:
# Example movie to investigate
movie_id = 1020
# Index [0] takes the first matching row (this assumes the id exists in df_movies);
# index [2] is the Name column (column order: Movie_ID, Year, Name).
movie_name = df_movies.filter(df_movies.Movie_ID == movie_id).collect()[0][2]
print('The movie with ID {} is: \n{}'.format(movie_id, movie_name))

The movie with ID 1020 is: 
The Simpsons: Season 1


In [14]:
# Number of similar movies to list
num_movies = 5
print('{} movies that are similar to {}'.format(num_movies, movie_name))
# Last expression of the cell: the returned Pandas DataFrame is displayed.
find_similar_movies(movie_id, num_movies)

5 movies that are similar to The Simpsons: Season 1


Unnamed: 0,Movie_ID,Year,Name,Similarity
0,3046,1990,The Simpsons: Treehouse of Horror,0.999409
1,2172,1991,The Simpsons: Season 3,0.998042
2,4115,1999,The Simpsons: Bart Wars,0.997793
3,2102,1994,The Simpsons: Season 6,0.997486
4,3444,2004,Family Guy: Freakin' Sweet Collection,0.991435


- The five movies similar to <em>The Simpsons: Season 1</em> are:
    - <em>The Simpsons: Treehouse of Horror</em>
    - <em>The Simpsons: Season 3</em>
    - <em>The Simpsons: Bart Wars</em>
    - <em>The Simpsons: Season 6</em>
    - <em>Family Guy: Freakin' Sweet Collection</em>

- Four of them are in <em>The Simpsons</em> series, indicating that cosine similarity between movie feature vectors is a good indicator of movie similarity.

### Part 3.2: Recommend Movies to a Current User

- This section generates movie recommendations for an existing user in the dataset, say user #1331154. Before recommending movies to this user, let's first have a look at some of his/her favorite movies.

In [15]:
def get_current_favorite_movies_of_user(user_id, recommendation):
    '''
    Retrieve some favorite movies of a user. Favorite movies of a user are the 
    movies to which the user gave the highest rating (mostly 5). Movies are
    sorted by the rating given by this user, and the top few movies are returned.

    The function reads the notebook-level DataFrames `df_ratings` and `df_movies`.

    Parameters:
        user_id (int): the id of the user to investigate
        recommendation (int): the number of favorite movies to return (the
            parameter keeps its original name so that existing calls keep working)

    Returns:
        favorite_movies (Pandas DataFrame): a dataframe of the user's favorite
            movies (columns Movie_ID, Year, Name, Rating). None is returned,
            after printing a message, when the user has no rated movies.
    '''
    # Use the argument that was passed in; the count must not depend on a
    # notebook-level variable that happens to be called num_movies.
    num_movies = recommendation
    # The ratings given by this user
    df_user_subset = df_ratings.where(df_ratings.User_ID == user_id)
    # The movies to which the user gave the highest rating
    favorite_movies = df_user_subset.join(df_movies, df_user_subset.Movie_ID == df_movies.Movie_ID) \
                                    .orderBy('Rating', ascending=False) \
                                    .select(df_user_subset['Movie_ID'], 'Year', 'Name', 'Rating') \
                                    .limit(num_movies).toPandas()
    # If no movies found, print failure message.
    if favorite_movies.shape[0] == 0:
        print('User ID {} not found in dataset.'.format(user_id))
        return None
    # Return the dataframe if the movies are found.
    return favorite_movies

In [16]:
# Example user to investigate
user_id = 1331154
# num_movies is the notebook-level count set in an earlier cell.
print('The {} favorite movies of user {}'.format(num_movies, user_id))
# Last expression of the cell: the returned Pandas DataFrame is displayed.
get_current_favorite_movies_of_user(user_id, num_movies)

The 5 favorite movies of user 1331154


Unnamed: 0,Movie_ID,Year,Name,Rating
0,2795,2001,The X-Files: Season 9,5.0
1,3650,1983,War Games,5.0
2,2913,2004,Finding Neverland,5.0
3,2743,2002,The Pianist,5.0
4,3333,2004,The Village,5.0


- Now recommend some movies to user #1331154.

In [17]:
def recomend_movies_to_user(user_id, num_recommendations, model):
    '''
    Recommend movies to an existing user in the dataset using the given model.

    The function reads the notebook-level DataFrames `df_ratings` and `df_movies`.

    Parameters:
        user_id (int): the id of the user to recommend movies
        num_recommendations (int): the number of movies to recommend
        model (ALSModel): the recommendation model

    Returns:
        result (Pandas DataFrame): a dataframe of the recommended movies
            (columns Movie_ID, Year, Name, predicted_ratings), best first and
            indexed from 1. It may hold fewer than num_recommendations rows if
            the model returns fewer. None is returned, after printing a
            message, when the model has no recommendations for the user.
    '''
    # The ratings given by this user
    df_user_subset = df_ratings.where(df_ratings.User_ID == user_id)
    # Use the model to give movie recommendations
    recommendations = model.recommendForUserSubset(df_user_subset, num_recommendations).toPandas()
    # If no movies found, print failure message.
    if recommendations.shape[0] == 0:
        print('User ID {} not found in dataset.'.format(user_id))
        return None
    # Row 0, column 1 holds the list of (movie id, predicted rating) pairs for the user
    recommendations = recommendations.iloc[0, 1]
    recommendations = pd.DataFrame(recommendations, columns=['Movie_ID', 'predicted_ratings'])
    # Fetch the info (movie id, year, name) of all recommended movies in one
    # query instead of one filter + union per movie.
    recommended_ids = recommendations['Movie_ID'].tolist()
    movies_info = df_movies.where(df_movies.Movie_ID.isin(recommended_ids)).toPandas()
    # Attach the info by Movie_ID rather than by row position, so that each
    # predicted rating stays next to its own movie. A left merge keeps the
    # model's ranking order; a movie missing from df_movies keeps its rating
    # and gets empty Year/Name.
    result = recommendations.merge(movies_info, on='Movie_ID', how='left')
    result = result[['Movie_ID', 'Year', 'Name', 'predicted_ratings']]
    # Number the rows from 1, using the number of rows actually obtained
    result.index = range(1, len(result) + 1)
    return result

In [18]:
# Recommend 5 movies to user 1331154
# user_id and model were set in earlier cells.
num_recommendations = 5
print('The {} movies recommended to user {}'.format(num_recommendations, user_id))
recommendations = recomend_movies_to_user(user_id, num_recommendations, model)
# Last expression of the cell: the recommendation table is displayed.
recommendations

The 5 movies recommended to user 1331154


Unnamed: 0,Movie_ID,Year,Name,predicted_ratings
1,3456,2004,Lost: Season 1,4.632281
2,4427,2001,The West Wing: Season 3,4.550875
3,1476,2004,Six Feet Under: Season 4,4.455322
4,1947,2002,Gilmore Girls: Season 3,4.390172
5,3962,2003,Finding Nemo (Widescreen),4.382888


### Part 3.3: Recommend Movies to a New User and Model Retraining

- Suppose we would like to recommend movies to a new user, whose movie ratings are not included in the training dataset we used before.
- What we know now is just the IDs of the movies that this new user has just watched.

In [19]:
# The movies that the new user just watched
# (ids from the Movie_ID column of the movie dataset)
movie_ids = [1020, 3456, 3650]

- First have a look at the detailed information about these movies.

In [20]:
# Show the rows of the watched movies, ordered by id; truncate=False prints the full movie names.
df_movies.filter(df_movies.Movie_ID.isin(movie_ids)) \
         .orderBy('Movie_ID', ascending=True) \
         .show(truncate=False)

+--------+----+----------------------+
|Movie_ID|Year|Name                  |
+--------+----+----------------------+
|1020    |1989|The Simpsons: Season 1|
|3456    |2004|Lost: Season 1        |
|3650    |1983|War Games             |
+--------+----+----------------------+



- Now define a function to make recommendations to a new user.

In [21]:
from pyspark.sql.functions import lit

def augment_dataset(new_user_id, movie_ids, df_movies, df_ratings):
    '''
    Augment the rating dataset by adding the information of the new user. New
    rows are appended to the original rating DataFrame to accommodate the new 
    user.

    The function uses the notebook-level Spark session `spark` to build the
    new rows.

    Parameters:
        new_user_id (int): the id assigned to the new user
        movie_ids (list): the id list of the movies that the new user watched
        df_movies (Spark DataFrame): the movie dataset
        df_ratings (Spark DataFrame): the original rating dataset

    Returns:
        augmented_ratings (Spark DataFrame): the augmented rating dataset. When
            none of the given movie ids exists in df_movies, a message is
            printed and df_ratings is returned unchanged.
    '''
    # No duplicates (the first occurrence of each id is kept, in order)
    candidate_ids = list(dict.fromkeys(movie_ids))

    # Check the validity of the movie ids
    # The movie has to be in the dataset. All candidates are checked with a
    # single query instead of one Spark job per id.
    known_ids = set()
    if candidate_ids:
        known_ids = {
            row[0]
            for row in df_movies.filter(df_movies.Movie_ID.isin(candidate_ids))
                                .select('Movie_ID')
                                .distinct()
                                .collect()
        }
    movie_ids = [movie_id for movie_id in candidate_ids if movie_id in known_ids]

    # Nothing to append: an empty row list would also make schema inference
    # in createDataFrame fail.
    if not movie_ids:
        print('None of the given movie IDs were found in the dataset.')
        return df_ratings

    # The maximum movie rating given by the previous users (5 in this case)
    max_rating = df_ratings.agg({'Rating': 'max'}).collect()[0][0]

    # Generate rows for the new user
    # It is assumed that the user will give the max rating to the movies watched
    new_rows = [(new_user_id, max_rating, movie_id) for movie_id in movie_ids]
    columns = ['User_ID', 'Rating', 'Movie_ID']
    appendix = spark.createDataFrame(new_rows, columns)
    # Append the new rows to the original rating dataframe. Columns are matched
    # by name, so the result does not depend on the column order of df_ratings.
    augmented_ratings = df_ratings.unionByName(appendix)

    return augmented_ratings

def dataset_for_prediction(new_user_id, movie_ids, df_movies):
    '''
    Build the input dataframe for rating prediction for a new user.

    The dataframe has one row per movie that the new user has NOT watched yet,
    with two integer columns: Movie_ID (the movie) and User_ID (always the id
    of the new user).

    Parameters:
        new_user_id (int): the id assigned to the new user
        movie_ids (list): the id list of the movies that the user watched
        df_movies (Spark DataFrame): the movie dataset

    Returns:
        result (Spark DataFrame): the dataframe for rating prediction
    '''
    # Every distinct movie id of the movie dataset
    all_movie_ids = df_movies.select('Movie_ID').distinct()
    # Drop the movies already watched by the new user
    unseen_movies = all_movie_ids.filter(~df_movies.Movie_ID.isin(movie_ids))
    # Add the user id as a constant integer column
    with_user = unseen_movies.withColumn('User_ID', lit(new_user_id).cast(IntegerType()))
    # Make sure the movie id is an integer as well
    return with_user.withColumn("Movie_ID", with_user["Movie_ID"].cast(IntegerType()))

def recomend_movies_to_new_user(movie_ids, num_recommendations, df_movies, df_ratings, model_params):
    '''
    Recommend movies to a new user by retraining an ALS model on the rating
    dataset extended with rows for that user.

    Parameters:
        movie_ids (list): the id list of the movies that the user watched
        num_recommendations (int): the number of movies to recommend
        df_movies (Spark DataFrame): the movie dataset
        df_ratings (Spark DataFrame): the rating dataset
        model_params (dict): the ALS model parameters; the keys 'rank',
            'maxIter', 'regParam' and 'alpha' are required

    Returns:
        recommended_movies (Spark DataFrame): the movies recommended to the new
            user (columns Movie_ID, Year, Name, prediction), ordered by
            decreasing predicted rating
    '''
    # The new user gets the id right after the largest existing user id
    largest_user_id = df_ratings.agg({'User_ID': 'max'}).collect()[0][0]
    new_user_id = largest_user_id + 1

    # Training data for the retraining: the original ratings plus the new user's rows
    training_ratings = augment_dataset(new_user_id, movie_ids, df_movies, df_ratings)

    # Read the hyperparameters from the given dictionary
    rank = model_params['rank']
    max_iter = model_params['maxIter']
    reg_param = model_params['regParam']
    alpha = model_params['alpha']
    # NOTE(review): implicitPrefs is left at its default here - confirm whether alpha has any effect.
    estimator = ALS(
        rank=rank,
        maxIter=max_iter,
        regParam=reg_param,
        alpha=alpha,
        userCol="User_ID", itemCol="Movie_ID", ratingCol="Rating",
        coldStartStrategy="drop", 
        seed=1
    )
    # Retrain the ALS model
    retrained_model = estimator.fit(training_ratings)

    # Predict the new user's rating for every movie not watched yet
    candidates = dataset_for_prediction(new_user_id, movie_ids, df_movies)
    predictions = retrained_model.transform(candidates)

    # The movies with the highest predicted ratings are the ones to recommend
    ranked_predictions = predictions.orderBy('prediction', ascending=False)
    top_predictions = ranked_predictions.limit(num_recommendations)
    recommended_movie_ids = top_predictions.select('Movie_ID', 'prediction')

    # Look up the year and name of the recommended movies
    same_movie = df_movies.Movie_ID == recommended_movie_ids.Movie_ID
    with_info = df_movies.join(recommended_movie_ids, same_movie)
    ranked_with_info = with_info.orderBy('prediction', ascending=False)

    return ranked_with_info.select(recommended_movie_ids['Movie_ID'], 'Year', 'Name', 'prediction')

- Make recommendations to the new user.

In [22]:
# Recommend movies to the new user
# movie_ids and num_recommendations were set in earlier cells. This call retrains
# an ALS model on the full rating dataset.
recommendations_to_new_user = recomend_movies_to_new_user(
    movie_ids=movie_ids, 
    num_recommendations=num_recommendations,
    df_movies=df_movies,
    df_ratings=df_ratings,
    # Hyperparameters of the retrained ALS model
    model_params={
        'rank': 15,
        'maxIter': 10,
        'regParam': 1e-1,
        'alpha': 1e-3
    }
)
print('The {} movies recommended to the new user'.format(num_recommendations))
# Last expression of the cell: the recommendations are displayed as a Pandas DataFrame.
recommendations_to_new_user.toPandas()

The 5 movies recommended to the new user


Unnamed: 0,Movie_ID,Year,Name,prediction
0,2102,1994,The Simpsons: Season 6,5.127633
1,3290,1974,The Godfather,5.051917
2,4427,2001,The West Wing: Season 3,5.002013
3,2172,1991,The Simpsons: Season 3,4.986874
4,2452,2001,Lord of the Rings: The Fellowship of the Ring,4.983238


- Comment: ALS might generate predictions that exceed the maximum rating value (5 in this case). This is acceptable because the predictions are only used to rank the movies for recommendation: the higher the predicted value, the more likely it is that the user will like the movie.