In [None]:
from google.colab import files
files.upload()
# upload movies.csv ratings.csv

In [3]:
!pip install pyspark 
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=f3475bddb0c16621c99f95616e86b73be01faa365ee9ba1229169fb109c4506f
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
openjdk-8-jdk-headless is already the newest version (8u372-ga~us1-0ubuntu1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [4]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [5]:
import pandas as pd
from pyspark.ml.feature import VectorAssembler, HashingTF, IDF, Normalizer, StopWordsRemover
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, when, trim

import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

In [6]:
movies = spark.read.csv('movies.csv',inferSchema=True, header =True)
ratings = spark.read.csv('ratings.csv',inferSchema=True, header =True)

**Build up the content-based filtering algorithm with pairwise approach in TF-IDF vector space**

The approach is based on the solution here:

https://stackoverflow.com/questions/46758768/calculating-the-cosine-similarity-between-all-the-rows-of-a-dataframe-in-pyspark

In [7]:
df = movies.select("movieId", "genres").withColumn("genres", psf.split( psf.lower(movies.genres), '\|') )
remover = StopWordsRemover(inputCol="genres", outputCol="filtered")
df = remover.transform(df)

Compute TF-IDF:

In [8]:
hashingTF = HashingTF(inputCol="filtered", outputCol="tf")
tf = hashingTF.transform(df)
idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
tfidf = idf.transform(tf)

Compute L2 norm:

In [9]:
normalizer = Normalizer(inputCol="tfidf", outputCol="norm")
data = normalizer.transform(tfidf)

Compute matrix product (cos_similarity):

In [10]:
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType())

cos_similarity = data.alias("i").join(data.alias("j"), psf.col("i.movieId") < psf.col("j.movieId"))\
                     .select(
                         psf.col("i.movieId").alias("i"), 
                         psf.col("j.movieId").alias("j"), 
                         dot_udf("i.norm", "j.norm").alias("dot")).sort("i", "j")

**Build up the Alternating Least Square (ALS) matrix factorization model in collaborative filtering algorithm**

In [11]:
#Split training and testing data
train_data,test_data = ratings.randomSplit([0.8,0.2])

als = ALS(userCol='userId',itemCol='movieId',ratingCol='rating',coldStartStrategy="drop")


paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [1, 0.1, 0.01]) \
    .addGrid(als.rank, [10, 20]) \
    .build()


In [12]:
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="mae"),
                          numFolds=3)

In [13]:
cvModel = crossval.fit(train_data)

In [14]:
best_rank = cvModel.bestModel._java_obj.parent().getRank()
best_regParam = cvModel.bestModel._java_obj.parent().getRegParam()
best_model_params = {'rank': best_rank, 'regParam': best_regParam}

In [15]:
pred = cvModel.transform(test_data)

In [16]:
def get_movieId( movie_name, movies_data ):
    """
    return the movieId which is corresponding to the movie name

    Parameters
    ----------
    movie_name: string, the name of the movie w/ or w/o the year

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    Return
    ------
    the movieId
    """


    movieIds = []
    for movie in movie_name:
      Ids = movies_data.filter(movies_data.title.like('%{}%'.format(movie)) ).select('movieId').collect()
      movieIds = list(set(movieIds + [ row.movieId for row in Ids ]  ))
    return movieIds

**Make movie recommendation (item-based)**

In [17]:
def make_recommendation_item_based(similarity_matrix, ratings_data, movies_data,
                        fav_movie, n_recommendations, userId=-99 ):
    """
    Make top n movie recommendations. Currently, the movies the old user have watched are not excluded in the recommendation list yet.
    Parameters
    ----------

    similarity_matrix: spark Dataframe, the similarity matrix with columns 'i','j','dot'

    ratings_data: spark Dataframe, ratings data with columns 'userId', 'movieId', 'rating' 

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, the new user will be created
            if userId = -1, the latest inserted user is chosen

    """  
  
    movieIds = get_movieId(fav_movie, movies_data )

    if (userId == -99):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0] + 1
    elif (userId == -1):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0]

  
    df_similar_movieIds = similarity_matrix.filter( similarity_matrix.i.isin(movieIds) ).select('i','j','dot') 
  
    df_similar_movieIds = df_similar_movieIds.filter( ~similarity_matrix.j.isin(movieIds) ).select('i','j','dot')

    df_similar_movieIds = df_similar_movieIds.groupBy('j').agg( {'dot':'max'} ).select(col('j').alias('movieId'), col('max(dot)').alias('dot_max'))
 
    topn_predictions = df_similar_movieIds.orderBy('dot_max', ascending=False).limit(10)

    Ids = topn_predictions.select('movieId').collect()
    Ids = [ row.movieId for row in Ids ]
    topn_movies = movies_data.filter( movies_data.movieId.isin(Ids) ).select( 'title' )

#    The following line is better, but it will produce error message...
#    movieId#14 are ambiguous. It's probably because you joined several Datasets together, and some of these......
#    topn_movies = movies_data.join( topn_predictions, topn_predictions.movieId == movies_data.movieId ).orderBy( 'dot_max', ascending=False ).select( 'title' )

    return [row.title for row in topn_movies.collect()]

**Make movie recommendation (user-based)**

In [18]:
def make_recommendation_user_based(best_model_params, ratings_data, movies_data,
                        fav_movie, n_recommendations, userId=-99 ):
    """
    make top n movie recommendations
    Parameters
    ----------

    best_model_params: dict, the best parameters of the model from the CrossValidator

    ratings_data: spark Dataframe, ratings data with columns 'userId', 'movieId', 'rating' 

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, the new user will be created
            if userId = -1, the latest inserted user is chosen

    """

    movieIds = get_movieId(fav_movie, movies_data )

    if (userId == -99):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0] + 1
    elif (userId == -1):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0]

    max_rating = ratings_data.agg({"rating": "max"}).collect()[0][0]

    # build up the train data, which is the original data + the new inserted data.
    # We assume that the inserted favorate movie has the highest rating.
    train_data = ratings_data
    for movieId in movieIds:
      new_rows = spark.createDataFrame([(userId,movieId,max_rating,0)], ['userId', 'movieId', 'rating', 'timestamp'])
      train_data = ratings_data.union(new_rows)

    # train best ALS
    als = ALS(userCol='userId',itemCol='movieId',ratingCol='rating', \
              rank=best_model_params.get('rank'), \
              regParam=best_model_params.get('regParam'))

    model = als.fit( train_data )
    df_newuser = movies_data.filter(~movies_data.movieId.isin(movieIds)).select('movieId').withColumn("userId", lit(userId))

    predictions = model.transform(df_newuser)

    def to_null(c):
      return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))
    
    predictions = predictions.select([to_null(c).alias(c) for c in predictions.columns]).na.drop()

    topn_predictions = predictions.orderBy('prediction', ascending=False).limit(n_recommendations)
    topn_ids = topn_predictions.select('userId')
    topn_movies = movies_data.join( topn_predictions, topn_predictions.movieId == movies_data.movieId ).orderBy( 'prediction', ascending=False ).select( 'title' )

    return [row.title for row in topn_movies.collect()]
    

In [20]:
#my_favorite_movies = ['Iron Man']
my_favorite_movies = ['Genius Party']
# get recommends
n_recommendations = 10
recommends_item_based = make_recommendation_item_based(similarity_matrix = cos_similarity, ratings_data = ratings, movies_data = movies,
                        fav_movie = my_favorite_movies, n_recommendations = n_recommendations )

print("--------------Search based on similarity between movies--------------------------------------")
print('The users like' , my_favorite_movies , 'also like:')
for i, title in enumerate(recommends_item_based):
    print(i+1, title)
if( len(recommends_item_based) < n_recommendations ):
  print("Sadly, we couldn't offer so many recommendations :(")

recommends_user_based = make_recommendation_user_based(best_model_params = best_model_params, ratings_data = ratings, movies_data = movies,
                        fav_movie = my_favorite_movies, n_recommendations = n_recommendations )

print("--------------Search based on similarity between user's preference--------------------------------------")
print('The users like' , my_favorite_movies , 'also like:')
for i, title in enumerate(recommends_user_based):
    print(i+1, title)
if( len(recommends_user_based) < n_recommendations ):
  print("Sadly, we couldn't offer so many recommendations :(")

--------------Search based on similarity between movies--------------------------------------
The users like ['Genius Party'] also like:
1 Piper (2016)
2 The Red Turtle (2016)
3 Winnie the Pooh Goes Visiting (1971)
4 A Plasticine Crow (1981)
5 Cheburashka (1971)
6 Travels of an Ant (1983)
7 Wolf and Calf (1984)
8 LEGO DC Super Hero Girls: Brain Drain (2017)
9 Bunny (1998)
10 Love Live! The School Idol Movie (2015)
--------------Search based on similarity between user's preference--------------------------------------
The users like ['Genius Party'] also like:
1 Wallace & Gromit: The Best of Aardman Animation (1996)
2 Dylan Moran: Monster (2004)
3 Bill Hicks: Revelations (1993)
4 Mulholland Dr. (1999)
5 Eddie Izzard: Dress to Kill (1999)
6 On the Beach (1959)
7 Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
8 Fist of Legend (Jing wu ying xiong) (1994)
9 Grand Day Out with Wallace and Gromit, A (1989)
10 Saving Face (2004)
