In [1]:
# !pip install pyspark 
# !pip install -U -q PyDrive
# !apt install openjdk-8-jdk-headless -qq

In [2]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [3]:
import pandas as pd
from pyspark.ml.feature import VectorAssembler, HashingTF, IDF, Normalizer, StopWordsRemover
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, when, trim

import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

In [4]:
movies = spark.read.csv('movies.csv',inferSchema=True, header =True)
ratings = spark.read.csv('ratings.csv',inferSchema=True, header =True)

In [5]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [6]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
# ratings = ratings.drop("timestamp")
# ratings.show()

In [8]:
# Group data by userId, count ratings
userId_ratings = ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [9]:
# Group data by userId, count ratings
movieId_ratings = ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|     50|  204|
|   2858|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



**Build up the content-based filtering algorithm with pairwise approach in TF-IDF vector space**

The approach is based on the solution here:

https://stackoverflow.com/questions/46758768/calculating-the-cosine-similarity-between-all-the-rows-of-a-dataframe-in-pyspark

In [10]:
df = movies.select("movieId", "genres").withColumn("genres", psf.split( psf.lower(movies.genres), '\|') )
df.show()
remover = StopWordsRemover(inputCol="genres", outputCol="filtered")
df = remover.transform(df)

+-------+--------------------+
|movieId|              genres|
+-------+--------------------+
|      1|[adventure, anima...|
|      2|[adventure, child...|
|      3|   [comedy, romance]|
|      4|[comedy, drama, r...|
|      5|            [comedy]|
|      6|[action, crime, t...|
|      7|   [comedy, romance]|
|      8|[adventure, child...|
|      9|            [action]|
|     10|[action, adventur...|
|     11|[comedy, drama, r...|
|     12|    [comedy, horror]|
|     13|[adventure, anima...|
|     14|             [drama]|
|     15|[action, adventur...|
|     16|      [crime, drama]|
|     17|    [drama, romance]|
|     18|            [comedy]|
|     19|            [comedy]|
|     20|[action, comedy, ...|
+-------+--------------------+
only showing top 20 rows



Compute TF-IDF:

In [11]:
hashingTF = HashingTF(inputCol="filtered", outputCol="tf")
tf = hashingTF.transform(df)
idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
tfidf = idf.transform(tf)

Compute L2 norm:

In [12]:
normalizer = Normalizer(inputCol="tfidf", outputCol="norm")
data = normalizer.transform(tfidf)

Compute matrix product (cos_similarity):

In [13]:
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType())

cos_similarity = data.alias("i").join(data.alias("j"), psf.col("i.movieId") < psf.col("j.movieId"))\
                     .select(
                         psf.col("i.movieId").alias("i"), 
                         psf.col("j.movieId").alias("j"), 
                         dot_udf("i.norm", "j.norm").alias("dot")).sort("i", "j")

**Build up the Alternating Least Square (ALS) matrix factorization model in collaborative filtering algorithm**

In [14]:
#Split training and testing data
train_data,test_data = ratings.randomSplit([0.8,0.2])

als = ALS(userCol='userId',itemCol='movieId',ratingCol='rating',coldStartStrategy="drop")


paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [1, 0.1, 0.01]) \
    .addGrid(als.rank, [10, 20]) \
    .build()


In [15]:
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="mae"),
                          numFolds=3)

In [16]:
cvModel = crossval.fit(train_data)

In [17]:
best_rank = cvModel.bestModel._java_obj.parent().getRank()
best_regParam = cvModel.bestModel._java_obj.parent().getRegParam()
best_model_params = {'rank': best_rank, 'regParam': best_regParam}

In [18]:
pred = cvModel.transform(test_data)

In [19]:
pred.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   148|    356|   4.0|1482548476| 3.8396535|
|   148|   4308|   4.0|1482548613| 3.5370862|
|   148|  50872|   3.0|1482548504| 3.7636023|
|   148| 122886|   3.5|1482548686| 3.1640668|
|   463|    296|   4.0|1145460490| 4.3548927|
|   463|    520|   4.0|1145459361| 3.0593896|
|   463|   1088|   3.5|1145460096| 2.9731326|
|   463|   2167|   3.0|1145460039| 3.7704568|
|   463|   7320|   4.0|1145460102|  3.112457|
|   471|   6016|   4.0|1496671906| 3.9387558|
|   471|   6377|   4.0|1496671863| 3.7320418|
|   471|   8636|   2.5|1496671897| 3.2422824|
|   471|   8961|   3.5|1496671872| 3.6962957|
|   471|  68157|   4.0|1496671950|  3.833927|
|   471|  92259|   4.5|1496668933| 3.5617995|
|   496|    858|   5.0|1415166629|  4.249449|
|   496|   1221|   4.0|1415166620|  4.067525|
|   496|   8640|   2.0|1415165826| 2.7506745|
|   496|  40826|   4.0|1415166154|

In [20]:
def get_movieId( movie_name, movies_data ):
    """
    return the movieId which is corresponding to the movie name

    Parameters
    ----------
    movie_name: string, the name of the movie w/ or w/o the year

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    Return
    ------
    the movieId
    """


    movieIds = []
    for movie in movie_name:
      Ids = movies_data.filter(movies_data.title.like('%{}%'.format(movie)) ).select('movieId').collect()
      movieIds = list(set(movieIds + [ row.movieId for row in Ids ]  ))
    return movieIds

**Make movie recommendation (item-based)**

In [21]:
def make_recommendation_item_based(similarity_matrix, ratings_data, movies_data,
                        fav_movie, n_recommendations, userId=-99 ):
    """
    Make top n movie recommendations. Currently, the movies the old user have watched are not excluded in the recommendation list yet.
    Parameters
    ----------

    similarity_matrix: spark Dataframe, the similarity matrix with columns 'i','j','dot'

    ratings_data: spark Dataframe, ratings data with columns 'userId', 'movieId', 'rating' 

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, the new user will be created
            if userId = -1, the latest inserted user is chosen

    """  
  
    movieIds = get_movieId(fav_movie, movies_data )

    if (userId == -99):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0] + 1
    elif (userId == -1):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0]

  
    df_similar_movieIds = similarity_matrix.filter( similarity_matrix.i.isin(movieIds) ).select('i','j','dot') 
  
    df_similar_movieIds = df_similar_movieIds.filter( ~similarity_matrix.j.isin(movieIds) ).select('i','j','dot')

    df_similar_movieIds = df_similar_movieIds.groupBy('j').agg( {'dot':'max'} ).select(col('j').alias('movieId'), col('max(dot)').alias('dot_max'))
 
    topn_predictions = df_similar_movieIds.orderBy('dot_max', ascending=False).limit(10)

    Ids = topn_predictions.select('movieId').collect()
    Ids = [ row.movieId for row in Ids ]
    topn_movies = movies_data.filter( movies_data.movieId.isin(Ids) ).select( 'title' )

#    The following line is better, but it will produce error message...
#    movieId#14 are ambiguous. It's probably because you joined several Datasets together, and some of these......
#    topn_movies = movies_data.join( topn_predictions, topn_predictions.movieId == movies_data.movieId ).orderBy( 'dot_max', ascending=False ).select( 'title' )

    return [row.title for row in topn_movies.collect()]

**Make movie recommendation (user-based)**

In [22]:
def make_recommendation_user_based(best_model_params, ratings_data, movies_data,
                        fav_movie, n_recommendations, userId=-99 ):
    """
    make top n movie recommendations
    Parameters
    ----------

    best_model_params: dict, the best parameters of the model from the CrossValidator

    ratings_data: spark Dataframe, ratings data with columns 'userId', 'movieId', 'rating' 

    movies_data: spark Dataframe, movies data with columns 'movieId','title'

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, the new user will be created
            if userId = -1, the latest inserted user is chosen

    """

    movieIds = get_movieId(fav_movie, movies_data )

    if (userId == -99):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0] + 1
    elif (userId == -1):
      userId = ratings_data.agg({"userId": "max"}).collect()[0][0]

    max_rating = ratings_data.agg({"rating": "max"}).collect()[0][0]

    # build up the train data, which is the original data + the new inserted data.
    # We assume that the inserted favorate movie has the highest rating.
    train_data = ratings_data
    for movieId in movieIds:
      new_rows = spark.createDataFrame([(userId,movieId,max_rating,0)], ['userId', 'movieId', 'rating', 'timestamp'])
      train_data = ratings_data.union(new_rows)

    # train best ALS
    als = ALS(userCol='userId',itemCol='movieId',ratingCol='rating', \
              rank=best_model_params.get('rank'), \
              regParam=best_model_params.get('regParam'))

    model = als.fit( train_data )
    df_newuser = movies_data.filter(~movies_data.movieId.isin(movieIds)).select('movieId').withColumn("userId", lit(userId))

    predictions = model.transform(df_newuser)

    def to_null(c):
      return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))
    
    predictions = predictions.select([to_null(c).alias(c) for c in predictions.columns]).na.drop()

    topn_predictions = predictions.orderBy('prediction', ascending=False).limit(n_recommendations)
    topn_ids = topn_predictions.select('userId')
    topn_movies = movies_data.join( topn_predictions, topn_predictions.movieId == movies_data.movieId ).orderBy( 'prediction', ascending=False ).select( 'title' )

    return [row.title for row in topn_movies.collect()]
    

In [23]:
my_favorite_movies = [input("Movie: ")]

Movie: Iron Man


In [24]:
#my_favorite_movies = ['Iron Man']
# my_favorite_movies = ['Genius Party']
# get recommends
n_recommendations = 10
recommends_item_based = make_recommendation_item_based(similarity_matrix = cos_similarity, ratings_data = ratings, movies_data = movies,
                        fav_movie = my_favorite_movies, n_recommendations = n_recommendations )

print("--------------Search based on similarity between movies--------------------------------------")
print('The users like' , my_favorite_movies , 'also like:')
for i, title in enumerate(recommends_item_based):
    print(i+1, title)
if( len(recommends_item_based) < n_recommendations ):
  print("Sadly, we couldn't offer so many recommendations :(")

recommends_user_based = make_recommendation_user_based(best_model_params = best_model_params, ratings_data = ratings, movies_data = movies,
                        fav_movie = my_favorite_movies, n_recommendations = n_recommendations )

print("--------------Search based on similarity between user's preference--------------------------------------")
print('The users like' , my_favorite_movies , 'also like:')
for i, title in enumerate(recommends_user_based):
    print(i+1, title)
if( len(recommends_user_based) < n_recommendations ):
  print("Sadly, we couldn't offer so many recommendations :(")

--------------Search based on similarity between movies--------------------------------------
The users like ['Iron Man'] also like:
1 G.I. Joe: Retaliation (2013)
2 Nasu: Summer in Andalusia (2003)
3 Genius Party (2007)
4 Gifted (2017)
5 The Death of Louis XIV (2016)
6 Investigation Held by Kolobki (1986)
7 Bliss (2012)
8 Fireworks, Should We See It from the Side or the Bottom? (2017)
9 Wonder (2017)
10 Love Live! The School Idol Movie (2015)
--------------Search based on similarity between user's preference--------------------------------------
The users like ['Iron Man'] also like:
1 On the Beach (1959)
2 Saving Face (2004)
3 Strictly Sexual (2008)
4 Babes in Toyland (1934)
5 Raiders of the Lost Ark: The Adaptation (1989)
6 Adam's Rib (1949)
7 Imposter, The (2012)
8 Star Wars: Episode VII - The Force Awakens (2015)
9 Belle époque (1992)
10 Cosmos
