<a href="https://colab.research.google.com/github/JonahFlateman/dsc-project-template/blob/master/mod4_project-colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's start by using surprise to test some simple models.

In [2]:
import pandas as pd
df = pd.read_csv('/content/ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Drop unnecessary columns
new_df = df.drop(columns='timestamp')

Using Reader and Dataset classes to transform dataset for surprise compatability.

In [5]:
!pip install surprise
from surprise import Reader, Dataset
# read in values as Surprise dataset
reader = Reader()
data = Dataset.load_from_df(new_df, reader)

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 9.0 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617797 sha256=497bd1cd4218ff368804aa23e60e91755e31c4d7b3a5699049e8aeaadec3411b
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [6]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


Determine the best model.

In [7]:
# importing relevant libraries
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [8]:
## Perform a gridsearch with SVD
# ⏰ This cell may take several minutes to run
params = {'n_factors': [25, 50, 100],
         'reg_all': [.02, .05, .1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [9]:
# print out optimal parameters for SVD after GridSearch
g_s_svd.best_params

{'mae': {'n_factors': 50, 'reg_all': 0.05},
 'rmse': {'n_factors': 50, 'reg_all': 0.05}}

In [10]:
g_s_svd.best_score

{'mae': 0.6682968098904623, 'rmse': 0.8687371811637515}

In [11]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [12]:
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.97039311, 0.96992426, 0.97735093, 0.97218108, 0.97321747]))
('test_mae', array([0.75227116, 0.74829009, 0.75281303, 0.75077962, 0.75036814]))
('fit_time', (0.6906659603118896, 0.8347921371459961, 0.6917219161987305, 0.7299394607543945, 0.4389381408691406))
('test_time', (2.524146556854248, 2.445805072784424, 2.436784267425537, 2.356236457824707, 1.2534449100494385))


In [13]:
# print out the average RMSE score for the test set
print(np.mean(cv_knn_basic['test_rmse']))

0.9726133676860197


In [14]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_baseline = cross_validate(knn_baseline, data, n_jobs=-1)

In [15]:
# print out the average score for the test set
for i in cv_knn_baseline.items():
    print(i)

('test_rmse', array([0.86469626, 0.87289336, 0.88066641, 0.88115254, 0.88188332]))
('test_mae', array([0.66265635, 0.66645435, 0.67234064, 0.67279629, 0.67257353]))
('fit_time', (0.775249719619751, 1.2807886600494385, 1.1852617263793945, 1.314445972442627, 0.8386940956115723))
('test_time', (3.652033567428589, 3.8382132053375244, 3.9406940937042236, 3.3116025924682617, 2.0067360401153564))


In [16]:
print(np.mean(cv_knn_baseline['test_rmse']))

0.8762583775087153


The best model is SVD with n_factors=50 and regularization rate of 0.05.

Let's make recommendations using movie titles.

In [17]:
df_movies = pd.read_csv('/content/movies.csv')

In [18]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
svd = SVD(n_factors=100, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f69ed5fa950>

In [20]:
svd.predict(3, 6)

Prediction(uid=3, iid=6, r_ui=None, est=3.074022124400607, details={'was_impossible': False})

In [21]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie['title'])
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [22]:
movie_rater(df_movies, 5, 'Adventure')

6805    Hogfather (Terry Pratchett's Hogfather) (2006)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
6573    Tekkonkinkreet (Tekkon kinkurîto) (2006)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
5309    Harold and Kumar Go to White Castle (2004)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
3336    Land Before Time, The (1988)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
7122    Green Lantern: First Flight (2009)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3


[{'movieId': 60818, 'rating': '3', 'userId': 1000},
 {'movieId': 55167, 'rating': '3', 'userId': 1000},
 {'movieId': 8807, 'rating': '4', 'userId': 1000},
 {'movieId': 4519, 'rating': '4', 'userId': 1000},
 {'movieId': 71129, 'rating': '3', 'userId': 1000}]

In [23]:
user_rating = movie_rater(df_movies, 5, 'Adventure')

6879    Futurama: Bender's Game (2008)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
6021    Beowulf & Grendel (2005)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
9246    Camino (2016)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
7182    Battlestar Galactica: The Plan (2009)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
1509    Candleshoe (1977)
Name: title, dtype: object
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2


In [24]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = new_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [25]:
# train a model using the new combined DataFrame
svd = SVD(n_factors=50, reg_all=0.05)
svd.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f69ed5fa790>

In [26]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in new_df['movieId'].unique():
    list_of_movies.append((m_id, svd.predict(1000, m_id)[3]))

In [27]:
# order the predictions from highest to lowest rated

ranked_movies = sorted(list_of_movies, key=lambda x: x[1], reverse=True)

In [28]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation #', idx+1, ': ', title, '\n')
            n -=1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movies,5)

Recommendation # 1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation # 2 :  863    Monty Python and the Holy Grail (1975)
Name: title, dtype: object 

Recommendation # 3 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation # 4 :  686    Rear Window (1954)
Name: title, dtype: object 

Recommendation # 5 :  210    Hoop Dreams (1994)
Name: title, dtype: object 



In [29]:

!apt update

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Waiting for headers] [1 [0m[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Waiting for headers] [Co[0m                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Waiting for headers] [Co[0m[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
[33m0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)[0m                                                                               Ign:4 https://develope

In [30]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

In [31]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.8.0-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

In [32]:
import findspark
findspark.init()

In [33]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [34]:
# import necessary libraries
from pyspark import SparkContext
from pyspark.sql import SparkSession

# instantiate SparkSession object
spark = SparkSession.builder.master("local").getOrCreate()

In [36]:
# read in the dataset into pyspark DataFrame
movie_ratings = spark.read.csv('/content/ratings.csv', header='true', inferSchema='true')

In [37]:
def get_mat_sparsity(ratings):
    # Count the total number of ratings in the dataset
    count_nonzero = ratings.select("rating").count()

    # Count the number of distinct userIds and distinct movieIds
    total_elements = ratings.select("userId").distinct().count() * ratings.select("movieId").distinct().count()

    # Divide the numerator by the denominator
    sparsity = (1.0 - (count_nonzero *1.0)/total_elements)*100
    print("The ratings dataframe is ", "%.2f" % sparsity + "% sparse.")
    
get_mat_sparsity(movie_ratings)

The ratings dataframe is  98.30% sparse.


In [39]:
# Group data by userId, count ratings
userId_ratings = movie_ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [40]:
# Group data by userId, count ratings
movieId_ratings = movie_ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|     50|  204|
|   2858|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



In [41]:
movie_ratings.dtypes

[('userId', 'int'),
 ('movieId', 'int'),
 ('rating', 'double'),
 ('timestamp', 'int')]

In [42]:
movie_ratings = movie_ratings.drop('timestamp')

In [43]:
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS 

# split into training and testing sets
(training, test) = movie_ratings.randomSplit([.8, .2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, rank=4, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')

# fit the ALS model to the training set
model=als.fit(training)

In [44]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(rmse)

1.0032139090218677


In [45]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# initialize the ALS model
als_model = ALS(userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')

# create the parameter grid              
params = ParamGridBuilder().addGrid(als_model.regParam, [.01, .05, .1, .15]).addGrid(als_model.rank, [10, 50, 100, 150]).build()

# instantiating crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params, evaluator=evaluator, parallelism=4)
best_model = cv.fit(movie_ratings)

# We see the best model has a rank of 10, so we will use that in our future models with this dataset
best_model.bestModel.rank

10

In [None]:
movie_titles = spark.read.csv('movies.csv', header='true', inferSchema='true')

movie_titles.head(5)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(movieId=2, title='Jumanji (1995)', genres='Adventure|Children|Fantasy'),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance'),
 Row(movieId=4, title='Waiting to Exhale (1995)', genres='Comedy|Drama|Romance'),
 Row(movieId=5, title='Father of the Bride Part II (1995)', genres='Comedy')]

In [None]:
def name_retriever(movie_id, movie_title_df):
    return movie_title_df.where(movie_title_df.movieId == movie_id).take(1)[0]['title']

In [None]:
print(name_retriever(1022, movie_titles))

Cinderella (1950)


In [None]:
users = movie_ratings.select(als.getUserCol()).distinct().limit(1)
userSubsetRecs = model.recommendForUserSubset(users, 10)
recs = userSubsetRecs.take(1)

In [None]:
# use indexing to obtain the movie id of top predicted rated item
first_recommendation = recs[0]['recommendations'][0][0]

# use the name retriever function to get the values
name_retriever(first_recommendation,movie_titles)

'Farewell My Concubine (Ba wang bie ji) (1993)'

In [None]:
recommendations = model.recommendForAllUsers(5)
recommendations.where(recommendations.userId == 3).collect()

[Row(userId=3, recommendations=[Row(movieId=4248, rating=8.758819580078125), Row(movieId=3223, rating=7.480904579162598), Row(movieId=99764, rating=7.315658092498779), Row(movieId=93563, rating=6.8287787437438965), Row(movieId=1468, rating=6.746992111206055)])]

In [None]:
def new_user_recs(user_id, new_ratings, rating_df, movie_title_df, num_recs):
    # turn the new_recommendations list into a spark DataFrame
  new_user_ratings = spark.createDataFrame(new_ratings, rating_df.columns)
    
    # combine the new ratings df with the rating_df
  movie_ratings_combined = rating_df.union(new_user_ratings)
    
    # create an ALS model and fit it
  als = ALS(maxIter=5, rank=50, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')
  model = als.fit(movie_ratings_combined)
    
    # make recommendations for all users using the recommendForAllUsers method
  recommendations = model.recommendForAllUsers(num_recs)
    
    # get recommendations specifically for the new user that has been added to the DataFrame
  recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)

  for ranking, (movie_id, rating) in enumerate(recs_for_user[0]['recommendations']):
    movie_string = name_retriever(movie_id, movie_title_df)
    print('Recommendation {}: {} | predicted score: {}'.format(ranking+1, movie_string, rating))

In [None]:
# try out your function with the movies listed above
user_id = 1000
user_ratings_1 = [(user_id, 3000, 5), (user_id, 2500, 5), (user_id, 2000, 4), (user_id, 1757, 6)]
new_user_recs(user_id, user_ratings_1, movie_ratings, movie_titles, 10)

Recommendation 1: Fallen Angels (Duo luo tian shi) (1995) | predicted score: 5.992415428161621
Recommendation 2: Clockwork Orange, A (1971) | predicted score: 5.332775115966797
Recommendation 3: Hustler, The (1961) | predicted score: 5.221590518951416
Recommendation 4: Manhattan (1979) | predicted score: 5.212000846862793
Recommendation 5: Monty Python and the Holy Grail (1975) | predicted score: 5.210682392120361
Recommendation 6: Blue Velvet (1986) | predicted score: 5.173949718475342
Recommendation 7: L.A. Confidential (1997) | predicted score: 5.124312400817871
Recommendation 8: Seven Samurai (Shichinin no samurai) (1954) | predicted score: 5.099935531616211
Recommendation 9: Ran (1985) | predicted score: 5.080130100250244
Recommendation 10: Godfather, The (1972) | predicted score: 5.003494739532471
