# Practice PS06: Recommendations engines (interactions-based)

<font size="+2" color="blue">Additional results: surprise library</font>

Author: <font color="blue">Manvir Kaur Singh</font>

E-mail: <font color="blue">manvir.kaur01@estudiant.upf.edu</font>

Date: <font color="blue">15/11/2023</font>

# 1. The Movies dataset

# 1.1. Load the input files

In [1]:
# Leave this code as-is

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from math import*
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Leave this code as-is

FILENAME_MOVIES = "movies-2000s.csv"
FILENAME_RATINGS = "ratings-2000s.csv"
FILENAME_TAGS = "tags-2000s.csv"

In [3]:
# Leave this code as-is

movies = pd.read_csv(FILENAME_MOVIES, 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movie_id', 'title', 'genres'])
display(movies.head(5))

ratings_raw = pd.read_csv(FILENAME_RATINGS, 
                    sep=',', 
                    encoding='latin-1',
                    engine='python',
                    names=['user_id', 'movie_id', 'rating'])
display(ratings_raw.head(5))

Unnamed: 0,movie_id,title,genres
0,2769,"Yards, The (2000)",Crime|Drama
1,3177,Next Friday (2000),Comedy
2,3190,Supernova (2000),Adventure|Sci-Fi|Thriller
3,3225,Down to You (2000),Comedy|Romance
4,3228,Wirey Spindell (2000),Comedy


Unnamed: 0,user_id,movie_id,rating
0,4,1,3.0
1,4,260,3.5
2,4,296,4.0
3,4,541,4.5
4,4,589,4.0


# 1.2. Merge the data into a single dataframe

In [4]:
ratings = pd.merge(movies, ratings_raw, how='inner', on='movie_id')
display(ratings.head())

Unnamed: 0,movie_id,title,genres,user_id,rating
0,2769,"Yards, The (2000)",Crime|Drama,1115,4.0
1,2769,"Yards, The (2000)",Crime|Drama,1209,2.0
2,2769,"Yards, The (2000)",Crime|Drama,2004,3.0
3,2769,"Yards, The (2000)",Crime|Drama,2502,4.0
4,2769,"Yards, The (2000)",Crime|Drama,2827,4.0


In [5]:
# LEAVE AS-IS

# For testing, this should print:
# movie_id:  4993, title: Lord of the Rings: The Fellowship of the Ring, The (2001)
# movie_id:  5952, title: Lord of the Rings: The Two Towers, The (2002)
# movie_id:  7153, title: Lord of the Rings: The Return of the King, The (2003)
def find_movies(word, movies):
    for index, row in movies.iterrows():
        if word in row['title']:
            print("movie_id: ", row['movie_id'], "title: ", row['title'])

find_movies("Lord of the Rings", movies)

movie_id:  4993 title:  Lord of the Rings: The Fellowship of the Ring, The (2001)
movie_id:  5952 title:  Lord of the Rings: The Two Towers, The (2002)
movie_id:  7153 title:  Lord of the Rings: The Return of the King, The (2003)


In [6]:
# LEAVE AS-IS

def get_title(movie_id, movies):
    return movies[movies['movie_id'] == movie_id].title.iloc[0]

In [7]:
# LEAVE AS-IS

# For testing, should print "Lord of the Rings: The Return of the King, The (2003)")
print(get_title(7153, movies))

Lord of the Rings: The Return of the King, The (2003)


## 1.3. Count unique registers

In [8]:
print(len(ratings['user_id'].unique()))
print(len(ratings['title'].unique()))
print(len(movies['movie_id'].unique()))

12676
2049
33168


# 2. Item-based Collaborative Filtering

## 2.1. Data pre-processing

In [9]:
rated_movies = ratings.drop(['genres'], axis=1)
rated_movies.head(10)

Unnamed: 0,movie_id,title,user_id,rating
0,2769,"Yards, The (2000)",1115,4.0
1,2769,"Yards, The (2000)",1209,2.0
2,2769,"Yards, The (2000)",2004,3.0
3,2769,"Yards, The (2000)",2502,4.0
4,2769,"Yards, The (2000)",2827,4.0
5,2769,"Yards, The (2000)",6629,1.0
6,2769,"Yards, The (2000)",12435,4.0
7,2769,"Yards, The (2000)",13873,3.0
8,2769,"Yards, The (2000)",14799,3.0
9,2769,"Yards, The (2000)",15691,2.5


In [10]:
ratings_summary = rated_movies[['movie_id', 'title']].copy()

ratings_summary = ratings_summary.groupby('movie_id').first()

ratings_summary['ratings_mean'] = rated_movies.groupby('movie_id')['rating'].mean()
ratings_summary['ratings_count'] = rated_movies.groupby('movie_id')['user_id'].count()

ratings_summary.head()

Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2769,"Yards, The (2000)",3.122549,102
3177,Next Friday (2000),2.824,125
3190,Supernova (2000),2.395683,139
3225,Down to You (2000),2.577273,110
3228,Wirey Spindell (2000),2.5,2


In [11]:
popular = ratings_summary[ratings_summary.ratings_count>=2500]
top_rated = popular.sort_values(by='ratings_mean', ascending=False)

print("Top 5 highest-rated movies with at least 2500 ratings:")
top_rated.head(5)

Top 5 highest-rated movies with at least 2500 ratings:


Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4226,Memento (2000),4.158512,4476
4973,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le)...",4.097234,3687
4993,"Lord of the Rings: The Fellowship of the Ring,...",4.09253,5944
7153,"Lord of the Rings: The Return of the King, The...",4.08396,5449
5952,"Lord of the Rings: The Two Towers, The (2002)",4.083869,5449


In [12]:
popular = ratings_summary[ratings_summary.ratings_count>=3]
top_rated = popular.sort_values(by='ratings_mean', ascending=False)

print("Top 5 highest-rated movies with at least 2500 ratings:")
top_rated.head(5)

Top 5 highest-rated movies with at least 2500 ratings:


Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5082,"Rumor of Angels, A (2000)",4.666667,6
27764,2LDK (2003),4.5,3
31954,Beautiful City (Shah-re ziba) (2004),4.4,5
5224,Promises (2001),4.388889,18
6775,Life and Debt (2001),4.333333,3


<font size="+1" color="red">What is the difference? How would you explain this?</font>


The average ratings of the top 5 movies can differ. If we lower the threshold for the minimum number of ratings a movie must have, the average rating tends to increase. This is because movies with very few ratings can be highly influenced by individual opinions, leading to a biased average. On the contrary, when a movie receives ratings from a large number of users, the average becomes more robust and trustworthy.

## 2.2. Compute the user-movie matrix

In [13]:
user_movie = rated_movies.pivot_table(index = 'user_id', columns = 'movie_id', values = 'rating')
user_movie.head()



movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,
62,,,,,,,,4.5,,,...,,,,,,,,,,3.5
63,,,,,,,,,,,...,,,,,,,,,,
95,,,,,,,,3.5,,,...,,,,,,,,,,


<font size="+1" color="red">Brief commentary indicating why do you think the "user_movie" matrix has so many "NaN" values. How do we call this characteristic of user ratings in recommender systems?</font>

Since a user is likely to have watched only a small fraction of the total number of movies available, the matrix representing user-movie interactions is characterized as sparse.

# 2.3. Explore some correlations in the user-movie matrix

In [14]:
get_title(4993, rated_movies)

'Lord of the Rings: The Fellowship of the Ring, The (2001)'

In [15]:
id_pivot = rated_movies.loc[rated_movies['title'] == 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'movie_id'].iloc[0]
id_m1 = rated_movies.loc[rated_movies['title'] == 'Finding Nemo (2003)', 'movie_id'].iloc[0]
id_m2 = rated_movies.loc[rated_movies['title'] == 'Talk to Her (Hable con Ella) (2002)', 'movie_id'].iloc[0]

s1 = user_movie[id_pivot].dropna()
s2 = user_movie[id_m1].dropna()
s3 = user_movie[id_m2].dropna()

ratings3 = pd.concat([s1,s2,s3], axis=1)

ratings3.dropna(inplace=True)

ratings3.head(10)

Unnamed: 0_level_0,4993,6377,5878
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
859,3.0,4.0,5.0
1229,4.0,4.0,4.5
1281,3.0,2.5,3.0
1722,5.0,4.5,4.0
2004,4.5,3.0,3.5
4590,4.0,4.0,2.0
5052,2.0,4.0,4.0
5144,5.0,5.0,5.0
6497,3.5,3.5,3.5
8369,3.0,4.0,4.5


In [16]:
print(ratings3[id_pivot].corr(ratings3[id_m1]))
print(ratings3[id_pivot].corr(ratings3[id_m2]))
print(ratings3[id_m1].corr(ratings3[id_m2]))

0.3840549071566764
0.16240502267155424
0.2042645045941218


<font size="+1" color="red">Brief commentary on the correlations you find.</font>

The highest similarity is observed between "Lord of the Rings" and "Finding Nemo," which may be attributed to the adventurous elements in both films, despite their distinct differences.

Following closely, "Finding Nemo" and "Talk to Her" exhibit the next highest similarity score, likely influenced by the dramatic aspects shared between these two films.

In contrast, "Lord of the Rings" and "Talk to Her" appear to lack significant commonalities, explaining the lower similarity score of 0.16.

In [17]:
correlations = user_movie.corrwith(user_movie[id_pivot], axis=0, drop=False, method='pearson', numeric_only=False)
correlations = correlations.dropna()
display(correlations)

#sorted
similar_to_pivot = correlations.sort_values(ascending=False)
display(similar_to_pivot)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movie_id
2769    -0.127515
3177     0.093221
3190     0.041206
3225     0.126600
3239     0.338378
           ...   
33154    0.318255
33158    0.228214
33162    0.285377
33164    0.037130
33166    0.197344
Length: 1868, dtype: float64

movie_id
8691     1.0
27742    1.0
32788    1.0
5819     1.0
32935    1.0
        ... 
6292    -1.0
8837    -1.0
31610   -1.0
6965    -1.0
5467    -1.0
Length: 1868, dtype: float64

In [18]:
corr_with_pivot = pd.DataFrame(similar_to_pivot, columns=['corr'])
corr_with_pivot['title'] = ratings_summary['title']
corr_with_pivot['ratings_mean'] = ratings_summary['ratings_mean']
corr_with_pivot['ratings_count'] = ratings_summary['ratings_count']

popular_movies = corr_with_pivot[corr_with_pivot['ratings_count'] > 500]

top_10_corr_movies = popular_movies.sort_values(by='corr', ascending=False).head(10)

top_10_corr_movies.head(10)

Unnamed: 0_level_0,corr,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4993,1.0,"Lord of the Rings: The Fellowship of the Ring,...",4.09253,5944
5952,0.892103,"Lord of the Rings: The Two Towers, The (2002)",4.083869,5449
7153,0.892073,"Lord of the Rings: The Return of the King, The...",4.08396,5449
6539,0.377599,Pirates of the Caribbean: The Curse of the Bla...,3.779241,3950
8368,0.340934,Harry Potter and the Prisoner of Azkaban (2004),3.809971,2397
3578,0.337667,Gladiator (2000),3.95105,4811
3793,0.329686,X-Men (2000),3.556436,3535
4896,0.31918,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.678509,2843
3624,0.307471,Shanghai Noon (2000),3.297443,1017
31658,0.303898,Howl's Moving Castle (Hauru no ugoku shiro) (2...,4.064417,1141


<font size="+1" color="red">Brief commentary about the movies you see on this list. What happens if you set the condition on *ratings_count* to a much larger value? What happens if you set it to a much smaller value?</font>


Correlated films are films of the same type. If we set ratings_count to a much higher value the result will be more reliable because we will have a larger sample. And if we set it to a smaller value it won't be as accurate as the sample size will be small.

# 2.4. Implement the item-based recommendations

In [19]:
item_similarity = user_movie.corr()

item_similarity.head(10)

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,0.115068,0.033721,-0.232268,,-0.5,0.197011,0.199514,0.250873,,...,0.37998,0.87831,,,,0.248126,0.1806095,-0.08557,-0.408248,0.105671
3177,0.115068,1.0,0.30382,0.559533,,,0.331191,0.167918,1.0,,...,0.546119,0.735767,-1.0,,,-0.221382,0.3174747,0.014735,0.661989,0.185654
3190,0.033721,0.30382,1.0,0.636361,,-0.014315,0.146042,0.394293,-0.290397,,...,0.246183,0.632026,,,,0.378181,0.1709261,0.022444,-0.07336,-0.054114
3225,-0.232268,0.559533,0.636361,1.0,,0.578414,0.347716,0.263671,-0.250313,,...,-0.300376,0.318377,,,,0.480173,0.7503063,0.536828,0.753141,0.098748
3228,,,,,1.0,,,,,,...,,,,,,,,,,
3239,-0.5,,-0.014315,0.578414,,1.0,0.180846,1.0,,,...,,,,,,1.0,,1.0,0.636285,0.8882
3273,0.197011,0.331191,0.146042,0.347716,,0.180846,1.0,0.105735,0.154371,,...,0.006774,0.409968,1.0,,,0.088405,0.07516779,0.143492,0.466705,0.084202
3275,0.199514,0.167918,0.394293,0.263671,,1.0,0.105735,1.0,0.485071,,...,-0.011426,0.279624,,,,0.075827,0.2994603,0.187713,0.285584,0.225317
3276,0.250873,1.0,-0.290397,-0.250313,,,0.154371,0.485071,1.0,,...,,0.29277,,,,0.0,-6.885311000000001e-17,-0.45553,0.5,-0.138013
3279,,,,,,,,,,1.0,...,,,,,,,,,,


In [20]:
item_similarity_min_ratings = user_movie.corr(min_periods=100)
item_similarity_min_ratings.head()

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,,,,,,,,,,...,,,,,,,,,,
3177,,1.0,,,,,,,,,...,,,,,,,,,,
3190,,,1.0,,,,,,,,...,,,,,,,,,,
3225,,,,1.0,,,,,,,...,,,,,,,,,,
3228,,,,,,,,,,,...,,,,,,,,,,


In [21]:
user_id_super = user_movie[(user_movie[5349]>4.5) & (user_movie[3793]>4.5) & (user_movie[6534]>4.5)].index[0]
print(user_id_super)

user_id_drama = user_movie[(user_movie[6870]>4.5) & (user_movie[5995]>4.5) & (user_movie[3555]>4.5)].index[0]
print(user_id_drama)

127342
34336


In [22]:
# Leave this code as-is

# Gets a list of watched movies for a user_id
def get_watched_movies(user_id, user_movie):
    return list(user_movie.loc[user_id].dropna().sort_values(ascending=False).index)
    
# Gets the rating a user_id has given to a movie_id
def get_rating(user_id, movie_id, user_movie):
    return user_movie[movie_id][user_id]

# Print watched movies
def print_watched_movies(user_id, user_movie, movies):
    for movie_id in get_watched_movies(user_id, user_movie):
        print("%d %.1f %s " %
          (movie_id, get_rating(user_id, movie_id, user_movie), get_title(movie_id, movies)))


In [23]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_super, user_movie, movies)

5502 5.0 Signs (2002) 
5445 5.0 Minority Report (2002) 
6156 5.0 Shanghai Knights (2003) 
5952 5.0 Lord of the Rings: The Two Towers, The (2002) 
5944 5.0 Star Trek: Nemesis (2002) 
5816 5.0 Harry Potter and the Chamber of Secrets (2002) 
5618 5.0 Spirited Away (Sen to Chihiro no kamikakushi) (2001) 
5524 5.0 Blue Crush (2002) 
5480 5.0 Stuart Little 2 (2002) 
5459 5.0 Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002) 
5420 5.0 Windtalkers (2002) 
4388 5.0 Scary Movie 2 (2001) 
5389 5.0 Spirit: Stallion of the Cimarron (2002) 
5349 5.0 Spider-Man (2002) 
5218 5.0 Ice Age (2002) 
5064 5.0 The Count of Monte Cristo (2002) 
4993 5.0 Lord of the Rings: The Fellowship of the Ring, The (2001) 
4973 5.0 Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001) 
4896 5.0 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) 
4886 5.0 Monsters, Inc. (2001) 
6186 5.0 Gods and Generals (2003) 
6333 5.0 X2: X-Men United (2003) 
6377 5.0 Finding Nemo (2003) 
6

In [24]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_drama, user_movie, movies)

3967 5.0 Billy Elliot (2000) 
4014 5.0 Chocolat (2000) 
4034 5.0 Traffic (2000) 
5995 5.0 Pianist, The (2002) 
7147 5.0 Big Fish (2003) 
4995 5.0 Beautiful Mind, A (2001) 
3555 5.0 U-571 (2000) 
6870 5.0 Mystic River (2003) 
5991 5.0 Chicago (2002) 
8464 5.0 Super Size Me (2004) 
5669 5.0 Bowling for Columbine (2002) 
8622 5.0 Fahrenheit 9/11 (2004) 
30707 5.0 Million Dollar Baby (2004) 
6953 4.5 21 Grams (2003) 
5015 4.5 Monster's Ball (2001) 
5464 4.5 Road to Perdition (2002) 
3510 4.5 Frequency (2000) 
5989 4.5 Catch Me If You Can (2002) 
4022 4.0 Cast Away (2000) 
5010 4.0 Black Hawk Down (2001) 
5299 4.0 My Big Fat Greek Wedding (2002) 
3897 4.0 Almost Famous (2000) 
3755 4.0 Perfect Storm, The (2000) 
4308 4.0 Moulin Rouge (2001) 
4447 3.5 Legally Blonde (2001) 
4246 3.5 Bridget Jones's Diary (2001) 
4975 3.5 Vanilla Sky (2001) 
4019 3.5 Finding Forrester (2000) 
5377 3.5 About a Boy (2002) 
3948 3.5 Meet the Parents (2000) 
5956 3.0 Gangs of New York (2002) 
6281 3.0 Phone Booth

In [25]:
def get_movies_relevance(user_id, user_movie, item_similarity_matrix):
    
    # Create an empty series
    movies_relevance = pd.Series()
    
    # Iterate through the movies the user has watched
    for watched_movie in get_watched_movies(user_id, user_movie):
        
        # Obtain the rating given
        rating_given = get_rating(user_id, watched_movie, user_movie)
        
        # Obtain the vector containing the similarities of watched_movie
        # with all other movies in item_similarity_matrix
        similarities = item_similarity_matrix[watched_movie]
        
        # Multiply this vector by the given rating
        weighted_similarities = similarities*rating_given
        
        # Append these terms to movies_relevance
        #movies_relevance = movies_relevance.append(weighted_similarities)
        movies_relevance = pd.concat([movies_relevance, weighted_similarities])
    # Compute the sum for each movie
    movies_relevance = movies_relevance.groupby(movies_relevance.index).sum()
    
    # Convert to a dataframe
    movies_relevance_df = pd.DataFrame(movies_relevance, columns=['relevance'])
    movies_relevance_df['movie_id'] = movies_relevance_df.index
    
    return movies_relevance_df

In [26]:
superhero = get_movies_relevance(user_id_super, user_movie, item_similarity_min_ratings)

mysuperhero = pd.merge(movies, superhero, how='inner', on='movie_id')
mysuperhero.sort_values(by='relevance', ascending=False).head()

Unnamed: 0,movie_id,title,genres,relevance
1472,8644,"I, Robot (2004)",Action|Adventure|Sci-Fi|Thriller,189.170085
663,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,181.63812
85,3753,"Patriot, The (2000)",Action|Drama|War,176.650945
1414,8361,"Day After Tomorrow, The (2004)",Action|Adventure|Drama|Sci-Fi|Thriller,172.899804
310,4310,Pearl Harbor (2001),Action|Drama|Romance|War,172.700877


In [27]:
drama = get_movies_relevance(user_id_drama, user_movie, item_similarity)

mydrama = pd.merge(movies, drama, how='inner', on='movie_id')
mydrama.sort_values(by='relevance', ascending=False).head()

Unnamed: 0,movie_id,title,genres,relevance
1376,7521,Mercy (2000),Crime|Mystery|Thriller,160.0
351,4449,Adanggaman (2000),Drama,154.388241
1357,7443,This So-Called Disaster (2003),Documentary,146.447098
1930,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,135.966211
1823,27835,"Agronomist, The (2003)",Documentary,133.5


<font size="+1" color="red">Brief commentary on the movies you see on these lists. How many of them look relevant for the intended users? Feel free to use IMDB or Wikipedia to get info on these movies.</font>

After checking for more details about these movies, we can say that these recommendations are good. They are alike in the type of movies they are. They have the same kinds of genres, and people seem to rate them similarly.

In [28]:
def get_recommended_movies(user_id, user_movie, item_similarity_matrix):
    
    relevant_movies = get_movies_relevance(user_id, user_movie, item_similarity_matrix)
    
    relevant_movies.set_index('movie_id', inplace=True)

    movies_watched = get_watched_movies(user_id, user_movie)

    relevant_movies.drop(movies_watched)
    
    return relevant_movies

In [29]:
relevant_movies_df = get_recommended_movies(user_id_super, user_movie,  item_similarity_min_ratings)

recommended_movies_super = pd.merge(movies, relevant_movies_df, how = 'inner', on = 'movie_id')
recommended_movies_super.sort_values(by='relevance', ascending=False).head(5)

Unnamed: 0,movie_id,title,genres,relevance
1472,8644,"I, Robot (2004)",Action|Adventure|Sci-Fi|Thriller,189.170085
663,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi,181.63812
85,3753,"Patriot, The (2000)",Action|Drama|War,176.650945
1414,8361,"Day After Tomorrow, The (2004)",Action|Adventure|Drama|Sci-Fi|Thriller,172.899804
310,4310,Pearl Harbor (2001),Action|Drama|Romance|War,172.700877


In [30]:
relevant_movies_df = get_recommended_movies(user_id_drama, user_movie,  item_similarity_min_ratings)

recommended_movies_super = pd.merge(movies, relevant_movies_df, how = 'inner', on = 'movie_id')
recommended_movies_super.sort_values(by='relevance', ascending=False).head(5)

Unnamed: 0,movie_id,title,genres,relevance
1572,8958,Ray (2004),Drama,65.46137
195,4019,Finding Forrester (2000),Drama,63.007635
1055,6565,Seabiscuit (2003),Drama,61.354376
501,4995,"Beautiful Mind, A (2001)",Drama|Romance,61.21305
508,5014,I Am Sam (2001),Drama,61.209632


<font size="+1" color="red">Brief commentary on these recommendations. Do you think they are relevant? Why or why not? After removing the movies the user has already watched, are the relevance scores of the remaining items comparable to the previous lists that contained all relevant movies?</font>

These recommendations are really good. When we look at the super ID, we can see that the most relevant movies have the same genres as the original movie. Just from the titles, we can tell that the person will likely enjoy those films.

>OPTIONAL

In [31]:
conda install -c conda-forge scikit-surprise

done
Solving environment: done


  current version: 23.7.4
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [32]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.25)

algo = SVD()

# train
algo.fit(trainset)

# predictions for test set
predictions = algo.test(testset)

# accuracy
accuracy.rmse(predictions)

# top N recommendations for a specific user
def top_n_recommendations(predictions, user_id, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for the specified user and get the top n
    if user_id in top_n:
        top_n[user_id].sort(key=lambda x: x[1], reverse=True)
        top_n[user_id] = top_n[user_id][:n]

    return top_n[user_id] if user_id in top_n else []

user_id_super = 127342

user_recommendations = top_n_recommendations(predictions, user_id_super, n=10)
print("Recommendations for:", user_id_super)
for movie_id, predicted_rating in user_recommendations:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}")


RMSE: 0.8406
Recommendations for: 127342
Movie ID: 4886, Predicted Rating: 5.00
Movie ID: 4995, Predicted Rating: 5.00
Movie ID: 8368, Predicted Rating: 5.00
Movie ID: 6377, Predicted Rating: 4.94
Movie ID: 4973, Predicted Rating: 4.92
Movie ID: 4896, Predicted Rating: 4.87
Movie ID: 33162, Predicted Rating: 4.84
Movie ID: 8360, Predicted Rating: 4.81
Movie ID: 4299, Predicted Rating: 4.79
Movie ID: 8972, Predicted Rating: 4.75


In [33]:
user_id_drama = 34336

user_recommendations = top_n_recommendations(predictions, user_id_drama, n=10)
print("Recommendations for:", user_id_drama)
for movie_id, predicted_rating in user_recommendations:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}")

Recommendations for: 34336
Movie ID: 6953, Predicted Rating: 3.83
Movie ID: 4027, Predicted Rating: 3.83
Movie ID: 5989, Predicted Rating: 3.79
Movie ID: 4014, Predicted Rating: 3.75
Movie ID: 5418, Predicted Rating: 3.64
Movie ID: 8644, Predicted Rating: 3.62
Movie ID: 4246, Predicted Rating: 3.61
Movie ID: 7147, Predicted Rating: 3.57
Movie ID: 5991, Predicted Rating: 3.56
Movie ID: 7153, Predicted Rating: 3.43


<font size="+2" color="#003300">I hereby declare that, except for the code provided by the course instructors, all of my code, report, and figures were produced by myself.</font>