In [1]:
import pandas as pd
import numpy as np

from IPython.display import display
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

### Data Collection and Preprocessing

We load our datasets

In [2]:
%%bash
if [ ! -d "../data/ml-1m" ]; 
then
    mkdir -p ../data
    wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
    wget https://datasets.imdbws.com/title.basics.tsv.gz
    wget https://datasets.imdbws.com/name.basics.tsv.gz
    unzip -o ml-1m.zip -d ../data;
    gunzip name.basics.tsv.gz
    gunzip title.basics.tsv.gz 
    mv *.tsv ../data/
    rm -rf ml-1m.zip
else
    echo "Data already downloaded";
fi

--2024-07-07 20:19:32--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’

     0K .......... .......... .......... .......... ..........  0%  235K 24s
    50K .......... .......... .......... .......... ..........  1%  466K 18s
   100K .......... .......... .......... .......... ..........  2% 92.3M 12s
   150K .......... .......... .......... .......... ..........  3% 43.9M 9s
   200K .......... .......... .......... .......... ..........  4%  471K 9s
   250K .......... .......... .......... .......... ..........  5% 27.4M 8s
   300K .......... .......... .......... .......... ..........  6% 88.3M 7s
   350K .......... .......... .......... .......... ..........  6% 44.3M 6s
   400K .......... .......... .......

Archive:  ml-1m.zip
   creating: ../data/ml-1m/
  inflating: ../data/ml-1m/movies.dat  
  inflating: ../data/ml-1m/ratings.dat  
  inflating: ../data/ml-1m/README    
  inflating: ../data/ml-1m/users.dat  


In [3]:
df_movies = pd.read_csv("../data/ml-1m/movies.dat", engine="python", encoding="ISO-8859-1", delimiter='::', header=None)
df_ratings = pd.read_csv("../data/ml-1m/ratings.dat", engine="python", encoding="ISO-8859-1", delimiter='::', header=None)
df_nameBasic = pd.read_csv("../data/name.basics.tsv", sep='\t')
df_title = pd.read_csv("../data/title.basics.tsv", sep='\t')

  df_title = pd.read_csv("../data/title.basics.tsv", sep='\t')


In [4]:
display(df_movies.head(3), df_ratings.head(3), df_nameBasic.head(3), df_title.head(3))

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"


In [5]:
df_movies.columns = ["movieId", "title", "genres"]
df_title.rename(columns={'originalTitle': "title"}, inplace=True)

df_ratings.columns = ["userId", "movieId", "rating", "timestamp"]

In [6]:
df_merged = pd.merge(df_movies, df_title, on="title", how="left")
df_merged.fillna(0,inplace=True)
df_merged.head(3)

  df_merged.fillna(0,inplace=True)


Unnamed: 0,movieId,title,genres_x,tconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres_y
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,tt11707316,tvEpisode,Jumanji (1995),0,2014,\N,\N,History
2,2,Jumanji (1995),Adventure|Children's|Fantasy,tt15206184,tvEpisode,Jumanji (1995),0,2018,\N,44,Comedy


For this project, with the metadata we have we could do a content based Recommender system by useing the actors of the movies, runtime minutes and genres to come up with predictions. But in this project, i will build a collaborative filter system based on single value decomposition.


### Feature Engineering

Here we create our user-item interaction matrix necessary for our Regular SVD

In [7]:
user_item_matrix = df_ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# compute sparsity
total_entries = user_item_matrix.size
non_zero_entries = user_item_matrix.astype(bool).sum().sum()
zero_entries = total_entries - non_zero_entries
sparsity = (zero_entries / total_entries) * 100

print(f"Sparsity: {sparsity:.2f}%")

Sparsity: 95.53%


For our recommender system we will be using collaborative filtering based on single value decomposition

### Model Development

First intuition: simple SVD

In [9]:
U, S, Vt = np.linalg.svd(user_item_matrix, full_matrices=False)
Sigma = np.diag(S)

# Compute the predicted ratings by multiplying the U, Sigma, and Vt matrices
predicted_ratings = np.dot(np.dot(U, Sigma), Vt)

predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

In [10]:
display(predicted_ratings_df.head(5), user_item_matrix.head(5))

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.329483e-13,-1.536145e-15,-1.258468e-13,3.289426e-14,6.981679e-14,4.688821e-14,-6.424483e-14,-1.752778e-14,5.208445e-14,...,5.087754e-16,3.15286e-16,-2.431649e-15,-9.820974e-16,-2.206677e-15,-3.037918e-14,-1.435799e-14,-2.520499e-15,2.40617e-15,-1.68075e-14
2,-1.33741e-14,-4.186622e-13,-9.017998e-14,1.58659e-13,-4.509312e-15,-6.197273e-14,1.223047e-14,-1.849812e-15,-3.008691e-14,2.03897e-14,...,1.68214e-16,2.046974e-16,-1.349073e-15,1.976907e-15,-2.006153e-15,-3.837473e-15,-2.724461e-15,9.923702e-16,2.193829e-15,-5.695165e-15
3,-1.034112e-15,-1.76449e-14,-3.109112e-14,-4.073727e-15,5.780456e-14,9.6548e-15,-3.229005e-14,-1.71404e-14,2.978668e-14,-3.453672e-15,...,5.513168000000001e-17,-5.61183e-16,-4.8897520000000006e-17,2.238918e-15,1.632483e-15,1.005191e-16,4.438886e-15,-1.098297e-16,-8.469787e-16,8.285351e-15
4,2.957602e-15,2.636118e-14,-1.29995e-14,1.234498e-13,8.003354e-14,-3.014285e-14,4.579643e-15,-1.2374e-14,-1.659846e-14,4.77833e-15,...,1.236316e-15,6.601707e-16,-3.538836e-16,-1.023107e-15,-8.852511e-16,9.929677e-15,8.738212e-16,-3.96818e-17,2.341714e-15,1.539825e-15
5,-1.39608e-14,3.3585e-14,1.151077e-14,-2.899319e-15,-5.205065e-14,2.0,1.233818e-14,-4.633338e-15,-3.528563e-15,-3.775563e-14,...,-1.03216e-16,1.513546e-16,-1.140743e-15,-1.099489e-15,-2.021929e-15,9.105875e-15,1.088531e-14,2.488596e-15,3.737245e-15,7.627904e-16


movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We can see that a regular SVD can fill missing values (represented by 0) but in our case where our data is extremely sparse\
the reconstructed Matrix doens't represent much.\
so we need a more complexe algorithm based on SVD and matrix factorization to handle sparse data more efficiently.

### Simon Funk SVD
For the Model, I will be using the surprise library which proposes an implementation of the famous SVD algorithm by Simon Funk.\
where the goal of Funk SVD is to decompose the user-item matrix into two lower-dimensional matrices.\
This model is designed to work with sparse matrices

In [11]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [12]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

#train our SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb33d0e8090>

In [13]:
test_pred = model.test(testset)
accuracy.rmse(test_pred)

RMSE: 0.8780


0.8780053395300095

The RMSE of our model is less than one so the predictions are accurate enough for our recommendation system

Now that we have our model we will use we need to find a way to recommend movies for two users. For this we will:
- generate predictions for two given user
- predict only the movies that he has not seen (because i believe that it is most of the times better to watch something new with someone)
- generate a sort of couple scoring (a simple average between both predicted score) from the predictions and give the one with the highest predicted score. 

### Recommendation Algorithm

In [14]:
def recommend_movies_couple(user1, user2, model, df_movies, df_ratings, top_n=5):
    # movie_ids = df_movies['movieId'].unique()
    predictions = []

    for movie_id in df_movies['movieId'].values:
        # Check if both users have not rated the movie
        if (df_ratings[(df_ratings['userId'] == user1) & (df_ratings['movieId'] == movie_id)].empty and
            df_ratings[(df_ratings['userId'] == user2) & (df_ratings['movieId'] == movie_id)].empty):
            
            pred_user1 = model.predict(user1, movie_id).est
            pred_user2 = model.predict(user2, movie_id).est
            # Calculate the average predicted rating for both users 'couple score'
            couple_score = (pred_user1 + pred_user2) / 2
            predictions.append((movie_id, couple_score))

    # Sort the predictions by average estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get the top N recommendations
    recommended_movie_ids = [pred[0] for pred in predictions[:top_n]]
    
    # Display the recommended movies
    recommended_movies = df_movies[df_movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies[["title"]], predictions[:top_n]

In [15]:
user1 = 42
user2 = 101
nb_movies = 10

recommended_movies_couple, prediction_movies_couple = recommend_movies_couple(user1, user2, model, df_merged, df_ratings, nb_movies)
print(recommended_movies_couple)

                                           title
688                       Pather Panchali (1955)
690       World of Apu, The (Apur Sansar) (1959)
761                        Close Shave, A (1995)
2364  Life Is Beautiful (La Vita è bella) (1997)
2551                     Apple, The (Sib) (1998)
2841                     Sixth Sense, The (1999)
2987                              Sanjuro (1962)
3167                                42 Up (1998)
3562                          Dersu Uzala (1974)
3767                     Romeo and Juliet (1968)


### Improvements

We could try to combine both content and collaborative filtering to build our system recommender model.\
The matrix factorization is demanding computationally, we can try other matrix factorization techniques such as Alternating Least Square.