Collaborative Filtering via Matrix Factorization, with planned comparisons of efficacy and time complexity when differential privacy is integrated.

In [1]:
import os
import numpy as np
import pandas as pd
from diffprivlib.mechanisms import Laplace
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.sparse.linalg import svds

# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. The choice is echoed so the run log records it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
def _read_dat(path):
    """Parse a '::'-delimited .dat file into a list of per-line field lists.

    The file is opened inside a `with` block so the handle is closed as soon
    as parsing finishes, and lines are streamed from the handle instead of
    being loaded all at once with readlines().
    """
    # NOTE(review): the encoding is left at the platform default, exactly as
    # before; confirm it matches how the ml-1m files are stored on this machine.
    with open(path, 'r') as handle:
        return [line.strip().split("::") for line in handle]


ratings_list = _read_dat('ml-1m/ratings.dat')
users_list = _read_dat('ml-1m/users.dat')
movies_list = _read_dat('ml-1m/movies.dat')
#movies_list = pd.read_csv("ml-latest-small/movies.csv")
#movies_df = pd.read_csv("ml-latest-small/movies.csv")
#movies_df = pd.read_csv("ml-32m/movies.csv")

# Raw string arrays of the parsed fields (kept for any later cells that use them).
ratings = np.array(ratings_list)
users = np.array(users_list)
movies = np.array(movies_list)

# Every ratings field is cast to float, so UserID and MovieID are floats in ratings_df.
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = float)
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
# Vectorised conversion of the whole column (instead of one to_numeric call per element).
movies_df['MovieID'] = pd.to_numeric(movies_df['MovieID'])

def apply_dp_with_ibm(df, epsilon=1.0, sensitivity=1):
    """Add a Laplace-perturbed ``dp_rating`` column to ``df`` and return ``df``.

    Each value of ``df["Rating"]`` is passed independently through a
    diffprivlib ``Laplace`` mechanism and the result is stored in a new
    ``dp_rating`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Rating`` column. It is modified IN PLACE: the
        ``dp_rating`` column is written onto the object the caller passed,
        and that same object is returned.
    epsilon : float, default 1.0
        Forwarded to ``Laplace(epsilon=...)``.
    sensitivity : float, default 1
        Forwarded to ``Laplace(sensitivity=...)``. Previously hard-coded to 1.
        NOTE(review): if ratings span 1-5, the sensitivity of a single rating
        is presumably 4 rather than 1 -- confirm the intended privacy guarantee.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``dp_rating`` column.
    """
    dp_mechanism = Laplace(epsilon=epsilon, sensitivity=sensitivity)

    # The bound method is applied element-wise; no lambda wrapper is needed.
    df["dp_rating"] = df["Rating"].apply(dp_mechanism.randomise)

    return df

# Epsilon value handed to the Laplace mechanism via apply_dp_with_ibm.
epsilon = 5.0
dp_ratings_df = apply_dp_with_ibm(ratings_df, epsilon=epsilon)

# Users x movies matrix of perturbed ratings; user/movie pairs with no rating become 0.
R_df = dp_ratings_df.pivot(
    index='UserID',
    columns='MovieID',
    values='dp_rating',
).fillna(0)
print(R_df.head())

# Centre each user's row on that user's mean. The mean is taken over every
# column of the row, including the zero-filled (unrated) entries.
R = R_df.values
user_ratings_mean = R.mean(axis=1)
R_demeaned = R - user_ratings_mean[:, np.newaxis]

MovieID    1.0     2.0     3.0     4.0     5.0       6.0     7.0     8.0     \
UserID                                                                        
1.0      4.805491     0.0     0.0     0.0     0.0  0.000000     0.0     0.0   
2.0      0.000000     0.0     0.0     0.0     0.0  0.000000     0.0     0.0   
3.0      0.000000     0.0     0.0     0.0     0.0  0.000000     0.0     0.0   
4.0      0.000000     0.0     0.0     0.0     0.0  0.000000     0.0     0.0   
5.0      0.000000     0.0     0.0     0.0     0.0  1.864472     0.0     0.0   

MovieID  9.0     10.0    ...  3943.0  3944.0  3945.0  3946.0  3947.0  3948.0  \
UserID                   ...                                                   
1.0         0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2.0         0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3.0         0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4.0         0.0     0.0  ...     0.0     0.0  

In [3]:
# Truncated SVD of the demeaned matrix with k = 50 singular values.
U, sigma, Vt = svds(R_demeaned, k=50)
# svds returns the singular values as a 1-D array; expand to a diagonal matrix.
sigma = np.diag(sigma)

# Rebuild the low-rank approximation and add each user's mean back to that user's row.
all_user_predicted_ratings = (U @ sigma) @ Vt + user_ratings_mean[:, np.newaxis]

In [None]:
# Predicted ratings as a frame whose columns are the MovieID labels of R_df.
# Rows keep the default positional index, not the UserID labels.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)
# Not the last expression of the cell, so this preview is not rendered.
preds_df.head()
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Return a user's rated movies and the top predicted movies not yet rated.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        One row of predicted ratings per user, one column per MovieID.
        Rows are looked up POSITIONALLY: row ``userID - 1`` is used, so the
        frame is assumed to be ordered by UserID starting at 1.
    userID : int
        1-based user identifier.
    movies_df : pandas.DataFrame
        Movie metadata; the ``MovieID`` column is read here, and callers
        typically also use ``Title`` and ``Genres``.
    original_ratings_df : pandas.DataFrame
        Ratings with ``UserID``, ``MovieID`` and ``Rating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's existing ratings joined with the movie
        metadata, highest rating first. ``recommendations`` holds the
        ``movies_df`` columns for the best-predicted movies the user has not
        rated (the prediction column itself is dropped).

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``. Previously
        a ``userID`` below 1 silently wrapped around to the last rows.
    """
    user_row_number = userID - 1
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            'userID {0} is out of range for predictions with {1} rows'.format(userID, len(predictions_df))
        )

    # Use the frame passed in (the previous code read the global `preds_df`).
    # The series and its index are named explicitly so the merge below does not
    # depend on the row label or the column-axis name of `predictions_df`.
    sorted_user_predictions = (
        predictions_df.iloc[user_row_number]
        .sort_values(ascending=False)
        .rename('Predictions')
        .rename_axis('MovieID')
    )

    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    unrated_movies = movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])]
    recommendations = (unrated_movies.
         merge(sorted_user_predictions.reset_index(), how = 'left',
               left_on = 'MovieID',
               right_on = 'MovieID').
         sort_values('Predictions', ascending = False).
         # Keep the top rows; the trailing column is 'Predictions' and is dropped.
         iloc[:num_recommendations, :-1]
        )

    return user_full, recommendations

def _print_movie_rows(frame):
    """Print one aligned line per row: index label, Title, Genres."""
    for idx, row in frame.iterrows():
        print(f"{idx:<5} {row['Title']:<55} {row['Genres']}")


# Top-10 recommendations for user 30, alongside what that user already rated.
already_rated, predictions = recommend_movies(preds_df, 30, movies_df, ratings_df, 10)

print("Already Rated: ")
_print_movie_rows(already_rated)
print("\n")
print("Predictions: ")
_print_movie_rows(predictions)

User 30 has already rated 43 movies.
Recommending highest 10 predicted ratings movies not already rated.
Already Rated: 
0     Double Life of Veronique, The (La Double Vie de Véronique) (1991) Drama
9     Silence of the Lambs, The (1991)                        Drama|Thriller
5     Bamboozled (2000)                                       Comedy
41    E.T. the Extra-Terrestrial (1982)                       Children's|Drama|Fantasy|Sci-Fi
29    Usual Suspects, The (1995)                              Crime|Thriller
31    Schindler's List (1993)                                 Drama|War
8     Groundhog Day (1993)                                    Comedy|Romance
10    Matrix, The (1999)                                      Action|Sci-Fi|Thriller
2     Being John Malkovich (1999)                             Comedy
7     Requiem for a Dream (2000)                              Drama
3     Terminator 2: Judgment Day (1991)                       Action|Sci-Fi|Thriller
28    American Beauty (1999)