In [26]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds


# Load the data
df = pd.read_csv('ratings.csv')

# Create a pivot table
#df_pivot = pd.DataFrame(data={'movie 1':[1,1,4],'movie 2':[2,np.nan,5],'movie 3':[3,4,5]},index=['user1','user2','user3'],columns=['movie 1','movie 2','movie 3'])
df_pivot = df.pivot(index='userId', columns='movieId', values='rating')

# Show the pivot table
df_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [27]:
# Load the user-item interaction matrix
interaction_matrix = df_pivot.values
interaction_matrix

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

In [28]:
# Normalize the matrix
mean_rating = np.nanmean(interaction_matrix, axis=1)
std_rating = np.nanstd(interaction_matrix, axis=1)
interaction_matrix = (interaction_matrix - mean_rating[:, np.newaxis]) / std_rating[:, np.newaxis]
interaction_matrix

  interaction_matrix = (interaction_matrix - mean_rating[:, np.newaxis]) / std_rating[:, np.newaxis]


array([[-0.45893679,         nan, -0.45893679, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ...,
       [-0.5879551 , -1.05151363, -1.05151363, ...,         nan,
                nan,         nan],
       [-0.60858062,         nan,         nan, ...,         nan,
                nan,         nan],
       [ 1.53010745,         nan,         nan, ...,         nan,
                nan,         nan]])

In [29]:
# Fill the NaN values with 0
interaction_matrix[np.isnan(interaction_matrix)] = 0
interaction_matrix

array([[-0.45893679,  0.        , -0.45893679, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.5879551 , -1.05151363, -1.05151363, ...,  0.        ,
         0.        ,  0.        ],
       [-0.60858062,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.53010745,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [33]:
# Decompose the matrix into the product of two lower-dimensional matrices
U, sigma, Vt = svds(interaction_matrix, k=50)
sigma = np.diag(sigma)

In [34]:
U, sigma, Vt

(array([[-0.0318845 ,  0.05556406, -0.02414733, ...,  0.00076408,
         -0.00970752,  0.02639654],
        [ 0.00334947,  0.00315777,  0.00368896, ..., -0.00053458,
          0.00332962, -0.00082504],
        [ 0.00486947, -0.00321506,  0.00397391, ...,  0.00219608,
          0.00500506, -0.00146747],
        ...,
        [ 0.10920122,  0.02542221, -0.0164896 , ..., -0.03358127,
         -0.01721186,  0.10626605],
        [ 0.01252725, -0.01033531, -0.01072608, ..., -0.01070113,
         -0.00166173,  0.0079267 ],
        [-0.02622058,  0.02196636, -0.0136081 , ..., -0.06830087,
          0.2109543 ,  0.18667272]]),
 array([[21.61895477,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , 21.79942942,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        , 21.98253702, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., 4

In [35]:
U.shape, sigma.shape, Vt.shape

((610, 50), (50, 50), (50, 9724))

In [36]:
# Make recommendations to the user by finding the items that are most similar
# to the items that the user has interacted with in the past
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings

array([[-5.83351344e-02, -1.04791420e-01, -2.23785899e-02, ...,
         3.38434923e-04,  3.38434923e-04, -2.58987482e-03],
       [-2.32532019e-02, -5.74253473e-03, -1.76551179e-02, ...,
         3.44832527e-05,  3.44832527e-05, -1.87115032e-04],
       [ 5.65388323e-03,  8.45017343e-03,  1.81102330e-03, ...,
        -4.28537284e-05, -4.28537284e-05,  8.82781616e-05],
       ...,
       [-7.29396591e-01, -9.61170576e-01, -1.13806150e+00, ...,
        -1.42043546e-04, -1.42043546e-04, -1.18722904e-02],
       [ 4.77162308e-02,  3.96736895e-02,  2.29872757e-02, ...,
         1.74509258e-04,  1.74509258e-04,  1.27564104e-03],
       [ 1.54179519e+00,  5.87063373e-02,  3.65872621e-02, ...,
        -4.92737299e-04, -4.92737299e-04,  1.10400824e-02]])

In [37]:
# convert the predicted ratings to a dataframe and transform the values back to the original scale
predicted_ratings = pd.DataFrame(predicted_ratings * std_rating[:, np.newaxis] + mean_rating[:, np.newaxis], index=df_pivot.index, columns=df_pivot.columns)
predicted_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.319809,4.282722,4.348514,4.394789,4.377803,4.131058,4.173495,4.369117,4.390611,4.241697,...,4.366649,4.367308,4.365991,4.365991,4.366649,4.365991,4.366649,4.366649,4.366649,4.364312
2,3.929869,3.943730,3.934300,3.954583,3.960442,3.943352,3.935751,3.951235,3.952845,3.971969,...,3.948303,3.948370,3.948237,3.948237,3.948303,3.948237,3.948303,3.948303,3.948303,3.948128
3,2.447565,2.453336,2.439635,2.445986,2.452195,2.445209,2.472395,2.447201,2.431662,2.460779,...,2.435809,2.435594,2.436024,2.436024,2.435809,2.436024,2.435809,2.435809,2.435809,2.436080
4,3.679716,3.636079,3.520904,3.528697,3.540129,3.438810,3.504119,3.546557,3.570956,3.533194,...,3.556203,3.557781,3.554625,3.554625,3.556203,3.554625,3.556203,3.556203,3.556203,3.555619
5,3.789600,3.663588,3.601206,3.614451,3.601351,3.676827,3.640098,3.625673,3.620369,3.543375,...,3.636613,3.637220,3.636006,3.636006,3.636613,3.636006,3.636613,3.636613,3.636613,3.636620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.216277,3.732932,3.694564,3.656484,3.674544,3.642886,2.450876,3.646871,3.673843,3.677850,...,3.658357,3.660690,3.656024,3.656024,3.658357,3.656024,3.658357,3.658357,3.658357,3.653879
607,3.838877,3.979212,3.973072,3.808801,3.841568,3.876701,3.764532,3.767398,3.765451,3.638307,...,3.785252,3.783195,3.787309,3.787309,3.785252,3.787309,3.785252,3.785252,3.785252,3.785438
608,2.347439,2.097445,1.906648,3.163952,3.096443,3.438522,3.167129,3.077456,3.071043,4.304726,...,3.134022,3.133649,3.134396,3.134396,3.134022,3.134396,3.134022,3.134022,3.134022,3.121370
609,3.291461,3.287889,3.280479,3.248687,3.266128,3.322889,3.275148,3.266147,3.260427,3.268855,...,3.270348,3.270537,3.270159,3.270159,3.270348,3.270159,3.270348,3.270348,3.270348,3.270837


In [44]:
# Select a user
user = 608

# get the ratings for the previously not rated movies
not_rated = df_pivot.iloc[user][df_pivot.iloc[user].isnull()]
not_rated

movieId
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
          ..
193581   NaN
193583   NaN
193585   NaN
193587   NaN
193609   NaN
Name: 609, Length: 9687, dtype: float64

In [46]:
# get the predicted ratings for the not rated movies
predicted_ratings_not_rated = predicted_ratings.iloc[user][not_rated.index]
predicted_ratings_not_rated

movieId
2         3.287889
3         3.280479
4         3.248687
5         3.266128
6         3.322889
            ...   
193581    3.270159
193583    3.270348
193585    3.270348
193587    3.270348
193609    3.270837
Name: 609, Length: 9687, dtype: float64

In [53]:
# the top 10 recommended items for the user
top_30_reccomandation = predicted_ratings_not_rated.sort_values(ascending=False)[:10]
top_30_reccomandation

movieId
593     3.460257
527     3.420910
50      3.417769
47      3.367018
2959    3.358400
858     3.357343
2329    3.355702
11      3.330612
32      3.329462
1704    3.329141
Name: 609, dtype: float64

In [None]:
# results from the naive collaborative filtering (top 10) for user 608
#1196    4.235235
#2959    4.029476
#2571    3.922200
#260     3.871156
#2858    3.819967
#1210    3.757071
#858     3.653868
#2115    3.640236
#4878    3.534362
#1291    3.513320

#see: https://github.com/Ishikawa7/Simple-Collaborative-Filtering-with-Pandas

In [54]:
results_naive = [1196, 2959, 2571, 260, 2858, 1210, 858, 2115, 4878, 1291]

# filter the movies that are also in the naive collaborative filtering
top_30_reccomandation[top_30_reccomandation.index.isin(results_naive)]

movieId
2959    3.358400
858     3.357343
Name: 609, dtype: float64

In [None]:
# only 2 movies are in the top 10 of the naive collaborative filtering