In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.linalg import svd as SVD
from sklearn.metrics.pairwise import cosine_similarity

### PART - 1 Recommender System

#### Load the movies and ratings data.

In [2]:
# The ml-1m .dat files are '::'-delimited, carry no header row and are read as ISO-8859-1.
# A multi-character separator requires pandas' Python parsing engine.
dat_options = dict(sep='::', header=None, encoding='ISO-8859-1', engine='python')

movies = pd.read_table('ml-1m/movies.dat', names=['movie_id', 'title', 'genres'], **dat_options)
ratings = pd.read_table('ml-1m/ratings.dat', names=['user_id', 'movie_id', 'rating', 'timestamp'], **dat_options)
users = pd.read_table('ml-1m/users.dat', names=['user_id', 'gender', 'age', 'occupation', 'zip'], **dat_options)

In [3]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


#### Create the m x u matrix with movies as rows and users as columns. Normalize the matrix.

Create a pivot table where the rows are `movie_id`, the columns are `user_id`, and the value of each cell is the `rating` given by that user to that movie. Movies which haven't been rated by a particular user get a rating of 0.

In [6]:
# Movie-by-user rating matrix: one row per movie_id, one column per user_id.
# (movie, user) pairs without a rating are filled with 0.
m_u = ratings.pivot_table(
    index="movie_id",
    columns="user_id",
    values="rating",
    fill_value=0,
)

Print the `m x u` matrix

In [7]:
m_u

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,4,0,4,5,5,...,0,4,0,0,4,0,0,0,0,3
2,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,2,2,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0,0,0,0,0,0,0,0,3,4,...,0,0,0,0,0,0,0,0,0,0
3949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3951,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Create a mapping from `movie_id` to its index in `movies` table

In [8]:
# Lookup table: movie_id -> row label of that movie in the `movies` frame.
movie_index = {mid: row_label for mid, row_label in zip(movies["movie_id"], movies.index)}

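Print the `min`, `max`, `mean` and `standard deviation` of the `m x u` matrix. The NaN-aware NumPy functions are used defensively; the pivot table above already replaced missing ratings with 0, so unrated cells are included in these statistics as zeros.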

In [9]:
# Raw rating matrix as a plain ndarray.
m_u_np = m_u.to_numpy()

# Global (whole-matrix) mean and standard deviation. They are computed once and
# kept in named variables because the de-normalization step later in the
# notebook needs exactly these values.
m_u_mean = np.nanmean(m_u_np)
m_u_std = np.nanstd(m_u_np)

# Reuse the stored statistics instead of scanning the full matrix again.
print(f"minimum={np.nanmin(m_u_np)}\tmaximum={np.nanmax(m_u_np)}\tmean={m_u_mean}\tstd.dev={m_u_std}")

minimum=0	maximum=5	mean=0.16003728516134566	std.dev=0.7767453240716777


Normalize the ratings using the following formula

```
(rating - mean_rating) / (std_dev_rating)
```

In [10]:
# Z-score every cell with the global statistics of the RAW matrix.
# `m_u_mean` / `m_u_std` are used rather than recomputing from `m_u_np`,
# because `m_u_np` is rebound to the normalized matrix further down; reading it
# here would make this cell produce different results when re-run out of order.
# fillna(0) is a defensive guard: any missing rating is treated as 0.
m_u_norm = (m_u.fillna(0) - m_u_mean) / m_u_std

Print the normalized `m x u` matrix

In [11]:
m_u_norm

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6.231081,-0.206036,-0.206036,-0.206036,-0.206036,4.943657,-0.206036,4.943657,6.231081,6.231081,...,-0.206036,4.943657,-0.206036,-0.206036,4.943657,-0.206036,-0.206036,-0.206036,-0.206036,3.656234
2,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,6.231081,...,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
3,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,1.081388,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
4,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,3.656234,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,2.368811,2.368811,-0.206036,-0.206036,-0.206036,-0.206036
5,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,1.081388,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,3.656234,4.943657,...,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
3949,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
3950,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036
3951,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,...,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036,-0.206036


Print the `min`, `max`, `mean` and `standard deviation` of the normalized `m x u` matrix, skipping the `NaN` values

In [12]:
# Use a dedicated name for the normalized values so that `m_u_np` keeps
# referring to the raw rating matrix; rebinding it here would silently change
# what earlier cells compute if they are re-run.
m_u_norm_np = m_u_norm.to_numpy()
print(f"minimum={np.nanmin(m_u_norm_np)}\tmaximum={np.nanmax(m_u_norm_np)}\tmean={np.nanmean(m_u_norm_np)}\tstd.dev={np.nanstd(m_u_norm_np)}")

minimum=-0.20603572393900563	maximum=6.23108059340184	mean=-6.5720689204445244e-18	std.dev=1.0000000000000016


We can observe above that the `mean` is 0 (up to floating-point rounding error) and the `standard deviation` is 1. Hence the matrix is now normalized.

#### Perform SVD to get U, S and V.

In [13]:
# Full SVD of the normalized movie-by-user matrix.
# NumPy returns the singular values `S` as a 1-D array sorted in descending
# order, and the third factor already transposed (so `V` here holds V^T, one
# right-singular vector per row).
# full_matrices=True is NumPy's default; it is spelled out because the next
# cell sizes the Sigma matrix from the shapes of the square U and V factors.
U, S, V = np.linalg.svd(m_u_norm, full_matrices=True)

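`np.linalg.svd` returns the singular values as a 1-D array. Expand `S` into a rectangular diagonal matrix whose shape is compatible with `U` and `V`, so that `U @ S @ V` reproduces the normalized matrix.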

In [14]:
# Expand the 1-D vector of singular values into a rectangular diagonal matrix
# with as many rows as U has columns and as many columns as V has rows.
# The ndim guard keeps this cell safe to re-run: after the first run S is
# already 2-D, and expanding it a second time would fail.
if S.ndim == 1:
    sigma = np.zeros((U.shape[1], V.shape[0]))
    diag_idx = np.arange(S.size)
    # Vectorized equivalent of writing S[i] to position (i, i) for every i.
    sigma[diag_idx, diag_idx] = S
    S = sigma

Print the shape of U, S and V matrix

In [15]:
print(f"Shape of U - {U.shape}\tShape of S - {S.shape}\tShape of V - {V.shape}")

Shape of U - (3706, 3706)	Shape of S - (3706, 6040)	Shape of V - (6040, 6040)


#### Select top 50 components from S.

In [16]:
# Keep only the leading 50 x 50 block of the diagonal singular-value matrix.
n_keep = 50
S50 = S[:n_keep, :n_keep]

Print the shape of S after selecting 50 components

In [17]:
print(f"Shape of S after selecting 50 components - {S50.shape}")

Shape of S after selecting 50 components - (50, 50)


In [18]:
# Inspect the 50 x 50 block of singular values kept above.

S50

array([[2109.4952778 ,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,  948.83963882,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,  861.91357278, ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,  193.12003362,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         191.2607562 ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  189.78932573]])

#### Get the top 50 singular vectors (the eigenvectors of M Mᵀ and Mᵀ M) that correspond to the 50 largest singular values.

In [19]:
# Truncate both factors to the first 50 components.
n_keep = 50
U50 = U[:, :n_keep]   # first 50 columns of U
V50 = V[:n_keep, :]   # first 50 rows of V

Print the shape of U and V after selecting 50 components

In [20]:
print(f"Shape of U after selecting 50 components - {U50.shape}")
print(f"Shape of V after selecting 50 components - {V50.shape}")

Shape of U after selecting 50 components - (3706, 50)
Shape of V after selecting 50 components - (50, 6040)


In [21]:
U50

array([[ 0.07495553, -0.04034565,  0.01270731, ...,  0.08856314,
        -0.03093963,  0.00096511],
       [ 0.02215853,  0.01751448,  0.03343141, ..., -0.0034536 ,
        -0.00881508, -0.0137838 ],
       [ 0.01059724,  0.01479211,  0.01991311, ..., -0.01617532,
        -0.00899669,  0.01868013],
       ...,
       [-0.00204824,  0.01332386,  0.00042909, ..., -0.00531706,
        -0.00286808,  0.00529225],
       [-0.00383936,  0.01097383, -0.00032713, ..., -0.00589061,
         0.00174061,  0.0043681 ],
       [ 0.00993781,  0.0089659 , -0.00369609, ..., -0.03825145,
        -0.00858031,  0.02138087]])

In [22]:
V50

array([[ 3.36445237e-03,  8.64900797e-03,  3.84587400e-03, ...,
        -5.70585459e-04,  5.60360849e-03,  1.91689011e-02],
       [-1.21259294e-02, -1.19663402e-02, -1.40385284e-02, ...,
        -1.03081664e-02, -4.57516524e-03, -7.46139698e-04],
       [-3.48028609e-03,  6.69844769e-04,  1.12992470e-03, ...,
        -3.15307215e-03, -1.94856262e-02, -4.20497457e-02],
       ...,
       [ 6.91245110e-03,  1.01258715e-02,  1.97890769e-03, ...,
        -8.11611829e-04,  1.11450475e-02, -2.56686048e-02],
       [ 1.27382570e-02, -1.33235404e-02, -9.93991371e-04, ...,
         3.09208351e-03, -4.62110215e-03, -5.59329884e-03],
       [-8.38940475e-04,  5.94271581e-04,  7.58762135e-03, ...,
         1.25279794e-02,  2.67372450e-03, -5.87807496e-05]])

#### Using cosine similarity, find 10 closest movies using the 50 components from SVD.

Reconstruct a rank-50 approximation of the normalized `m x u` matrix by multiplying the truncated `U`, `S` and `V` matrices (50 components each).

In [23]:
# Rank-50 reconstruction of the normalized matrix (evaluated left to right).
m_u_np_new = U50 @ S50 @ V50

# Wrap the result in a DataFrame carrying the same movie_id / user_id labels as
# `m_u`. Building the frame directly avoids assigning float data into a copy of
# `m_u` and relying on pandas to silently upcast that frame's dtype.
# copy=True keeps the frame independent of `m_u_np_new`.
predicted_m_u = pd.DataFrame(m_u_np_new, index=m_u.index, columns=m_u.columns, copy=True)

In [24]:
predicted_m_u

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.156947,0.760148,2.357475,0.115675,1.500563,2.523839,-0.275159,0.799032,4.456971,5.673327,...,0.955516,1.923252,-0.583750,-0.044229,2.921997,2.582941,2.345308,0.453160,1.881882,2.014384
2,-0.003463,0.010245,0.330585,-0.277829,-0.122473,0.020305,0.003553,0.009592,-0.025317,3.603222,...,-0.156797,0.185559,-0.036681,-0.283500,0.501827,0.186139,-0.012283,-0.405000,-0.293985,-0.353130
3,-0.488735,0.235342,-0.067601,-0.211803,-0.327196,0.116990,-0.137497,0.677166,-0.187968,1.443447,...,-0.423757,-0.656346,-0.091228,-0.276041,1.033621,0.248774,-0.226928,-0.125414,-0.402626,-0.459748
4,-0.251294,-0.217167,-0.214752,-0.114751,0.057711,-0.110054,-0.203440,0.220494,-0.266869,0.276883,...,-0.169598,-0.273865,-0.122075,-0.217993,0.251948,0.322864,-0.433695,-0.214950,-0.296046,-0.137629
5,-0.178732,-0.164774,-0.270015,-0.145034,-0.217804,0.027532,-0.265252,-0.303377,-0.309392,1.248496,...,-0.107427,-0.273778,-0.156152,-0.255871,0.554318,-0.305678,0.136141,-0.300504,-0.325325,-0.213254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.360760,-0.017096,-0.071893,-0.027555,-0.216943,0.806873,0.485450,-0.190605,1.732884,0.868065,...,-0.245328,-0.716634,-0.101099,-0.236167,-0.582026,-0.000196,-0.404082,-0.178461,-0.206173,0.021941
3949,-0.041324,-0.695177,-0.194706,-0.160085,0.505536,-0.052973,-0.532397,0.178237,0.204601,0.136409,...,-0.297285,-0.386915,-0.401363,-0.329883,-0.465347,0.676756,-0.264659,-0.402482,-0.480227,0.615242
3950,-0.171837,-0.323921,-0.221035,-0.259663,-0.152120,-0.203469,-0.277196,-0.250400,-0.106900,-0.150240,...,-0.191396,-0.289373,-0.266488,-0.249200,-0.305270,0.135038,-0.135988,-0.251412,-0.237694,-0.191721
3951,-0.147868,-0.284997,-0.196275,-0.178583,-0.035777,-0.174558,-0.227087,-0.228522,-0.210708,-0.208793,...,-0.155573,-0.292852,-0.202493,-0.179353,-0.200002,0.137283,-0.361908,-0.232738,-0.260529,-0.029797


To obtain the correct prediction matrix, we have to revert the normalization done before performing SVD using the following formula -

```
(rating * rating_std_dev) + rating_mean
```

In [25]:
# Undo the z-score normalization: rating = normalized * std + mean.
# The result is derived from `m_u_np_new` (the normalized reconstruction)
# instead of from `predicted_m_u` itself, so re-running this cell does not
# apply the inverse transform a second time.
predicted_m_u = pd.DataFrame(
    m_u_np_new * m_u_std + m_u_mean,
    index=m_u.index,
    columns=m_u.columns,
)

Print the reconstructed `m x u` matrix

In [26]:
predicted_m_u

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.165672,0.750479,1.991195,0.249887,1.325592,2.120417,-0.053691,0.780681,3.621969,4.566767,...,0.902230,1.653914,-0.293388,0.125683,2.429685,2.166325,1.981745,0.512027,1.621780,1.724701
2,0.157348,0.167995,0.416818,-0.055765,0.064907,0.175809,0.162797,0.167488,0.140373,2.958823,...,0.038246,0.304169,0.131546,-0.060170,0.549829,0.304620,0.150497,-0.154544,-0.068314,-0.114255
3,-0.219585,0.342838,0.107528,-0.004480,-0.094111,0.250909,0.053237,0.686023,0.014034,1.281228,...,-0.169114,-0.349777,0.089176,-0.054376,0.962897,0.353271,-0.016228,0.062622,-0.152701,-0.197070
4,-0.035154,-0.008646,-0.006771,0.070905,0.204864,0.074553,0.002016,0.331305,-0.047252,0.375105,...,0.028303,-0.052686,0.065216,-0.009287,0.355736,0.410820,-0.176833,-0.006924,-0.069915,0.053135
5,0.021208,0.032050,-0.049695,0.047383,-0.009141,0.181422,-0.045996,-0.075609,-0.080281,1.129800,...,0.076594,-0.052618,0.038747,-0.038710,0.590601,-0.077396,0.265784,-0.073378,-0.092657,-0.005607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.440256,0.146758,0.104195,0.138634,-0.008472,0.786772,0.537108,0.011986,1.506047,0.834302,...,-0.030520,-0.396605,0.081509,-0.023404,-0.292049,0.159885,-0.153832,0.021419,-0.000106,0.177080
3949,0.127939,-0.379938,0.008801,0.035692,0.552710,0.118891,-0.253500,0.298482,0.318960,0.265992,...,-0.070877,-0.140497,-0.151719,-0.096198,-0.201419,0.685704,-0.045536,-0.152589,-0.212977,0.637924
3950,0.026563,-0.091567,-0.011651,-0.041655,0.041879,0.001994,-0.055273,-0.034460,0.077003,0.043339,...,0.011371,-0.064732,-0.046956,-0.033528,-0.077080,0.264927,0.054409,-0.035246,-0.024590,0.011119
3951,0.045182,-0.061333,0.007581,0.021323,0.132248,0.024450,-0.016352,-0.017466,-0.003629,-0.002142,...,0.039197,-0.067434,0.002752,0.020725,0.004687,0.266671,-0.121073,-0.020741,-0.042327,0.136893


Given part of a movie title, find the ids of the matching movies in the `movies` dataset

In [27]:
partial_movie_name = "Pretty Woman"

In [28]:
# Ids of all movies whose title contains the search text.
# regex=False makes this a literal substring match; by default str.contains
# interprets the pattern as a regular expression, so titles fragments with
# characters such as '(', ')' or '*' would not be matched as written.
title_matches = movies["title"].str.contains(partial_movie_name, regex=False)
movie_list = movies.loc[title_matches, "movie_id"].to_list()

In [29]:
movies.loc[movies["movie_id"].isin(movie_list)]

Unnamed: 0,movie_id,title,genres
593,597,Pretty Woman (1990),Comedy|Romance


In [30]:
# Take the query movie from the search result instead of hard-coding its id,
# so that changing `partial_movie_name` actually changes the recommendations.
# For "Pretty Woman" the only match is movie_id 597.
# When several titles match, the first one is used.
if not movie_list:
    raise ValueError(f"No movie title contains {partial_movie_name!r}")
movie_id = movie_list[0]

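Calculate the pairwise cosine similarity between all movies (rows) of the predicted `m x u` matrix. The diagonal is set to 0 so that a movie is never returned as its own most similar movie.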

In [31]:
# Cosine similarity between every pair of movie rows:
#   sim[i, j] = <row_i, row_j> / (||row_i|| * ||row_j||)
ratings_np = predicted_m_u.to_numpy()

# Gram matrix of row dot products. Computed once and reused; its diagonal
# holds the squared row norms.
gram = ratings_np @ ratings_np.T
norms = np.sqrt(np.diag(gram))

# Divide row i by ||row_i||, then column j by ||row_j||.
sim = gram / norms[:, None] / norms[None, :]

# Zero the self-similarities on the plain ndarray before wrapping it, rather
# than writing through DataFrame.values (which is only valid when pandas
# hands back a writable view).
np.fill_diagonal(sim, 0)

pairwise_sim = pd.DataFrame(sim, index=predicted_m_u.index, columns=predicted_m_u.index)

Find the index of similar movies to the `movie_id` from above

In [32]:
# Similarity of every movie to the query movie, highest first; keep the top 10.
top_n = 10
ranked_scores = pairwise_sim[movie_id].sort_values(ascending=False)
similar_movie_ids = ranked_scores.iloc[:top_n].index.tolist()

# Translate each movie_id into its row label in `movies`.
similar_movie_index = [movie_index[mid] for mid in similar_movie_ids]

In [33]:
movies.loc[similar_movie_index]

Unnamed: 0,movie_id,title,genres
535,539,Sleepless in Seattle (1993),Comedy|Romance
1529,1569,My Best Friend's Wedding (1997),Comedy|Romance
583,587,Ghost (1990),Comedy|Romance|Thriller
2355,2424,You've Got Mail (1998),Comedy|Romance
335,339,While You Were Sleeping (1995),Comedy|Romance
2602,2671,Notting Hill (1999),Comedy|Romance
6,7,Sabrina (1995),Comedy|Romance
699,708,"Truth About Cats & Dogs, The (1996)",Comedy|Romance
10,11,"American President, The (1995)",Comedy|Drama|Romance
841,852,Tin Cup (1996),Comedy|Romance


SVD: We have a lot of movies and a lot of users, but not every user rates every movie. Hence the
`m x u` pivot table of ratings between movie ID and user ID is mostly sparse (contains a lot of 0s). After
performing SVD, we selected the top 50 components to capture the most important features, and
recombined the matrices to get a new `m x u` table. This reconstructed table is no longer sparse: it
contains predicted ratings for the movie-user pairs whose rating was previously 0.

Cosine Similarity: Now that each movie ID has a rating score from each user associated with it, we can
consider these ratings as features for each movie vector. We selected the movie “Pretty Woman”, which
belongs to the genre “Comedy/Romance”. Using its predicted ratings as a feature vector, we computed the
cosine similarity with the feature vectors of all the other movies in the new `m x u` table, and identified the
10 movies with the highest similarity scores. As you can see above, the rating prediction made
using SVD was pretty good, as all ten movies include both “Comedy” and “Romance” among their genres. We can also
see that all the suggested movies are from the same decade (the 1990s) as “Pretty Woman”. These 2 attributes
(year and genre) were not even part of the SVD or cosine similarity process.

Conclusion: In conclusion, our recommender system employs a comprehensive approach to understand the
intricate dynamics between movies and user ratings. Through the utilization of singular value decomposition,
we identify the pivotal components that shape user preferences, allowing us to generate personalized film
recommendations.