In [1]:
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the ratings and movie tables. Both files separate fields with '::',
# a multi-character separator that is parsed with the python engine.

user_data = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'time'],
)

movie_data = pd.read_csv(
    'data/movies1.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'title', 'genre'],
)

In [3]:
# Summarise the ratings table: row count, per-column non-null counts and dtypes.
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1000209 non-null  int64
 1   movie_id  1000209 non-null  int64
 2   rating    1000209 non-null  int64
 3   time      1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [4]:
# Summarise the movie table: row count, per-column non-null counts and dtypes.
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [5]:
# Preview the first rows of the ratings table.
user_data.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Preview the first rows of the movie table.
movie_data.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


##### 1. Create an m x u matrix with movies as rows and users as columns. Normalize the matrix.

In [7]:
# Creating the rating matrix (rows as movies, columns as users)
#
# The matrix is allocated with np.zeros so that every (movie, user) pair
# without a rating is exactly 0. np.ndarray(shape=...) only reserves memory and
# leaves its contents undefined, which would leak arbitrary values into the
# row means, the SVD and the covariance matrix computed from this array.
ratings_mat = np.zeros(
    (user_data.movie_id.max(), user_data.user_id.max()),
    dtype=np.uint8,
)

# Ids are 1-based in the data, so shift them to 0-based row/column positions.
ratings_mat[user_data.movie_id.values - 1, user_data.user_id.values - 1] = user_data.rating.values

Sanity check: user_data holds a rating of 5 for movie_id = 1193 and user_id = 1. Both ids are reduced by 1 when indexing because the matrix is zero-indexed, so that rating sits at row 1192, column 0.


In [8]:
# Spot check: movie_id 1193 / user_id 1 maps to zero-based position [1192, 0].
ratings_mat[1192, 0]

5

In [9]:
# Centre every movie (row) on its own mean, taken across all user columns.
# keepdims=True keeps the means as a column vector so they broadcast per row.

normalised_mat = ratings_mat - ratings_mat.mean(axis=1, keepdims=True)

##### 2. Compute SVD to get U, S and V. Use np.linalg.svd()

In [11]:
# Computing the Singular Value Decomposition (SVD)
#
# normalised_mat is movies x users: each movie is a variable and each user an
# observation. Scaling the transposed matrix by 1/sqrt(n_users - 1) makes
# A.T @ A equal to np.cov(normalised_mat). Dividing by the number of movies
# instead leaves the singular vectors untouched but mis-scales the singular
# values and breaks that identity.

start_time = time.time()
A = normalised_mat.T / np.sqrt(ratings_mat.shape[1] - 1)
# np.linalg.svd returns the right singular vectors as rows (V transposed),
# which is why V is transposed again before components are sliced from it.
U, S, V = np.linalg.svd(A)
print("Time taken for computing SVD: " + str(time.time() - start_time))

Time taken for computing SVD: 46.38531446456909


##### 4. Implement a function that takes a movieID as input and uses cosine similarity along with sorting to recommend the top 10 movies.

In [12]:
# Function to calculate the cosine similarity (sorting by most similar and returning the top N)

def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the row indices of the ``top_n`` rows most similar to one movie.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        One row per movie; row ``i`` is the vector of the movie with id ``i + 1``.
    movie_id : int
        1-based id of the query movie.
    top_n : int, default 10
        Number of indices to return.

    Returns
    -------
    numpy.ndarray
        Zero-based row indices ordered by decreasing cosine similarity to the
        query row. The query row is part of the ranking and normally comes
        first. Rows with zero magnitude have an undefined (NaN) similarity and
        are ranked last.

    Raises
    ------
    IndexError
        If ``movie_id`` is not in ``1..data.shape[0]``.
    """
    n_rows = data.shape[0]
    # Without this check movie_id <= 0 would become a negative index and
    # silently return recommendations for a different movie.
    if not 1 <= movie_id <= n_rows:
        raise IndexError(
            'movie_id {0} is out of range 1..{1}'.format(movie_id, n_rows))

    index = movie_id - 1  # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    # Row-wise Euclidean norms: sqrt of the sum of squares of each row.
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    # Zero-magnitude rows divide by zero; the resulting NaN is intentional
    # (argsort places NaN last), so silence the floating point warnings here.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    # Negate so that argsort's ascending order yields most-similar first.
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Function to print top N similar movies

def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the title of ``movie_id`` followed by the recommended titles.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must provide ``movie_id`` and ``title`` columns.
    movie_id : int
        1-based id of the movie the recommendations were computed for.
    top_indexes : array-like of int
        Zero-based matrix row indices of the recommended movies; each is
        converted to a movie id by adding 1.

    Raises
    ------
    IndexError
        If ``movie_id`` itself has no row in ``movie_data``.
    """
    def title_for(wanted_id):
        # First matching title, or None when the id has no row in movie_data.
        titles = movie_data.loc[movie_data.movie_id == wanted_id, 'title'].values
        return titles[0] if len(titles) else None

    query_title = title_for(movie_id)
    if query_title is None:
        raise IndexError(
            'movie_id {0} not found in movie_data'.format(movie_id))

    print('Recommendations for {0}: \n'.format(query_title))
    # np.asarray lets callers pass a plain list as well as an ndarray.
    for similar_id in np.asarray(top_indexes) + 1:
        title = title_for(similar_id)
        if title is None:
            # A matrix row may exist for an id that has no entry in
            # movie_data; report it instead of aborting the whole listing.
            print('<unknown movie_id {0}>'.format(int(similar_id)))
        else:
            print(title)

##### 3. From your V.T select 50 components.

In [13]:
# Represent every movie by its first k components from the SVD, then list the
# top_n movies most similar to movie_id.

movie_id = 12  # an id taken from movies.dat
k = 50
top_n = 10
sliced = V[:k].T  # first k rows of V, transposed: one row per movie
indexes = top_cosine_similarity(sliced, movie_id, top_n=top_n)

# Show the top N similar movies
print_similar_movies(movie_data, movie_id, indexes)

Recommendations for Dracula: Dead and Loving It (1995): 

Dracula: Dead and Loving It (1995)
Spy Hard (1996)
Mafia! (1998)
Wrongfully Accused (1998)
Out to Sea (1997)
Bean (1997)
Beverly Hillbillies, The (1993)
Bio-Dome (1996)
8 Heads in a Duffel Bag (1997)
Repossessed (1990)


##### 5. Repeat the same process except now instead of using SVD you will use PCA to get the eigenvectors.

In [14]:
# Computing the Principal Component Analysis (PCA)

# Centre each movie row on its mean. keepdims=True keeps the result a plain
# ndarray instead of the legacy np.matrix type.
normalised_mat = ratings_mat - ratings_mat.mean(axis=1, keepdims=True)

start_time = time.time()
cov_mat = np.cov(normalised_mat)
# A covariance matrix is symmetric, so use eigh: it returns real eigenpairs in
# ascending eigenvalue order, whereas eig does not guarantee any ordering.
eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)
# Flip to descending order so the leading columns are the largest components,
# which is what slicing eigen_vectors[:, :k] relies on.
eigen_values, eigen_vectors = eigen_values[::-1], eigen_vectors[:, ::-1]
print("Time taken for computing PCA: " + str(time.time() - start_time))

Time taken for computing PCA: 36.07937407493591


###### 7. Use the same steps as before to get 50 components. Use cosine similarity to get the results.

In [15]:
# Same recipe as for the SVD, this time with the eigenvectors of the
# covariance matrix as the movie representation.
movie_id = 12  # an id taken from movies.dat
k = 50
top_n = 10
sliced = eigen_vectors[:, :k]  # first k eigenvector columns, one row per movie
indexes = top_cosine_similarity(sliced, movie_id, top_n=top_n)

# Show the top N similar movies
print_similar_movies(movie_data, movie_id, indexes)

Recommendations for Dracula: Dead and Loving It (1995): 

Dracula: Dead and Loving It (1995)
Spy Hard (1996)
Mafia! (1998)
Wrongfully Accused (1998)
Out to Sea (1997)
Bean (1997)
Beverly Hillbillies, The (1993)
Bio-Dome (1996)
8 Heads in a Duffel Bag (1997)
Repossessed (1990)


##### 8. Compare the results for SVD and PCA.

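 - We got the same top-10 recommendations from both SVD and PCA.
 - The input matrix A in step 2 has shape u×m (users × movies). The columns of V from the SVD of A are the eigenvectors of $A^TA$ (note that `np.linalg.svd` returns $V^T$, hence the `V.T` in the code).
 - The columns of V are ordered to match the singular values in S, which are sorted in descending order; the eigenvalues of $A^TA$ are the squares of those singular values.
 - By construction, $A^TA$ is proportional to the covariance matrix of normalised_mat (and equal to it when A is scaled by $1/\sqrt{u-1}$), so the two share the same eigenvectors.
 - Thus, the columns of V are the principal components of normalised_mat.
 - SVD is preferred because its singular values come back sorted, whereas the PCA computation was faster on this data. So there is a trade-off, and we can choose whichever matters more.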