In [1]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
final_dataset = ratings.pivot(index='movieId', columns='userId', values='rating')

In [4]:
final_dataset.fillna(0, inplace=True)

In [5]:
no_user_voted = ratings.groupby('movieId')['rating'].agg('count')
no_movies_voted = ratings.groupby('userId')['rating'].agg('count')

In [6]:
final_dataset = final_dataset.loc[:, no_movies_voted[no_movies_voted >50].index]

In [8]:
sample = np.array([[0,0,3,0,0], [4,0,0,0,2], [0,0,0,0,1]])
sparsity = 1.0 - (np.count_nonzero(sample) / float(sample.size))
print(sparsity)

0.7333333333333334


In [9]:
csr_sample = csr_matrix(sample)


In [7]:
csr_data = csr_matrix(final_dataset.values)

In [8]:
csr_data = csr_matrix(final_dataset.values)
final_dataset.reset_index(inplace=True)

In [9]:
from sklearn.utils.extmath import randomized_svd

U, S, V = randomized_svd(csr_data, 
                              n_components=15,
                              n_iter=5,
                              random_state=42)

In [10]:
movie_data = pd.read_csv("movies.csv")
data = pd.read_csv("ratings.csv")

In [11]:
#Computing the Singular Value Decomposition (SVD)

#Function to calculate the cosine similarity (sorting by most similar and returning the top N)
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movieId == movie_id].title.values))
    for id in top_indexes + 1:
        print((movie_data[movie_data.movieId == id].title.values)[0])

In [12]:
#k-principal components to represent movies, movie_id to find recommendations, top_n print n results        
k = 10
movie_id = 377 # (getting an id from movies.dat)
top_n = 5
similarities = np.dot(V.T, V)
top_movies = top_cosine_similarity(similarities, movie_id, 10)
print_similar_movies(movies, movie_id, top_movies)

Recommendations for ['Speed (1994)']: 

Speed (1994)
My Crazy Life (Mi vida loca) (1993)
Down Periscope (1996)
Milk Money (1994)
Legends of the Fall (1994)
Cry, the Beloved Country (1995)
Species (1995)
Circle of Friends (1995)
Pushing Hands (Tui shou) (1992)
Copycat (1995)
