In [1]:
import zipfile
import pandas as pd
import movie_utils
from tqdm import tqdm, trange
import numpy as np
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random
from sklearn.impute import KNNImputer
import pickle
import gzip
import random

%load_ext autoreload
%autoreload 2

In [2]:
# data from: https://grouplens.org/datasets/movielens/

files = {}

with zipfile.ZipFile("ml-32m.zip", 'r') as zip:
    zip_contents = zip.namelist()
    for file_name in zip_contents:
        if file_name.endswith('.csv'):
            print("Downloading " + file_name + "...")
            with zip.open(file_name) as file:
                df = pd.read_csv(file)
                files[file_name[7:-4]] = df

Downloading ml-32m/tags.csv...
Downloading ml-32m/links.csv...
Downloading ml-32m/ratings.csv...
Downloading ml-32m/movies.csv...


In [3]:
ratings_df_all = files["ratings"][["userId", "movieId", "rating"]]
ratings_df_all.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [4]:
movies_df_all = files["movies"].merge(files["links"], left_on="movieId", right_on="movieId", how="inner")
movies_df_all.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [5]:
all_movies = {}

for _, movie in movies_df_all.iterrows():
    all_movies[movie.movieId] = movie_utils.Movie(movie.movieId, movie.tmdbId, movie.title, movie.genres)

In [6]:
popular_movies = files["ratings"][["movieId", "userId"]].groupby("movieId").count()
popular_movies = popular_movies.sort_values(by=["userId"], ascending=False)
popular_movies = popular_movies.index.values.tolist()

num_movies = 1000
top_popular_movies = popular_movies[:num_movies]
top_popular_movies_set = set(popular_movies[:num_movies])

In [7]:
# get all attributes for the top 1000 popular movies

movies = {}
total = len(movies_df_all)

for _, movie in tqdm(movies_df_all.iterrows(), total=total):
    if movie.movieId in top_popular_movies_set:
        all_movies[movie.movieId].loadAllAttributes()
        movies[movie.movieId] = all_movies[movie.movieId]
    
serialized_data = pickle.dumps(movies)
    
with gzip.open("movie_objects" + str(random.random())[:8], 'wb') as f:
    f.write(serialized_data)

100%|██████████| 87585/87585 [16:58<00:00, 86.03it/s]   


In [None]:
# get the ratings for the top 1000 popular movies

total = len(ratings_df_all)
ratings = np.empty((total, 3), dtype=np.float32)

i = 0
for _, rating in tqdm(ratings_df_all.iterrows(), total=total):
    if rating.movieId in top_popular_movies_set:
        ratings[i, 0] = rating.userId
        ratings[i, 1] = rating.movieId
        ratings[i, 2] = rating.rating
        i += 1

ratings = ratings[:i]
np.savez_compressed("ratings.npz", ratings=ratings)

In [None]:
ratings = np.load("ratings.npz")["ratings"]

In [None]:
ratings_df = pd.DataFrame(ratings, columns=["userId", "movieId", "rating"], index=[i for i in range(len(ratings))])
ratings_df["userId"] = ratings_df["userId"].astype(int)
ratings_df["movieId"] = ratings_df["movieId"].astype(int)
ratings_df.head()

In [None]:
ratings_df = ratings_df.pivot_table(index="userId", columns="movieId", values="rating")
ratings_df

In [None]:
def isvt(X, r, max_iter=500, epsilon=1e-2):
    # Create a mask for the known entries in X
    mask = X > 0  
    prevM = np.zeros(X.shape)
    prevM[mask] = X[mask]

    progress_bar = trange(max_iter)
    
    for _ in progress_bar:
        currM = np.copy(prevM)
        
        # Perform Singular Value Decomposition
        U, S, Vt = svds(currM, k=r) # Threshold by only calculating the top r singular values
        currM = U @ np.diag(S) @ Vt

        # Fill in known values in currM from X
        currM[mask] = X[mask]
        
        # Check for convergence
        norm = np.linalg.norm(currM - prevM, ord='fro') / np.linalg.norm(prevM, ord='fro')
        if norm < epsilon:
            break
            
        progress_bar.set_description(f"[{r=:}, {norm=:.3e}]")
        
        prevM = currM
    
    return currM

In [None]:
def train_test_split(df, p):
    # Flatten the DataFrame to get all values with their index positions
    all_values = [(i, j, value) for i, row in df.iterrows() for j, value in row.items() if value > 0]

    # Calculate the number of values to pick based on the percentage
    num_values_to_pick = int(p * len(all_values))

    # Randomly pick the specified number of values
    test_values = random.sample(all_values, num_values_to_pick)

   # Create a copy of the DataFrame to modify
    new_df = df.copy()

    # Replace the picked values in the new DataFrame with 0
    for i, j, _ in test_values:
        new_df.at[i, j] = 0

    return new_df, test_values

def isvt_with_score(X, r, test_proportion=0.1, max_iter=500, epsilon=1e-2):
    # create train / test split 
    train_df, test_values = train_test_split(X, test_proportion)

    # run isvt algorithm with given rank
    M = isvt(train_df.to_numpy(), r=r, max_iter=max_iter, epsilon=epsilon)
    M = pd.DataFrame(M, columns=train_df.columns, index=train_df.index)

    # compare predicted values with expected ones
    scores = np.array([abs(rating - M.at[i, j]) for i, j, rating in test_values])
    avg_error = scores.mean()

    return (M, avg_error)

def cross_val_isvt(X, r_list, test_proportion=0.1, plot=True, max_iter=500, epsilon=1e-2):
    # calculate the error for each rank in r_list
    errors = [] 
    for r in r_list:
        errors.append(isvt_with_score(X, r, test_proportion, max_iter, epsilon))

    if plot:
        plt.plot(r_list, [err for _, err in errors])
        plt.xlabel("rank")
        plt.ylabel("error")

    # find the rank with the lowest error
    best = np.argmin(np.array([err for _, err in errors]))
    return (errors[best][0], r_list[best])

In [None]:
num_users = 50000
X = ratings_df[:num_users]

In [None]:
bestM, rank = cross_val_isvt(X, r_list=[2**n for n in range(1,7)])
print(f"Rank: {rank}")

In [None]:
M, error = isvt_with_score(X, 8, epsilon=1e-3)

In [None]:
print(f"error: {(error / 5) * 100:.2f}%")
M

In [None]:
user_df = pd.DataFrame(columns=M.columns, index=[0], data=np.zeros(len(M.columns)).reshape(1, 1000))
# rate movies
user_df.at[0,122906] = 5
user_df.at[0,122914] = 5
user_df.at[0,122920] = 5
user_df.at[0,122926] = 5
user_df.at[0,195159] = 5
user_df.at[0,122918] = 5
user_df.at[0,95510] = 5
user_df.at[0,106072] = 5
user_df.at[0,122916] = 5

In [None]:
combinedM = np.vstack([M.to_numpy(), user_df.to_numpy()])
imputer = KNNImputer(n_neighbors=10, missing_values=0)
recommendations = imputer.fit_transform(combinedM)[-1].reshape(1, num_movies)

In [None]:
recommendations_df = pd.DataFrame(recommendations, columns=M.columns, index=[0])
sorted_recommendations = recommendations_df.loc[0].sort_values(ascending=False)
for i in sorted_recommendations.index[:20]:
    print(i, all_movies[i].title, all_movies[i].genres, sorted_recommendations[i])

In [None]:
genres = ["Action", 
          "Adventure", 
          "Animation", 
          "Children", 
          "Comedy", 
          "Crime", 
          "Documentary", 
          "Drama", 
          "Fantasy", 
          "Film-Noir", 
          "Horror", 
          "Musical", 
          "Mystery", 
          "Romance", 
          "Sci-Fi", 
          "Thriller", 
          "War", 
          "Western",
          "IMAX"]

movie_by_genre = {}

for genre in genres:
    movie_by_genre[genre] = []

for movieId in top_popular_movies:
    for genre in all_movies[movieId].genres:
        movie_by_genre[genre].append(all_movies[movieId])
    

In [None]:
movie_by_genre["Action"][:5]

In [None]:
genres_picked_by_user = ["Adventure", "Horror", "Sci-Fi"]
movies_to_rate = [movie_by_genre[genre][:5] for genre in genres_picked_by_user]
movies_to_rate