In [1]:
import zipfile
import pandas as pd
import movie_utils
from tqdm import tqdm, trange
import numpy as np
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random

%load_ext autoreload
%autoreload 2

In [2]:
# data from: https://grouplens.org/datasets/movielens/

files = {}

# Read every CSV inside the MovieLens archive into a DataFrame, keyed by the
# file's base name without extension (e.g. "ml-32m/ratings.csv" -> "ratings").
# The handle is called `archive` so that the builtin `zip` is not shadowed for
# the remainder of the kernel session.
with zipfile.ZipFile("ml-32m.zip", 'r') as archive:
    for file_name in archive.namelist():
        if not file_name.endswith('.csv'):
            continue
        print("Downloading " + file_name + "...")
        # Derive the key from the member name itself rather than from a
        # hard-coded prefix length, so a differently named folder still works.
        table_name = file_name.rsplit("/", 1)[-1][:-len(".csv")]
        with archive.open(file_name) as file:
            files[table_name] = pd.read_csv(file)

Downloading ml-32m/tags.csv...
Downloading ml-32m/links.csv...
Downloading ml-32m/ratings.csv...
Downloading ml-32m/movies.csv...


In [4]:
# Keep only the three columns that the rest of the notebook reads.
rating_columns = ["userId", "movieId", "rating"]
files["ratings"] = files["ratings"][rating_columns]
files["ratings"].head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [5]:
# Join the links table onto the movies table using their shared "movieId" column.
files["movies"] = pd.merge(files["movies"], files["links"], on="movieId", how="inner")
files["movies"].head()

Unnamed: 0,movieId,title,genres,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy,11862.0


In [6]:
# Lookup table: movieId -> movie_utils.Movie built from each row of the merged movies table.
movies = {
    row.movieId: movie_utils.Movie(row.movieId, row.tmdbId, row.title, row.genres)
    for _, row in files["movies"].iterrows()
}

In [None]:
# Movie ids ordered by the number of ratings each movie received (most-rated first).
popular_movies = (
    files["ratings"][["movieId", "userId"]]
    .groupby("movieId")
    .count()
    .sort_values(by=["userId"], ascending=False)
    .index.values.tolist()
)

# Ids of the `num_movies` most-rated movies, stored as a set for membership tests.
num_movies = 1000
top_popular_movies = set(popular_movies[:num_movies])

In [None]:
# Keep only the ratings whose movie is among the most-rated ones and store them
# as a float32 array with columns (userId, movieId, rating).
# A vectorized `isin` filter replaces the former row-by-row `iterrows()` loop;
# row order and resulting values are the same, without a Python-level pass over
# every rating.
all_ratings = files["ratings"]
is_popular = all_ratings["movieId"].isin(top_popular_movies)
ratings = all_ratings.loc[is_popular, ["userId", "movieId", "rating"]].to_numpy(dtype=np.float32)

# Cache the filtered array on disk so later sessions can skip this step.
np.savez_compressed("ratings.npz", ratings=ratings)

In [7]:
# np.load on an .npz returns an NpzFile that holds the archive open until it is
# closed; the context manager releases it once the array has been read.
with np.load("ratings.npz") as ratings_archive:
    ratings = ratings_archive["ratings"]

In [8]:
# Wrap the cached array in a DataFrame and turn the two id columns back into integers.
ratings_df = pd.DataFrame(
    ratings,
    columns=["userId", "movieId", "rating"],
    index=list(range(len(ratings))),
).astype({"userId": int, "movieId": int})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,32,5.0
4,1,34,2.0


In [9]:
# Reshape to one row per user and one column per movie; pairs without a rating show up as NaN.
ratings_df = pd.pivot_table(ratings_df, values="rating", index="userId", columns="movieId")
ratings_df

movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,3.5,,,,,4.0,4.0,,5.0,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,4.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200944,4.0,,,,,,,,,,...,,,,,,,,,,
200945,,,,,,,,,,,...,,,0.5,,,,,,,
200946,,,,,4.0,,5.0,5.0,,4.0,...,,,,,,,,,,
200947,4.0,,,,,,,,,,...,,,,,,,,,,


In [10]:
def isvt(X, r, max_iter=500, epsilon=1e-2):
    """Complete a partially observed matrix by iterated rank-``r`` SVD truncation.

    Entries of ``X`` that are strictly positive are treated as known; every
    other entry (zero, negative or NaN) is treated as missing and starts at 0.
    Each iteration replaces the current estimate by its rank-``r`` truncated
    SVD and then writes the known entries back on top of it.

    Args:
        X: 2-D NumPy array of observations.
        r: Number of singular values to keep; passed to ``svds`` as ``k``.
        max_iter: Maximum number of iterations.
        epsilon: Stop once the relative Frobenius-norm change between two
            successive estimates falls below this value.

    Returns:
        Array with the shape of ``X`` holding the completed matrix. Known
        entries are equal to the corresponding entries of ``X``.
    """
    # Create a mask for the known entries in X
    mask = X > 0
    prevM = np.zeros(X.shape)
    prevM[mask] = X[mask]

    # Nothing is known: there is nothing to complete, and the relative-change
    # ratio below would divide by a zero norm.
    if not mask.any():
        return prevM

    # Result when the loop body never runs (max_iter <= 0); without this the
    # final `return currM` would hit an unbound local.
    currM = prevM

    progress_bar = trange(max_iter)

    for _ in progress_bar:
        # Threshold by only calculating the top r singular values. svds is
        # given prevM directly: it is only read here, so no copy is needed.
        U, S, Vt = svds(prevM, k=r)
        # Scale the columns of U by S instead of building a dense diag(S).
        currM = (U * S) @ Vt

        # Fill in known values in currM from X
        currM[mask] = X[mask]

        # Check for convergence (prevM always holds the known entries, so its
        # norm is non-zero here).
        norm = np.linalg.norm(currM - prevM, ord='fro') / np.linalg.norm(prevM, ord='fro')
        if norm < epsilon:
            break

        progress_bar.set_description(f"[{r=:}, {norm=:.3e}]")

        prevM = currM

    return currM

In [11]:
def train_test_split(df, p, seed=None):
    """Hold out a random fraction of the known (strictly positive) entries of ``df``.

    Args:
        df: DataFrame of numeric values; entries that are not ``> 0`` (zeros,
            negatives, NaN) are ignored.
        p: Fraction of the known entries to hold out; the count is
            ``int(p * number_of_known_entries)``.
        seed: Optional seed for a private ``random.Random`` generator, making
            the split reproducible. With ``None`` (the default) the module
            level ``random`` state is used.

    Returns:
        ``(new_df, test_values)`` where ``new_df`` is a copy of ``df`` with the
        held-out entries replaced by 0, and ``test_values`` is a list of
        ``(row_label, column_label, value)`` tuples for the held-out entries.
        ``df`` itself is not modified.
    """
    values = df.to_numpy()

    # Positions of the known entries, in row-major order (rows first, then
    # columns within a row). Done in NumPy instead of a Python double loop.
    known_rows, known_cols = np.nonzero(values > 0)

    # Calculate the number of values to pick based on the percentage
    num_values_to_pick = int(p * len(known_rows))

    # Randomly pick the specified number of known positions
    rng = random if seed is None else random.Random(seed)
    picked = np.asarray(rng.sample(range(len(known_rows)), num_values_to_pick), dtype=np.intp)
    picked_rows = known_rows[picked]
    picked_cols = known_cols[picked]

    # Report the held-out entries by label, with plain Python scalars as values
    row_labels = df.index.tolist()
    col_labels = df.columns.tolist()
    picked_values = values[picked_rows, picked_cols].tolist()
    test_values = [
        (row_labels[i], col_labels[j], value)
        for i, j, value in zip(picked_rows.tolist(), picked_cols.tolist(), picked_values)
    ]

    # Replace the picked values with 0 in a new DataFrame (one vectorized step
    # instead of one `.at` assignment per entry)
    held_out = np.zeros(values.shape, dtype=bool)
    held_out[picked_rows, picked_cols] = True
    new_df = df.mask(held_out, 0)

    return new_df, test_values

def isvt_with_score(X, r, test_proportion=0.1, max_iter=500, epsilon=1e-2):
    """Run ``isvt`` on a train split of ``X`` and score it on the held-out entries.

    Args:
        X: DataFrame of observations, passed to ``train_test_split``.
        r: Rank forwarded to ``isvt``.
        test_proportion: Fraction of known entries to hold out for scoring.
        max_iter: Forwarded to ``isvt``.
        epsilon: Forwarded to ``isvt``.

    Returns:
        ``(M, avg_error)``: the completed matrix as a DataFrame with the labels
        of the train split, and the mean absolute difference between held-out
        ratings and their predictions.
    """
    # Hold out part of the known entries
    train_df, held_out = train_test_split(X, test_proportion)

    # Complete the training matrix at the requested rank and restore the labels
    completed = isvt(train_df.to_numpy(), r=r, max_iter=max_iter, epsilon=epsilon)
    M = pd.DataFrame(completed, columns=train_df.columns, index=train_df.index)

    # Absolute prediction error for every held-out entry
    abs_errors = []
    for row_label, col_label, rating in held_out:
        abs_errors.append(abs(rating - M.at[row_label, col_label]))

    return (M, np.array(abs_errors).mean())

def cross_val_isvt(X, r_list, test_proportion=0.1, plot=True, max_iter=500, epsilon=1e-2):
    """Score ``isvt`` at several ranks and return the best completion.

    Args:
        X: DataFrame of observations, forwarded to ``isvt_with_score``.
        r_list: Iterable of candidate ranks.
        test_proportion: Forwarded to ``isvt_with_score``.
        plot: When true, plot the error against the rank.
        max_iter: Forwarded to ``isvt_with_score``.
        epsilon: Forwarded to ``isvt_with_score``.

    Returns:
        ``(best_M, best_rank)``: the completed matrix and the rank with the
        lowest error; the first such rank wins on ties.

    Raises:
        ValueError: If ``r_list`` is empty.
    """
    # Materialize once so iterators/generators can be both looped and plotted.
    r_list = list(r_list)
    if not r_list:
        raise ValueError("r_list must contain at least one rank")

    # Keep only the best completed matrix seen so far; holding one full matrix
    # per candidate rank multiplies memory use for no benefit.
    errors = []
    best_M = None
    best_rank = None
    best_err = None
    for r in r_list:
        M, err = isvt_with_score(X, r, test_proportion, max_iter, epsilon)
        errors.append(err)
        if best_err is None or err < best_err:
            best_M, best_rank, best_err = M, r, err

    if plot:
        plt.plot(r_list, errors)
        plt.xlabel("rank")
        plt.ylabel("error")

    return (best_M, best_rank)

In [13]:
# Work on the first `num_users` rows of the user-by-movie matrix.
num_users = 50000
X = ratings_df.iloc[:num_users]

In [None]:
# Candidate ranks: the powers of two from 2 to 64.
candidate_ranks = [2 ** exponent for exponent in range(1, 7)]
bestM, rank = cross_val_isvt(X, r_list=candidate_ranks)
print(f"Rank: {rank}")

In [14]:
# Single scored completion at rank 8 with a tighter convergence threshold than the default.
M, error = isvt_with_score(X, r=8, epsilon=1e-3)

[r=8, norm=1.000e-03]:  39%|███▉      | 195/500 [05:15<08:12,  1.62s/it]


In [103]:
# Get movie ratings from a new user, add them to M as an extra row,
# then run the completion algorithm on the result.

# Movies rated by the new user, as movieId -> rating.
new_user_ratings = {
    122906: 5,
    122914: 5,
    122920: 5,
    122926: 5,
    195159: 5,
    122918: 5,
    95510: 5,
    106072: 5,
    122916: 5,
}

# One all-zero row (label 0) with the same columns as M. The width comes from
# M itself rather than from a hard-coded column count.
user_df = pd.DataFrame(np.zeros((1, len(M.columns))), columns=M.columns, index=[0])
# rate movies
for movie_id, movie_rating in new_user_ratings.items():
    user_df.at[0, movie_id] = movie_rating

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
userX_df = pd.concat([M, user_df])

In [104]:
# Complete the matrix that now includes the new user's row, again at rank 8.
userM = isvt(userX_df.to_numpy(), r=8, epsilon=1e-3)
userM_df = pd.DataFrame(userM, index=userX_df.index, columns=userX_df.columns)

[r=8, norm=1.044e-03]:   5%|▌         | 27/500 [00:52<15:13,  1.93s/it]


In [105]:
# List every movie for the new user (row label 0), highest predicted rating first.
sorted_userM = userM_df.loc[0].sort_values(ascending=False)
for movie_id, predicted in sorted_userM.items():
    movie = movies[movie_id]
    print(movie_id, movie.title, movie.genres, predicted)

122912 Avengers: Infinity War - Part I  ['Action', 'Adventure', 'Sci-Fi'] 5.044082427324749
122920 Captain America: Civil War  ['Action', 'Sci-Fi', 'Thriller'] 5.0
122926 Untitled Spider-Man Reboot  ['Action', 'Adventure', 'Fantasy'] 5.0
122914 Avengers: Infinity War - Part II  ['Action', 'Adventure', 'Sci-Fi'] 5.0
122916 Thor: Ragnarok  ['Action', 'Adventure', 'Sci-Fi'] 5.0
122918 Guardians of the Galaxy 2  ['Action', 'Adventure', 'Sci-Fi'] 5.0
122906 Black Panther  ['Action', 'Adventure', 'Sci-Fi'] 5.0
106072 Thor: The Dark World  ['Action', 'Adventure', 'Fantasy', 'IMAX'] 5.0
195159 Spider-Man: Into the Spider-Verse  ['Action', 'Adventure', 'Animation', 'Sci-Fi'] 5.0
95510 Amazing Spider-Man, The  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.0
110102 Captain America: The Winter Soldier  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 4.899444306077841
122892 Avengers: Age of Ultron  ['Action', 'Adventure', 'Sci-Fi'] 4.7544667922571096
122900 Ant-Man  ['Action', 'Adventure', 'Sci-Fi'] 4.673