In [1]:
import zipfile
import pandas as pd
import movie_utils
from tqdm import tqdm, trange
import numpy as np
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random
from sklearn.impute import KNNImputer

%load_ext autoreload
%autoreload 2

In [11]:
# data from: https://grouplens.org/datasets/movielens/

files = {}

with zipfile.ZipFile("ml-32m.zip", 'r') as zip:
    zip_contents = zip.namelist()
    for file_name in zip_contents:
        if file_name.endswith('.csv'):
            print("Downloading " + file_name + "...")
            with zip.open(file_name) as file:
                df = pd.read_csv(file)
                files[file_name[7:-4]] = df

Downloading ml-32m/tags.csv...
Downloading ml-32m/links.csv...
Downloading ml-32m/ratings.csv...
Downloading ml-32m/movies.csv...


In [12]:
files["ratings"] = files["ratings"][["userId", "movieId", "rating"]]
files["ratings"].head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [13]:
files["movies"] = files["movies"].merge(files["links"], left_on="movieId", right_on="movieId", how="inner")
files["movies"].head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [14]:
movies = {}

for movie in files["movies"].iterrows():
    movie = movie[1]
    movies[movie.movieId] = movie_utils.Movie(movie.movieId, movie.tmdbId, movie.title, movie.genres)

In [76]:
popular_movies = files["ratings"][["movieId", "userId"]].groupby("movieId").count()
popular_movies = popular_movies.sort_values(by=["userId"], ascending=False)
popular_movies = popular_movies.index.values.tolist()

num_movies = 1000
top_popular_movies = popular_movies[:num_movies]
top_popular_movies_set = set(popular_movies[:num_movies])

In [None]:
total = len(files["ratings"])
ratings = np.empty((total, 3), dtype=np.float32)

i = 0
for rating in tqdm(files["ratings"].iterrows(), total=total):
    if rating[1].movieId in top_popular_movies_set:
        ratings[i, 0] = rating[1].userId
        ratings[i, 1] = rating[1].movieId
        ratings[i, 2] = rating[1].rating
        i += 1

ratings = ratings[:i]
np.savez_compressed("ratings.npz", ratings=ratings)

In [2]:
ratings = np.load("ratings.npz")["ratings"]

In [3]:
ratings_df = pd.DataFrame(ratings, columns=["userId", "movieId", "rating"], index=[i for i in range(len(ratings))])
ratings_df["userId"] = ratings_df["userId"].astype(int)
ratings_df["movieId"] = ratings_df["movieId"].astype(int)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,32,5.0
4,1,34,2.0


In [4]:
ratings_df = ratings_df.pivot_table(index="userId", columns="movieId", values="rating")
ratings_df

movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,3.5,,,,,4.0,4.0,,5.0,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,4.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200944,4.0,,,,,,,,,,...,,,,,,,,,,
200945,,,,,,,,,,,...,,,0.5,,,,,,,
200946,,,,,4.0,,5.0,5.0,,4.0,...,,,,,,,,,,
200947,4.0,,,,,,,,,,...,,,,,,,,,,


In [5]:
def isvt(X, r, max_iter=500, epsilon=1e-2):
    # Create a mask for the known entries in X
    mask = X > 0  
    prevM = np.zeros(X.shape)
    prevM[mask] = X[mask]

    progress_bar = trange(max_iter)
    
    for _ in progress_bar:
        currM = np.copy(prevM)
        
        # Perform Singular Value Decomposition
        U, S, Vt = svds(currM, k=r) # Threshold by only calculating the top r singular values
        currM = U @ np.diag(S) @ Vt

        # Fill in known values in currM from X
        currM[mask] = X[mask]
        
        # Check for convergence
        norm = np.linalg.norm(currM - prevM, ord='fro') / np.linalg.norm(prevM, ord='fro')
        if norm < epsilon:
            break
            
        progress_bar.set_description(f"[{r=:}, {norm=:.3e}]")
        
        prevM = currM
    
    return currM

In [6]:
def train_test_split(df, p):
    # Flatten the DataFrame to get all values with their index positions
    all_values = [(i, j, value) for i, row in df.iterrows() for j, value in row.items() if value > 0]

    # Calculate the number of values to pick based on the percentage
    num_values_to_pick = int(p * len(all_values))

    # Randomly pick the specified number of values
    test_values = random.sample(all_values, num_values_to_pick)

   # Create a copy of the DataFrame to modify
    new_df = df.copy()

    # Replace the picked values in the new DataFrame with 0
    for i, j, _ in test_values:
        new_df.at[i, j] = 0

    return new_df, test_values

def isvt_with_score(X, r, test_proportion=0.1, max_iter=500, epsilon=1e-2):
    # create train / test split 
    train_df, test_values = train_test_split(X, test_proportion)

    # run isvt algorithm with given rank
    M = isvt(train_df.to_numpy(), r=r, max_iter=max_iter, epsilon=epsilon)
    M = pd.DataFrame(M, columns=train_df.columns, index=train_df.index)

    # compare predicted values with expected ones
    scores = np.array([abs(rating - M.at[i, j]) for i, j, rating in test_values])
    avg_error = scores.mean()

    return (M, avg_error)

def cross_val_isvt(X, r_list, test_proportion=0.1, plot=True, max_iter=500, epsilon=1e-2):
    # calculate the error for each rank in r_list
    errors = [] 
    for r in r_list:
        errors.append(isvt_with_score(X, r, test_proportion, max_iter, epsilon))

    if plot:
        plt.plot(r_list, [err for _, err in errors])
        plt.xlabel("rank")
        plt.ylabel("error")

    # find the rank with the lowest error
    best = np.argmin(np.array([err for _, err in errors]))
    return (errors[best][0], r_list[best])

In [7]:
num_users = 50000
X = ratings_df[:num_users]

In [None]:
bestM, rank = cross_val_isvt(X, r_list=[2**n for n in range(1,7)])
print(f"Rank: {rank}")

In [8]:
M, error = isvt_with_score(X, 8, epsilon=1e-3)

[r=8, norm=1.000e-03]:  39%|███▉      | 195/500 [05:36<08:45,  1.72s/it]


In [90]:
print(f"error: {(error / 5) * 100:.2f}%")
M

error: 12.41%


movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.130706,1.863406,1.175800,0.815297,2.061491,1.424093,2.088346,1.522357,2.208290,4.000000,...,1.175213,0.500218,0.733015,1.479356,1.161857,0.630457,1.660773,1.098740,0.622368,0.866052
2,4.211290,4.112750,4.351093,4.971708,2.563554,5.423521,3.185113,5.310857,2.149869,5.724060,...,0.709438,0.671806,0.239584,-0.228929,1.470699,1.005146,0.788042,0.504890,0.751441,0.805097
3,3.851054,3.251562,3.482663,3.648247,3.257053,4.102841,4.000000,4.000000,2.912268,5.000000,...,2.080744,1.653356,1.910117,1.609810,2.307015,1.897915,2.110566,1.589057,1.695066,2.091212
4,2.859661,1.656752,0.837765,0.214008,1.930666,0.569570,2.244098,0.834630,1.907460,1.149426,...,-1.123324,-2.119746,-1.795576,-0.956522,-1.480006,-1.596748,-1.038560,-1.840344,-1.800006,-1.732436
5,3.130386,3.624445,3.507761,3.352712,2.701114,3.207951,3.550918,3.266031,2.349531,2.299260,...,0.572719,0.246386,-0.299459,-0.148603,0.228846,0.976009,0.345610,-0.224882,0.438151,0.082734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50009,2.739007,1.500000,2.029299,2.437736,3.080064,3.047748,1.616944,3.500000,3.083932,4.448437,...,2.614447,2.888768,3.487162,2.895699,3.290470,2.191859,2.778404,3.674355,3.113370,3.347174
50010,3.559957,3.294691,4.069877,4.612254,1.903236,5.000000,2.968272,5.211252,1.250889,5.000000,...,0.756442,0.046631,0.192825,-0.569131,1.117844,0.741501,0.635425,-0.849238,-0.244576,0.478183
50011,2.636208,2.479269,0.223092,0.117839,2.736114,-0.382033,2.465216,0.013066,2.327044,-0.379092,...,3.500000,3.681092,3.500000,3.500000,3.135074,3.660047,3.527604,3.796715,3.859203,3.644349
50012,3.526736,2.329613,-0.870057,-0.648606,3.151183,-0.978874,2.205343,-0.209917,2.381614,-0.212676,...,2.912173,2.956750,2.993579,2.753951,2.646347,2.731388,2.624693,3.093204,3.283033,3.204254


In [53]:
user_df = pd.DataFrame(columns=M.columns, index=[0], data=np.zeros(len(M.columns)).reshape(1, 1000))
# rate movies
user_df.at[0,122906] = 5
user_df.at[0,122914] = 5
user_df.at[0,122920] = 5
user_df.at[0,122926] = 5
user_df.at[0,195159] = 5
user_df.at[0,122918] = 5
user_df.at[0,95510] = 5
user_df.at[0,106072] = 5
user_df.at[0,122916] = 5

In [54]:
combinedM = np.vstack([M.to_numpy(), user_df.to_numpy()])
imputer = KNNImputer(n_neighbors=10, missing_values=0)
recommendations = imputer.fit_transform(combinedM)[-1].reshape(1, num_movies)

In [55]:
recommendations_df = pd.DataFrame(recommendations, columns=M.columns, index=[0])
sorted_recommendations = recommendations_df.loc[0].sort_values(ascending=False)
for i in sorted_recommendations.index[:20]:
    print(i, movies[i].title, movies[i].genres, sorted_recommendations[i])

122912 Avengers: Infinity War - Part I  ['Action', 'Adventure', 'Sci-Fi'] 5.057604154086011
110102 Captain America: The Winter Soldier  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.053035261457986
59315 Iron Man  ['Action', 'Adventure', 'Sci-Fi'] 5.048294226474329
122892 Avengers: Age of Ultron  ['Action', 'Adventure', 'Sci-Fi'] 5.039227873961392
89745 Avengers, The  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.036333050756077
77561 Iron Man 2  ['Action', 'Adventure', 'Sci-Fi', 'Thriller', 'IMAX'] 5.015493599512755
106489 Hobbit: The Desolation of Smaug, The  ['Adventure', 'Fantasy', 'IMAX'] 5.007357685654968
112852 Guardians of the Galaxy  ['Action', 'Adventure', 'Sci-Fi'] 5.00588097686448
88140 Captain America: The First Avenger  ['Action', 'Adventure', 'Sci-Fi', 'Thriller', 'War'] 5.000749512857931
86332 Thor  ['Action', 'Adventure', 'Drama', 'Fantasy', 'IMAX'] 5.00051913418666
95510 Amazing Spider-Man, The  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.0
122906 Black Panther  ['Action'

In [77]:
genres = ["Action", 
          "Adventure", 
          "Animation", 
          "Children", 
          "Comedy", 
          "Crime", 
          "Documentary", 
          "Drama", 
          "Fantasy", 
          "Film-Noir", 
          "Horror", 
          "Musical", 
          "Mystery", 
          "Romance", 
          "Sci-Fi", 
          "Thriller", 
          "War", 
          "Western",
          "IMAX"]

movie_by_genre = {}

for genre in genres:
    movie_by_genre[genre] = []

for movieId in top_popular_movies:
    for genre in movies[movieId].genres:
        movie_by_genre[genre].append(movies[movieId])
    

In [95]:
movie_by_genre["Action"][:5]

[Matrix, The 	1999
 Action | Sci-Fi | Thriller,
 Star Wars: Episode IV - A New Hope 	1977
 Action | Adventure | Sci-Fi,
 Fight Club 	1999
 Action | Crime | Drama | Thriller,
 Jurassic Park 	1993
 Action | Adventure | Sci-Fi | Thriller,
 Star Wars: Episode V - The Empire Strikes Back 	1980
 Action | Adventure | Sci-Fi]

In [93]:
genres_picked_by_user = ["Adventure", "Horror", "Sci-Fi"]
movies_to_rate = [movie_by_genre[genre][:5] for genre in genres_picked_by_user]
movies_to_rate

[[Star Wars: Episode IV - A New Hope 	1977
  Action | Adventure | Sci-Fi,
  Jurassic Park 	1993
  Action | Adventure | Sci-Fi | Thriller,
  Lord of the Rings: The Fellowship of the Ring, The 	2001
  Adventure | Fantasy,
  Star Wars: Episode V - The Empire Strikes Back 	1980
  Action | Adventure | Sci-Fi,
  Toy Story 	1995
  Adventure | Animation | Children | Comedy | Fantasy],
 [Silence of the Lambs, The 	1991
  Crime | Horror | Thriller,
  Sixth Sense, The 	1999
  Drama | Horror | Mystery,
  Alien 	1979
  Horror | Sci-Fi,
  Aliens 	1986
  Action | Adventure | Horror | Sci-Fi,
  Shining, The 	1980
  Horror],
 [Matrix, The 	1999
  Action | Sci-Fi | Thriller,
  Star Wars: Episode IV - A New Hope 	1977
  Action | Adventure | Sci-Fi,
  Jurassic Park 	1993
  Action | Adventure | Sci-Fi | Thriller,
  Star Wars: Episode V - The Empire Strikes Back 	1980
  Action | Adventure | Sci-Fi,
  Terminator 2: Judgment Day 	1991
  Action | Sci-Fi]]