In [1]:
import zipfile
import pandas as pd
import movie_utils
from tqdm import tqdm, trange
import numpy as np
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import random
from sklearn.impute import KNNImputer
import pickle
import gzip
import random

%load_ext autoreload
%autoreload 2

In [2]:
# data from: https://grouplens.org/datasets/movielens/
# Read every CSV inside the archive into a DataFrame, keyed by the file's base
# name without its extension (e.g. "ml-32m/ratings.csv" -> "ratings").

files = {}

# Bind the archive to `archive`, not `zip`: the old name shadowed the builtin
# zip() for every later cell in the kernel.
with zipfile.ZipFile("ml-32m.zip", 'r') as archive:
    for file_name in archive.namelist():
        if file_name.endswith('.csv'):
            print("Downloading " + file_name + "...")
            with archive.open(file_name) as file:
                # Derive the key from the path itself instead of a fixed
                # [7:-4] slice, which only worked for a 7-character folder prefix.
                key = file_name.rsplit("/", 1)[-1][:-len(".csv")]
                files[key] = pd.read_csv(file)

Downloading ml-32m/tags.csv...
Downloading ml-32m/links.csv...
Downloading ml-32m/ratings.csv...
Downloading ml-32m/movies.csv...


In [3]:
# Keep only the three columns used downstream and preview the first rows.
rating_columns = ["userId", "movieId", "rating"]
ratings_df_all = files["ratings"][rating_columns]
ratings_df_all.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [54]:
# Write a lookup table indexed by tmdbId (with the matching movieId) to disk.
id_pairs = files["links"][["movieId", "tmdbId"]]
links = id_pairs.set_index("tmdbId")
links.to_csv("id_conversion_chart.csv")

In [4]:
# Inner-join movies.csv with links.csv on their shared movieId column.
movies_df_all = pd.merge(files["movies"], files["links"], on="movieId", how="inner")
movies_df_all.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [5]:
# One movie_utils.Movie per catalogue row, keyed by the row's movieId.
all_movies = {
    row.movieId: movie_utils.Movie(row.movieId, row.tmdbId, row.title, row.genres)
    for _, row in movies_df_all.iterrows()
}

In [39]:
# Count ratings per movie and order the movie ids from most to least rated.
rating_counts = files["ratings"][["movieId", "userId"]].groupby("movieId").count()
popular_movies = rating_counts.sort_values(by=["userId"], ascending=False).index.values.tolist()

# Keep the num_movies most-rated ids: as a list (also saved to disk) and as a
# set for fast membership tests.
num_movies = 1000
top_popular_movies = popular_movies[:num_movies]
np.savez("popular_movies.npz", top_popular_movies=top_popular_movies)
top_popular_movies_set = set(top_popular_movies)

In [7]:
# get all attributes for the top 1000 popular movies

movies = {}
total = len(movies_df_all)

for _, movie in tqdm(movies_df_all.iterrows(), total=total):
    if movie.movieId not in top_popular_movies_set:
        continue
    # NOTE(review): loadAllAttributes lives in movie_utils; judging by the
    # recorded ~17 minute runtime it is the expensive step -- confirm what it fetches.
    popular_movie = all_movies[movie.movieId]
    popular_movie.loadAllAttributes()
    movies[movie.movieId] = popular_movie

# Pickle the collected objects and gzip them into a file whose name ends in
# the first 8 characters of a random float.
serialized_data = pickle.dumps(movies)
archive_name = "movie_objects" + str(random.random())[:8]

with gzip.open(archive_name, 'wb') as f:
    f.write(serialized_data)

100%|██████████| 87585/87585 [16:58<00:00, 86.03it/s]   


In [None]:
# get the ratings for the top 1000 popular movies

# Vectorised membership test: this keeps exactly the rows the old
# row-by-row iterrows() loop kept, in the same order, without executing
# Python code once per rating.
popular_mask = ratings_df_all["movieId"].isin(top_popular_movies_set)

# Stored as float32 columns (userId, movieId, rating), same as before.
# float32 represents integers exactly only up to 2**24; the ids displayed in
# this notebook are far below that.
ratings = ratings_df_all.loc[popular_mask, ["userId", "movieId", "rating"]].to_numpy(dtype=np.float32)

np.savez_compressed("ratings.npz", ratings=ratings)

In [9]:
# Read the cached ratings array back from disk. The context manager closes
# the .npz archive as soon as the array has been extracted.
with np.load("ratings.npz") as ratings_archive:
    ratings = ratings_archive["ratings"]

In [10]:
# Rebuild a labelled frame from the cached array; the two id columns are cast
# back to integers.
row_labels = list(range(len(ratings)))
ratings_df = pd.DataFrame(ratings, columns=["userId", "movieId", "rating"], index=row_labels)
ratings_df = ratings_df.astype({"userId": int, "movieId": int})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,32,5.0
4,1,34,2.0


In [11]:
# Reshape the long (userId, movieId, rating) table into a user x movie matrix;
# user/movie pairs without a rating show up as NaN.
# Note: this rebinds ratings_df from the long table to the pivoted matrix.
ratings_df = pd.pivot_table(ratings_df, index="userId", columns="movieId", values="rating")
ratings_df

movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,3.5,,,,,4.0,4.0,,5.0,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,4.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200944,4.0,,,,,,,,,,...,,,,,,,,,,
200945,,,,,,,,,,,...,,,0.5,,,,,,,
200946,,,,,4.0,,5.0,5.0,,4.0,...,,,,,,,,,,
200947,4.0,,,,,,,,,,...,,,,,,,,,,


In [12]:
def isvt(X, r, max_iter=500, epsilon=1e-2):
    """Complete a partially observed matrix by iterated rank-r SVD truncation.

    Entries of ``X`` that are ``> 0`` are treated as observed; everything else
    (zeros, negatives, NaN) is treated as missing and starts at 0. Each
    iteration replaces the current estimate with its rank-``r`` truncated SVD
    and then writes the observed entries back. Iteration stops once the
    relative Frobenius-norm change falls below ``epsilon`` or after
    ``max_iter`` iterations.

    Parameters
    ----------
    X : array-like, 2-D
        Matrix with observed entries > 0.
    r : int
        Number of singular values kept; passed to ``svds`` as ``k`` (scipy's
        default solver requires ``1 <= k < min(X.shape)``).
    max_iter : int
        Upper bound on the number of iterations.
    epsilon : float
        Convergence threshold on the relative change between iterates.

    Returns
    -------
    numpy.ndarray
        Completed matrix with the same shape as ``X``; observed entries are
        reproduced exactly.
    """
    X = np.asarray(X)

    # Create a mask for the known entries in X
    mask = X > 0
    prevM = np.zeros(X.shape)
    prevM[mask] = X[mask]

    # With no observed entries there is nothing to complete, and the relative
    # change below would be 0/0.
    if not mask.any():
        return prevM

    # Bound before the loop so max_iter == 0 returns the initial fill instead
    # of raising UnboundLocalError.
    currM = prevM

    progress_bar = trange(max_iter)

    for _ in progress_bar:
        # Threshold by only calculating the top r singular values. svds does
        # not modify its input, so no defensive copy is needed.
        U, S, Vt = svds(prevM, k=r)
        # Scaling U's columns by S equals U @ diag(S) without building the
        # dense diagonal matrix.
        currM = (U * S) @ Vt

        # Fill in known values in currM from X
        currM[mask] = X[mask]

        # Check for convergence; prevM always holds the observed (> 0)
        # entries, so its norm is non-zero here.
        norm = np.linalg.norm(currM - prevM, ord='fro') / np.linalg.norm(prevM, ord='fro')

        # Update the bar before testing so the final, converged norm is shown.
        progress_bar.set_description(f"[{r=:}, {norm=:.3e}]")

        if norm < epsilon:
            break

        prevM = currM

    return currM

In [13]:
def train_test_split(df, p, seed=None):
    """Hold out a random fraction of the known (> 0) entries of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric matrix; entries > 0 are the known values, anything else
        (0, negative, NaN) is ignored.
    p : float
        Fraction of the known entries to hold out; ``int(p * n_known)`` entries
        are picked.
    seed : int, optional
        When given, the split is drawn from ``random.Random(seed)`` and is
        reproducible. When omitted, the global ``random`` state is used as
        before.

    Returns
    -------
    (new_df, test_values)
        ``new_df`` is a copy of ``df`` with the held-out cells set to 0;
        ``test_values`` is a list of ``(row_label, column_label, value)``
        tuples for those cells. ``df`` itself is not modified.
    """
    values = df.to_numpy()

    # Positions of all known entries in row-major order (the same order the
    # previous nested iterrows()/items() loop visited them).
    known_rows, known_cols = np.nonzero(values > 0)

    # Calculate the number of values to pick based on the percentage
    num_values_to_pick = int(p * len(known_rows))

    # Randomly pick the specified number of known entries, by position
    rng = random if seed is None else random.Random(seed)
    picked = np.asarray(rng.sample(range(len(known_rows)), num_values_to_pick), dtype=np.intp)
    test_rows = known_rows[picked]
    test_cols = known_cols[picked]

    # Record the held-out values before zeroing them
    held_out = values[test_rows, test_cols].tolist()
    row_labels = df.index[test_rows].tolist()
    col_labels = df.columns[test_cols].tolist()
    test_values = [(row_labels[k], col_labels[k], held_out[k]) for k in range(len(held_out))]

    # Zero the picked cells on a copy. Positional assignment is a single
    # vectorised write and is unaffected by duplicate labels.
    train_values = values.copy()
    train_values[test_rows, test_cols] = 0
    new_df = pd.DataFrame(train_values, index=df.index, columns=df.columns)

    return new_df, test_values

def isvt_with_score(X, r, test_proportion=0.1, max_iter=500, epsilon=1e-2):
    """Run isvt on a train split of ``X`` and score it on the held-out entries.

    A ``test_proportion`` share of the known entries is held out with
    ``train_test_split``, the remainder is completed with ``isvt`` at rank
    ``r``, and the mean absolute difference between held-out ratings and the
    completed values is computed.

    Returns a ``(M, avg_error)`` tuple where ``M`` is the completed matrix as
    a DataFrame carrying the index and columns of the train split.
    """
    train_df, test_values = train_test_split(X, test_proportion)

    completed = isvt(train_df.to_numpy(), r=r, max_iter=max_iter, epsilon=epsilon)
    M = pd.DataFrame(completed, columns=train_df.columns, index=train_df.index)

    # Absolute error of each held-out rating against its reconstructed value
    abs_errors = []
    for i, j, rating in test_values:
        abs_errors.append(abs(rating - M.at[i, j]))
    avg_error = np.array(abs_errors).mean()

    return (M, avg_error)

def cross_val_isvt(X, r_list, test_proportion=0.1, plot=True, max_iter=500, epsilon=1e-2):
    """Pick the rank in ``r_list`` with the lowest held-out error.

    ``isvt_with_score`` is run once per rank. Only the best completed matrix
    seen so far is retained, so memory use does not grow with the number of
    ranks tried.

    Parameters
    ----------
    X : pandas.DataFrame
        Ratings matrix handed to ``isvt_with_score``.
    r_list : iterable of int
        Candidate ranks; must not be empty.
    test_proportion, max_iter, epsilon
        Forwarded to ``isvt_with_score``.
    plot : bool
        When true, plot error against rank on the current matplotlib axes.

    Returns
    -------
    (best_M, best_rank)
        Completed matrix and rank of the lowest-error run. On ties the
        earliest rank wins; a NaN error never beats a finite one.

    Raises
    ------
    ValueError
        If ``r_list`` is empty.
    """
    r_list = list(r_list)
    if not r_list:
        raise ValueError("r_list must contain at least one rank")

    # calculate the error for each rank in r_list
    errors = []
    best_idx = None
    best_M = None
    for idx, r in enumerate(r_list):
        M, err = isvt_with_score(X, r, test_proportion, max_iter, epsilon)
        errors.append(err)

        if best_idx is None:
            improved = True
        else:
            best_err = errors[best_idx]
            improved = err < best_err or (np.isnan(best_err) and not np.isnan(err))
        if improved:
            best_idx, best_M = idx, M

        # Drop the reference so a non-best matrix can be freed before the
        # next rank is fitted.
        M = None

    if plot:
        plt.plot(r_list, errors)
        plt.xlabel("rank")
        plt.ylabel("error")

    return (best_M, r_list[best_idx])

In [14]:
# Work on the first num_users rows of the user x movie matrix.
num_users = 50000
X = ratings_df.iloc[:num_users]

In [None]:
# Try ranks 2, 4, 8, 16, 32 and 64 and report the one with the lowest error.
candidate_ranks = [2 ** n for n in range(1, 7)]
bestM, rank = cross_val_isvt(X, r_list=candidate_ranks)
print(f"Rank: {rank}")

In [15]:
# Fit at rank 8 with a tighter convergence threshold than the default.
M, error = isvt_with_score(X, r=8, epsilon=1e-3)

[r=8, norm=1.005e-03]:  39%|███▉      | 194/500 [06:06<09:38,  1.89s/it]


In [16]:
# Express the mean absolute held-out error relative to 5, as a percentage.
error_pct = error / 5 * 100
print(f"error: {error_pct:.2f}%")
M

error: 12.44%


movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.985022,1.454468,0.606226,0.437546,2.465173,0.983519,1.791573,1.339391,2.479574,4.000000,...,1.445111,1.018099,1.505868,1.777219,1.346961,0.852462,1.666533,1.658514,1.244949,1.461946
2,4.176921,4.122690,4.489601,5.107315,2.527633,5.452974,3.131732,5.372259,2.205279,5.688760,...,0.308722,0.416266,-0.106776,-0.585876,1.221840,0.535243,0.474621,0.314362,0.389008,0.543215
3,3.808658,3.500000,3.499225,3.631146,2.832594,4.018197,4.000000,4.000000,2.429734,5.000000,...,1.598450,1.071452,1.256375,0.903713,1.752078,1.497791,1.582624,0.810243,1.050864,1.516350
4,2.974515,1.994780,0.897695,0.421673,1.702462,0.508589,2.408618,0.806692,1.522303,0.931483,...,-1.882536,-2.870765,-2.657805,-1.820206,-2.100875,-2.184592,-1.964554,-2.592997,-2.503370,-2.360141
5,3.095480,3.614925,3.702545,3.561378,2.768340,3.420135,3.545243,3.512977,2.409829,2.444453,...,0.409806,0.128775,-0.401505,-0.369330,0.113453,0.654098,0.178681,-0.355484,0.198432,-0.030628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50009,2.693937,1.500000,2.205066,2.561432,3.027944,3.224990,1.651651,3.500000,3.162321,4.506894,...,2.820010,3.168831,3.691348,3.129288,3.463420,2.463532,3.086079,3.956321,3.376890,3.578289
50010,3.864105,3.512285,4.112904,4.728486,1.644818,5.000000,2.973727,5.340085,0.985782,5.000000,...,-0.040810,-0.661745,-0.528495,-1.508528,0.559261,-0.029599,-0.143490,-1.504654,-0.945590,-0.130730
50011,2.856903,2.698948,0.232576,0.060326,2.826851,-0.454098,2.703723,-0.077098,2.303280,-0.431207,...,3.640289,3.641647,3.500000,3.516315,3.119503,3.682616,3.606480,3.686069,3.809305,3.585765
50012,3.385530,2.196609,-0.647633,-0.492729,3.207262,-0.769499,2.159226,-0.019220,2.502996,0.188484,...,2.846425,2.982817,3.081327,2.743699,2.624171,2.592957,2.569947,3.250893,3.376716,3.248194


In [21]:
# Persist the completed user x movie matrix; it is read back from this file
# later in the notebook.
M.to_csv("ratings_matrix.csv")

In [57]:
# Single-row frame for a new user, aligned with M's movie columns; 0 means
# "not rated". The row is sized from M itself instead of a hard-coded 1000,
# so it stays valid if the number of movies changes.
user_df = pd.DataFrame(np.zeros((1, len(M.columns))), columns=M.columns, index=[0])

# rate movies
five_star_movie_ids = [
    122906, 122914, 122920, 122926, 195159,
    122918, 95510, 106072, 122916,
]
for movie_id in five_star_movie_ids:
    user_df.at[0, movie_id] = 5
user_df

movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0


In [29]:
# Stack the new user under the completed matrix and run KNN imputation with
# 0 declared as the missing-value marker; the last row is the new user's.
combinedM = np.vstack([M.to_numpy(), user_df.to_numpy()])
imputer = KNNImputer(n_neighbors=10, missing_values=0)
imputed = imputer.fit_transform(combinedM)
recommendations = imputed[-1].reshape(1, num_movies)

In [34]:
# Label the imputed row with movie ids, sort from highest to lowest score and
# print the 20 best.
recommendations_df = pd.DataFrame(recommendations, columns=M.columns, index=[0])
sorted_recommendations = recommendations_df.loc[0].sort_values(ascending=False)
for movie_id in sorted_recommendations.index[:20]:
    recommended = all_movies[movie_id]
    print(movie_id, recommended.title, recommended.genres, sorted_recommendations[movie_id])

106489 Hobbit: The Desolation of Smaug, The  ['Adventure', 'Fantasy', 'IMAX'] 5.102559982855476
122912 Avengers: Infinity War - Part I  ['Action', 'Adventure', 'Sci-Fi'] 5.0876489702200995
89745 Avengers, The  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.081081122924502
59315 Iron Man  ['Action', 'Adventure', 'Sci-Fi'] 5.059181644279595
122892 Avengers: Age of Ultron  ['Action', 'Adventure', 'Sci-Fi'] 5.050888861209714
118696 The Hobbit: The Battle of the Five Armies  ['Adventure', 'Fantasy'] 5.045490060376737
112852 Guardians of the Galaxy  ['Action', 'Adventure', 'Sci-Fi'] 5.031927845221171
86332 Thor  ['Action', 'Adventure', 'Drama', 'Fantasy', 'IMAX'] 5.010437822565402
110102 Captain America: The Winter Soldier  ['Action', 'Adventure', 'Sci-Fi', 'IMAX'] 5.008595496721936
122900 Ant-Man  ['Action', 'Adventure', 'Sci-Fi'] 5.0
122906 Black Panther  ['Action', 'Adventure', 'Sci-Fi'] 5.0
106072 Thor: The Dark World  ['Action', 'Adventure', 'Fantasy', 'IMAX'] 5.0
122914 Avengers: Infinity

In [31]:
# Genre labels used to bucket the popular movies.
genres = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western", "IMAX",
]

# Start every genre with an empty bucket.
movie_by_genre = {genre: [] for genre in genres}

# Walk the popular movies in popularity order so each bucket keeps that order.
for movieId in top_popular_movies:
    popular_movie = all_movies[movieId]
    for genre in popular_movie.genres:
        movie_by_genre[genre].append(popular_movie)
    

In [32]:
# Preview the first five entries of the "Action" bucket.
movie_by_genre["Action"][:5]

[Matrix, The 	1999
 Action | Sci-Fi | Thriller,
 Star Wars: Episode IV - A New Hope 	1977
 Action | Adventure | Sci-Fi,
 Fight Club 	1999
 Action | Crime | Drama | Thriller,
 Jurassic Park 	1993
 Action | Adventure | Sci-Fi | Thriller,
 Star Wars: Episode V - The Empire Strikes Back 	1980
 Action | Adventure | Sci-Fi]

In [33]:
# For each genre the user picked, take the first five movies of its bucket.
genres_picked_by_user = ["Adventure", "Horror", "Sci-Fi"]
movies_to_rate = []
for genre in genres_picked_by_user:
    movies_to_rate.append(movie_by_genre[genre][:5])
movies_to_rate

[[Star Wars: Episode IV - A New Hope 	1977
  Action | Adventure | Sci-Fi,
  Jurassic Park 	1993
  Action | Adventure | Sci-Fi | Thriller,
  Lord of the Rings: The Fellowship of the Ring, The 	2001
  Adventure | Fantasy,
  Star Wars: Episode V - The Empire Strikes Back 	1980
  Action | Adventure | Sci-Fi,
  Toy Story 	1995
  Adventure | Animation | Children | Comedy | Fantasy],
 [Silence of the Lambs, The 	1991
  Crime | Horror | Thriller,
  Sixth Sense, The 	1999
  Drama | Horror | Mystery,
  Alien 	1979
  Horror | Sci-Fi,
  Aliens 	1986
  Action | Adventure | Horror | Sci-Fi,
  Shining, The 	1980
  Horror],
 [Matrix, The 	1999
  Action | Sci-Fi | Thriller,
  Star Wars: Episode IV - A New Hope 	1977
  Action | Adventure | Sci-Fi,
  Jurassic Park 	1993
  Action | Adventure | Sci-Fi | Thriller,
  Star Wars: Episode V - The Empire Strikes Back 	1980
  Action | Adventure | Sci-Fi,
  Terminator 2: Judgment Day 	1991
  Action | Sci-Fi]]

In [71]:
# Reload the saved matrix from disk and use its userId column as the index.
ratings_matrix = pd.read_csv("ratings_matrix.csv").set_index("userId")

In [72]:
# Display the reloaded matrix.
ratings_matrix

Unnamed: 0_level_0,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.985022,1.454468,0.606226,0.437546,2.465173,0.983519,1.791573,1.339391,2.479574,4.000000,...,1.445111,1.018099,1.505868,1.777219,1.346961,0.852462,1.666533,1.658514,1.244949,1.461946
2,4.176921,4.122690,4.489601,5.107315,2.527633,5.452974,3.131732,5.372259,2.205279,5.688760,...,0.308722,0.416266,-0.106776,-0.585876,1.221840,0.535243,0.474621,0.314362,0.389008,0.543215
3,3.808658,3.500000,3.499225,3.631146,2.832594,4.018197,4.000000,4.000000,2.429734,5.000000,...,1.598450,1.071452,1.256375,0.903713,1.752078,1.497791,1.582624,0.810243,1.050864,1.516350
4,2.974515,1.994780,0.897695,0.421673,1.702462,0.508589,2.408618,0.806692,1.522303,0.931483,...,-1.882536,-2.870765,-2.657805,-1.820206,-2.100875,-2.184592,-1.964554,-2.592997,-2.503370,-2.360141
5,3.095480,3.614925,3.702545,3.561378,2.768340,3.420135,3.545243,3.512977,2.409829,2.444453,...,0.409806,0.128775,-0.401505,-0.369330,0.113453,0.654098,0.178681,-0.355484,0.198432,-0.030628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50009,2.693937,1.500000,2.205066,2.561432,3.027944,3.224990,1.651651,3.500000,3.162321,4.506894,...,2.820010,3.168831,3.691348,3.129288,3.463420,2.463532,3.086079,3.956321,3.376890,3.578289
50010,3.864105,3.512285,4.112904,4.728486,1.644818,5.000000,2.973727,5.340085,0.985782,5.000000,...,-0.040810,-0.661745,-0.528495,-1.508528,0.559261,-0.029599,-0.143490,-1.504654,-0.945590,-0.130730
50011,2.856903,2.698948,0.232576,0.060326,2.826851,-0.454098,2.703723,-0.077098,2.303280,-0.431207,...,3.640289,3.641647,3.500000,3.516315,3.119503,3.682616,3.606480,3.686069,3.809305,3.585765
50012,3.385530,2.196609,-0.647633,-0.492729,3.207262,-0.769499,2.159226,-0.019220,2.502996,0.188484,...,2.846425,2.982817,3.081327,2.743699,2.624171,2.592957,2.569947,3.250893,3.376716,3.248194


In [59]:
# Reload the id lookup table from disk, indexed by tmdbId.
ids = pd.read_csv("id_conversion_chart.csv").set_index("tmdbId")

In [60]:
def __convertId(tmbdId):
    """Return the ``movieId`` stored under the given tmdbId label in the
    module-level ``ids`` table.

    NOTE(review): the lookup uses ``ids.loc``, so an id that occurs more than
    once in the index would yield several rows rather than one value --
    confirm that tmdbId is unique in the lookup table.
    """
    matching_row = ids.loc[tmbdId]
    return matching_row.movieId


def __predictMovies(M, user_df):
    """Rank M's movie columns for a new user via KNN imputation.

    ``user_df`` is stacked under ``M`` as the last row and a ``KNNImputer``
    (10 neighbours, 0 declared as the missing-value marker) is fitted on the
    combined matrix. The imputed last row is the new user's score per movie.

    Parameters
    ----------
    M : pandas.DataFrame
        Completed user x movie matrix.
    user_df : pandas.DataFrame
        Single-row frame for the new user. The rows are stacked by position,
        not by label, so its columns must be in the same order as ``M``'s.

    Returns
    -------
    pandas.Index
        ``M``'s column labels ordered from highest to lowest score.
    """
    combinedM = np.vstack([M.to_numpy(), user_df.to_numpy()])
    imputer = KNNImputer(n_neighbors=10, missing_values=0)
    user_scores = imputer.fit_transform(combinedM)[-1]

    # Label the scores with M's own columns. The row used to be reshaped with
    # len(movies.keys()), tying the result to an unrelated global.
    sorted_recommendations = pd.Series(user_scores, index=M.columns).sort_values(ascending=False)
    return sorted_recommendations.index

In [63]:
# Parse a "tmdbId:rating,tmdbId:rating" string into a list of (id, rating)
# string pairs.
user_ratings = "558:1,672:2"
ratings = []
for pair in user_ratings.split(","):
    parts = pair.split(":")
    ratings.append((parts[0], parts[1]))

ratings

[('558', '1'), ('672', '2')]

In [107]:
# Single-row frame for the requesting user, aligned with ratings_matrix.
# Column labels come back from CSV as strings, so they are cast to int. The
# row is sized from the matrix itself rather than a hard-coded 1000.
movie_columns = ratings_matrix.columns.astype(int)
user_df = pd.DataFrame(np.zeros((1, len(movie_columns))), columns=movie_columns, index=[0])

for movieId, rating in ratings:
    column = __convertId(int(movieId))
    # Assigning to an unknown label would silently append a column and break
    # the alignment with ratings_matrix, so fail early instead.
    if column not in user_df.columns:
        raise KeyError(f"movieId {column} (tmdbId {movieId}) is not a column of ratings_matrix")
    # The parsed ratings are strings; store them as numbers so the frame
    # stays numeric.
    user_df.at[0, column] = float(rating)

user_df

Unnamed: 0,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# Rank the matrix's movie columns for the user row built above.
predictions = __predictMovies(ratings_matrix, user_df)

In [109]:
# Display the ranked column labels returned by __predictMovies.
predictions

Index(['4973', '296', '7361', '1089', '50', '318', '778', '608', '215', '527',
       ...
       '72378', '315', '762', '173', '6157', '1562', '1831', '204', '2701',
       '1882'],
      dtype='object', length=1000)

In [115]:
# Convert the first 20 predicted labels to int before looking them up in
# `movies` (the labels display as strings above), then serialise each movie.
[movies[int(movieId)].to_dict() for movieId in predictions[:20]]

[{'movieId': 4973,
  'tmdbId': 194.0,
  'title': "Amelie (Fabuleux destin d'Amélie Poulain, Le) ",
  'release_date': 2001,
  'genres': ['Comedy', 'Romance'],
  'poster': 'https://media.themoviedb.org/t/p/w300_and_h450_bestv2/nSxDa3M9aMvGVLoItzWTepQ5h5d.jpg',
  'rating': 79,
  'certification': 'R',
  'runtime': '2h 2m'},
 {'movieId': 296,
  'tmdbId': 680.0,
  'title': 'Pulp Fiction ',
  'release_date': 1994,
  'genres': ['Comedy', 'Crime', 'Drama', 'Thriller'],
  'poster': 'https://media.themoviedb.org/t/p/w300_and_h450_bestv2/d5iIlFn5s0ImszYzBPb8JPIfbXD.jpg',
  'rating': 85,
  'certification': 'R',
  'runtime': '2h 34m'},
 {'movieId': 7361,
  'tmdbId': 38.0,
  'title': 'Eternal Sunshine of the Spotless Mind ',
  'release_date': 2004,
  'genres': ['Drama', 'Romance', 'Sci-Fi'],
  'poster': 'https://media.themoviedb.org/t/p/w300_and_h450_bestv2/5MwkWH9tYHv3mV9OdYTMR5qreIz.jpg',
  'rating': 81,
  'certification': 'R',
  'runtime': '1h 48m'},
 {'movieId': 1089,
  'tmdbId': 500.0,
  'title'