In [2]:
import zipfile
import pandas as pd
import movie_utils
from tqdm import tqdm
import numpy as np

%load_ext autoreload
%autoreload 2

In [3]:
# data from: https://grouplens.org/datasets/movielens/

files = {}

with zipfile.ZipFile("ml-32m.zip", 'r') as zip:
    zip_contents = zip.namelist()
    for file_name in zip_contents:
        if file_name.endswith('.csv'):
            print("Downloading " + file_name + "...")
            with zip.open(file_name) as file:
                df = pd.read_csv(file)
                files[file_name[7:-4]] = df

Downloading ml-32m/tags.csv...
Downloading ml-32m/links.csv...
Downloading ml-32m/ratings.csv...
Downloading ml-32m/movies.csv...


In [4]:
files["links"] = files["links"][["movieId", "tmdbId"]]
files["links"].head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [5]:
files["ratings"] = files["ratings"][["userId", "movieId", "rating"]]
files["ratings"].head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [6]:
files["movies"] = files["movies"].merge(files["links"], left_on="movieId", right_on="movieId", how="inner")
files["movies"].head()

Unnamed: 0,movieId,title,genres,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy,11862.0


In [7]:
movies = {}

for movie in files["movies"].iterrows():
    movie = movie[1]
    movies[movie.movieId] = movie_utils.Movie(movie.movieId, movie.tmdbId, movie.title, movie.genres)

In [8]:
user = files["ratings"][files["ratings"]["userId"] == 2024][:5]
for movie in user.iterrows():
    if movie[1].movieId in movies.keys():
        print("-----------------------------------------------------------------------------------------------")
        print(movies[movie[1].movieId])
        print("Rating: " + str(movie[1].rating))

-----------------------------------------------------------------------------------------------
Clerks 	1994
Comedy
Rating: 1.0
-----------------------------------------------------------------------------------------------
Exotica 	1994
Drama
Rating: 2.0
-----------------------------------------------------------------------------------------------
Red Firecracker, Green Firecracker (Pao Da Shuang Deng) 	1994
Drama
Rating: 1.0
-----------------------------------------------------------------------------------------------
Maltese Falcon, The 	1941
Film-Noir | Mystery
Rating: 5.0
-----------------------------------------------------------------------------------------------
Gone with the Wind 	1939
Drama | Romance | War
Rating: 5.0


In [9]:
popular_movies = files["ratings"][["movieId", "userId"]].groupby("movieId").count()
popular_movies = popular_movies.sort_values(by=["userId"], ascending=False)
popular_movies = popular_movies.index.values.tolist()

num_movies = 1000
top_popular_movies = set(popular_movies[:num_movies])

In [67]:
total = len(files["ratings"])
ratings = np.empty((total, 3), dtype=np.int32)

i = 0
for rating in tqdm(files["ratings"].iterrows(), total=total):
    if rating[1].movieId in top_popular_movies:
        ratings[i, 0] = rating[1].userId
        ratings[i, 1] = rating[1].movieId
        ratings[i, 2] = rating[1].rating
        i += 1

ratings = ratings[:i]
np.savez_compressed("ratings.npz", ratings=ratings)

  2%|▏         | 556726/32000204 [00:35<33:44, 15528.13it/s]


KeyboardInterrupt: 

In [10]:
ratings = np.load("ratings.npz")["ratings"]

In [11]:
ratings_df = pd.DataFrame(ratings, columns=["userId", "movieId", "rating"], index=[i for i in range(len(ratings))])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4
1,1,25,1
2,1,29,2
3,1,32,5
4,1,34,2


In [12]:
ratings_df.pivot_table(index="userId", columns="movieId", values="rating")

movieId,1,2,3,5,6,7,10,11,16,17,...,168252,171763,174055,176371,177765,187593,195159,202439,204698,207313
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,3.0,,,,,4.0,4.0,,5.0,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,4.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200944,4.0,,,,,,,,,,...,,,,,,,,,,
200945,,,,,,,,,,,...,,,0.0,,,,,,,
200946,,,,,4.0,,5.0,5.0,,4.0,...,,,,,,,,,,
200947,4.0,,,,,,,,,,...,,,,,,,,,,
