In [1]:
#log into kaggle to access dataset
import os
from collections import defaultdict
import kagglehub
import numpy as np
from tqdm import tqdm

In [2]:
#please ikke offentliggør min api key. 
os.environ['KAGGLE_USERNAME'] = "marcusgaleajacobsen"
os.environ['KAGGLE_KEY'] = "32a3003f52c97053841ea46c492128dc"

# Download data

In [3]:
# Download latest version
datapath = kagglehub.dataset_download("netflix-inc/netflix-prize-data")
os.listdir(datapath)

['combined_data_1.txt',
 'combined_data_2.txt',
 'combined_data_3.txt',
 'combined_data_4.txt',
 'movie_titles.csv',
 'probe.txt',
 'qualifying.txt',
 'README']

# Custom Movie and User data structures
One data structure suggestion is to store the users and movies in their own dictionaries with the user_id and movie_id as the keys. The advantage of dictionaries is that all entries have an access time of $O(1)$. 
```python
users = {
    '123': User('123')
    ...
}
movies = {
    '1': Movie(1, 'Toy Story', '1995')
    ...
}
```

In [4]:
from src.structures import Movie, User
from src.data_methods import read_movies,read_viewers, dict_to_df, read_df

n_lines = 10000 #number of reviews to read
datafiles = ["combined_data_1.txt"]#, "combined_data_2.txt", "combined_data_3.txt", "combined_data_4.txt"]
with_tqdm = False #set to True to see progress bar (reduce speed)

movies = read_movies(datapath)
users = read_viewers(datapath, movies, datafiles = datafiles, with_tqdm= with_tqdm, n_lines=n_lines) #read only 100000 reviews for now

# Methods

In [5]:
#get ratings for a user
user = users['1488844']
print(user)
print("ratings on the format movie_id : rating ",user.get_ratings())

User(1488844)
ratings on the format movie_id : rating  defaultdict(<class 'float'>, {'1': 3, '8': 4})


In [6]:
#get ratings for a movie
movie = movies['1']
print(movie)
print("ratings on the format user_id : rating: ", movie.get_ratings())

Dinosaur Planet, 2003
ratings on the format user_id : rating:  {'1488844': 3, '822109': 5, '885013': 4, '30878': 4, '823519': 3, '893988': 3, '124105': 4, '1248029': 3, '1842128': 4, '2238063': 3, '1503895': 4, '2207774': 5, '2590061': 3, '2442': 3, '543865': 4, '1209119': 4, '804919': 4, '1086807': 3, '1711859': 4, '372233': 5, '1080361': 3, '1245640': 3, '558634': 4, '2165002': 4, '1181550': 3, '1227322': 4, '427928': 4, '814701': 5, '808731': 4, '662870': 5, '337541': 5, '786312': 3, '1133214': 4, '1537427': 4, '1209954': 5, '2381599': 3, '525356': 2, '1910569': 4, '2263586': 4, '2421815': 2, '1009622': 1, '1481961': 2, '401047': 4, '2179073': 3, '1434636': 3, '93986': 5, '1308744': 5, '2647871': 4, '1905581': 5, '2508819': 3, '1578279': 1, '1159695': 4, '2588432': 3, '2423091': 3, '470232': 4, '2148699': 2, '1342007': 3, '466135': 4, '2472440': 3, '1283744': 3, '1927580': 4, '716874': 5, '4326': 4, '1546549': 5, '1493697': 1, '880166': 5, '535396': 2, '494609': 4, '1961619': 5, '88

In [31]:
movies

defaultdict(src.structures.Movie,
            {'1': Movie(1, Dinosaur Planet, 2003) seen by 547 users,
             '2': Movie(2, Isle of Man TT 2004 Review, 2004) seen by 145 users,
             '3': Movie(3, Character, 1997) seen by 2012 users,
             '4': Movie(4, Paula Abdul's Get Up & Dance, 1994) seen by 142 users,
             '5': Movie(5, The Rise and Fall of ECW, 2004) seen by 1140 users,
             '6': Movie(6, Sick, 1997) seen by 1019 users,
             '7': Movie(7, 8 Man, 1992) seen by 93 users,
             '8': Movie(8, What the #$*! Do We Know!?, 2004) seen by 4894 users,
             '9': Movie(9, Class of Nuke 'Em High 2, 1991) seen by 0 users,
             '10': Movie(10, Fighter, 2001) seen by 0 users,
             '11': Movie(11, Full Frame: Documentary Shorts, 1999) seen by 0 users,
             '12': Movie(12, My Favorite Brunette, 1947) seen by 0 users,
             '13': Movie(13, Lord of the Rings: The Return of the King: Extended Edition: Bonus Mat

# Normalization

In [7]:
# Normalize ratings for each user to have mean 0 and variance 1
for user in users.values():
    user.normalize_ratings()

In [8]:
users["2442"].ratings

defaultdict(float, {'1': 0.0})

# Add Genres to movies

In [45]:
import pandas as pd

splits = {'train': 'train.csv', 'validation': 'validation.csv', 'test': 'test.csv'}
genres = pd.read_csv("hf://datasets/jquigl/imdb-genres/" + splits["train"])

# Split the 'movie title - year' column into 'movie title' and 'year'
genres[['movie title', 'year']] = genres['movie title - year'].str.rsplit(' - ', n=1, expand=True)

# Drop the original 'movie title - year' column
genres.drop(columns=['movie title - year'], inplace=True)

#set movie title as index
genres.set_index('movie title', inplace=True)

#remove duplicate movies
genres = genres[~genres.index.duplicated(keep='first')]

# Display the genres DataFrame
genres

Unnamed: 0_level_0,genre,expanded-genres,rating,description,year
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Flaming Ears,Fantasy,"Fantasy, Sci-Fi",6.0,Flaming Ears is a pop sci-fi lesbian fantasy f...,1992
Jeg elsker dig,Romance,"Comedy, Drama, Romance",5.8,Six people - three couples - meet at random at...,1957
Povjerenje,Thriller,Thriller,,"In a small unnamed town, in year 2025, Krsto w...",2021
Gulliver Returns,Fantasy,"Animation, Adventure, Family",4.4,The legendary Gulliver returns to the Kingdom ...,2021
Prithvi Vallabh,Biography,"Biography, Drama, Romance",,"Seminal silent historical film, the story feat...",1924
...,...,...,...,...,...
Paradise for Three,Romance,"Romance, Comedy",7.0,"A wealthy business man, out of touch with real...",1938
Road to Defiance,Action,"Action, History",,Two Scottish soldiers - Hamish McNeill & Gordo...,
Under Your Hat,Adventure,"Adventure, Comedy",5.5,Inept Secret Agent on a job in the South of Fr...,1940
The Shadow Thing,Horror,Horror,,The Phantom and Persephone.,


In [None]:
for i in tqdm(range(1, len(movies)+1)):
    movie = movies[str(i)]

    try:
        if not genres.loc[movie.title].empty:
            if genres.loc[movie.title].year == movie.year:
                if type(genres.loc[movie.title]) == pd.Series:
                    movie.genres = genres.loc[movie.title]['expanded-genres'].split(", ")
                else:
                    movie.genres = genres.loc[movie.title]['expanded-genres'].iloc[0].split(", ")
    except KeyError:
        pass

100%|██████████| 17770/17770 [00:03<00:00, 5775.73it/s]


['Crime', 'Drama', 'Mystery']

In [47]:
g_count = 0
for movie in movies.values():
    if movie.genres:
        g_count += 1

print(f"Movies with genres: {g_count}/{len(movies)}")

Movies with genres: 7440/17770


# Similarity

In [None]:
#similarity between two users
user1 = users['1488844']
user2 = users['822109']
similarity = user1.similarity(user2, method='pearson')
print(f"similarity between {user1} and {user2}: {similarity}")

similarity between User(1488844) and User(822109): -0.0


In [None]:
#similarity between two movies
movie1 = movies['1']
movie2 = movies['2']
similarity = movie1.similarity(movie2, method='pearson')
print(f"similarity between {movie1} and {movie2}: {similarity:.2f}")

similarity between Dinosaur Planet, 2003 and Isle of Man TT 2004 Review, 2004: -0.11


# Alternative data structure: DataFrame
Another suggestion is to use a pandas DataFrame to store the data. The advantage of using a DataFrame is that it is very fast and has a lot of built-in functionality. (It's also more memory efficient than dictionaries). The downside is that access time for users/movies is $O(n)$ which is slower than a dictionary (where $n$ is the number of rows in the DataFrame).

In [None]:
# df = read_df(datapath, datafiles=datafiles, n_lines=100000)
# df

In [None]:
# #all ratings for user 1488844
# df[df.user_id == "1488844"]

In [None]:
# #all ratings for movie 1
# df[df.movie_id == "1"]

In [None]:
#get all unique user id # TOOOOO SLOW
# unique_users = df.user_id.unique()
# for i in tqdm(range(len(unique_users))):
#     user = unique_users[i]
#     df_user = df[df.user_id == user]

In [None]:
# unique_users = df.user_id.unique()
# # for i in tqdm(range(len(unique_users))):
# #     user = unique_users[i]
# #     info_user = users[user]