In [1]:
#log into kaggle to access dataset
import os
from collections import defaultdict
import kagglehub
import numpy as np
from tqdm import tqdm

In [2]:
#please ikke offentliggør min api key. 
os.environ['KAGGLE_USERNAME'] = "marcusgaleajacobsen"
os.environ['KAGGLE_KEY'] = "32a3003f52c97053841ea46c492128dc"

# Download data

In [3]:
# Download latest version
datapath = kagglehub.dataset_download("netflix-inc/netflix-prize-data")
os.listdir(datapath)

['combined_data_1.txt',
 'combined_data_2.txt',
 'combined_data_3.txt',
 'combined_data_4.txt',
 'movie_titles.csv',
 'probe.txt',
 'qualifying.txt',
 'README']

# Custom Movie and User data structures
One data structure suggestion is to store the users and movies in their own dictionaries with the user_id and movie_id as the keys. The advantage of dictionaries is that the access time is $O(1)$ for both movies and users which is very fast. 
```python
users = {
    '123': User('123')
    ...
}
movies = {
    '1': Movie(1, 'Toy Story', '1995')
    ...
}
```

In [18]:
from src.structures import Movie, User
from src.data_methods import read_movies,read_viewers, dict_to_df, read_df

n_lines = 100000 #number of reviews to read
datafiles = ["combined_data_1.txt"]#, "combined_data_2.txt", "combined_data_3.txt", "combined_data_4.txt"]
with_tqdm = False #set to True to see progress bar (reduce speed)

movies = read_movies(datapath)
users = read_viewers(datapath, movies, datafiles = datafiles, with_tqdm= with_tqdm, n_lines=n_lines) #read only 100000 reviews for now

# Methods

In [21]:
#get ratings for a user
user = users['1488844']
print(user)
print("ratings on the format movie_id : rating ",user.ratings)

User(1488844)
ratings on the format movie_id : rating  defaultdict(<class 'float'>, {'1': 3, '8': 4, '17': 2, '30': 3, '44': 3, '58': 5, '76': 3, '80': 3, '81': 3, '83': 3, '108': 3, '111': 2, '118': 3, '143': 5, '173': 3, '175': 3, '187': 2, '189': 2, '191': 5, '195': 4, '197': 3, '199': 3, '216': 3, '232': 3, '241': 3, '268': 4, '270': 4, '273': 2, '275': 3, '285': 4, '299': 3, '305': 3, '312': 4, '313': 3, '329': 4, '330': 4, '331': 3, '334': 3, '348': 4, '353': 3, '357': 3, '361': 4, '367': 3, '381': 3, '393': 3, '406': 4, '413': 2, '418': 3, '422': 2, '429': 3, '433': 3, '445': 3, '457': 3, '468': 5, '482': 3, '483': 3, '501': 3, '516': 2, '524': 3, '547': 4, '548': 3, '550': 3, '564': 3, '569': 3, '571': 5, '577': 3, '599': 3, '607': 5, '636': 2, '658': 5, '660': 3, '662': 4, '686': 2, '692': 4, '696': 4, '705': 4, '706': 3, '708': 3, '711': 3, '746': 4, '751': 3, '758': 3, '759': 3, '798': 5, '809': 3, '817': 3, '818': 3, '819': 3, '831': 3, '833': 4, '837': 3, '851': 5, '862': 

In [22]:
#get ratings for a movie
movie = movies['1']
print(movie)
print("ratings on the format user_id : rating: ", movie.get_ratings())

Dinosaur Planet, 2003
ratings:  {'1488844': 3, '822109': 5, '885013': 4, '30878': 4, '823519': 3, '893988': 3, '124105': 4, '1248029': 3, '1842128': 4, '2238063': 3, '1503895': 4, '2207774': 5, '2590061': 3, '2442': 3, '543865': 4, '1209119': 4, '804919': 4, '1086807': 3, '1711859': 4, '372233': 5, '1080361': 3, '1245640': 3, '558634': 4, '2165002': 4, '1181550': 3, '1227322': 4, '427928': 4, '814701': 5, '808731': 4, '662870': 5, '337541': 5, '786312': 3, '1133214': 4, '1537427': 4, '1209954': 5, '2381599': 3, '525356': 2, '1910569': 4, '2263586': 4, '2421815': 2, '1009622': 1, '1481961': 2, '401047': 4, '2179073': 3, '1434636': 3, '93986': 5, '1308744': 5, '2647871': 4, '1905581': 5, '2508819': 3, '1578279': 1, '1159695': 4, '2588432': 3, '2423091': 3, '470232': 4, '2148699': 2, '1342007': 3, '466135': 4, '2472440': 3, '1283744': 3, '1927580': 4, '716874': 5, '4326': 4, '1546549': 5, '1493697': 1, '880166': 5, '535396': 2, '494609': 4, '1961619': 5, '883478': 4, '793564': 4, '1567202

# Normalization

In [23]:
# Normalize ratings for each user to have mean 0 and variance 1
for user in users.values():
    user.normalize_ratings()

# Similarity

In [24]:
#similarity between two users
user1 = users['1488844']
user2 = users['822109']
similarity = user1.similarity(user2, method='pearson')
print(f"similarity between {user1} and {user2}: {similarity}")

similarity between User(1488844) and User(822109): 0.06181501744457733


In [25]:
#similarity between two movies
movie1 = movies['1']
movie2 = movies['2']
similarity = movie1.similarity(movie2, method='pearson')
print(f"similarity between {movie1} and {movie2}: {similarity:.2f}")

similarity between Dinosaur Planet, 2003 and Isle of Man TT 2004 Review, 2004: 0.67


# Alternative data structure: DataFrame
Another suggestion is to use a pandas DataFrame to store the data. The advantage of using a DataFrame is that it is very fast and has a lot of built-in functionality. (It's also more memory efficient than dictionaries). The downside is that access time for users/movies is $O(n)$ which is slower than a dictionary (where $n$ is the number of rows in the DataFrame).

In [13]:
# df = read_df(datapath, datafiles=datafiles, n_lines=100000)
# df

In [14]:
# #all ratings for user 1488844
# df[df.user_id == "1488844"]

In [15]:
# #all ratings for movie 1
# df[df.movie_id == "1"]

In [16]:
#get all unique user id # TOOOOO SLOW
# unique_users = df.user_id.unique()
# for i in tqdm(range(len(unique_users))):
#     user = unique_users[i]
#     df_user = df[df.user_id == user]

In [17]:
# unique_users = df.user_id.unique()
# # for i in tqdm(range(len(unique_users))):
# #     user = unique_users[i]
# #     info_user = users[user]