In [7]:
import pandas as pd
import matplotlib.pyplot as plt

# Lab 1 – AI Machine Learning Advanced

The ratings dataset is roughly 900 MB, so the data has to be filtered before it can be analysed. The cell below explores different ideas for how strict that filter should be.

In [8]:
# Strict thresholds: a more limited but more reliable dataset. Raising the
# values makes the filter more restrictive still. The issue being worked
# around (too few ratings per movie/user) is known as the cold-start problem.
min_ratings_per_movie = 150
min_ratings_per_user = 50

# Looser alternative: a much wider dataset, at the risk of overwhelming the
# user with a plethora of unknown movies.
# min_ratings_per_movie = 10
# min_ratings_per_user = 10

ratings_path = "data/ratings.csv"
dtypes = {"userId": "int32", "movieId": "int32", "rating": "float32"}

# Rows read per chunk. The file is too large to load comfortably in one go,
# but very small chunks mean thousands of iterations; one million rows keeps
# each chunk small in memory while cutting the loop overhead.
chunk_size = 1_000_000

# --- Pass 1: count ratings per movie and per user -------------------------
# Only the two id columns are needed for counting, so the other columns are
# not parsed at all in this pass.
count_columns = ["userId", "movieId"]
movie_count_parts = []
user_count_parts = []
for chunk in pd.read_csv(
    ratings_path,
    usecols=count_columns,
    dtype={col: dtypes[col] for col in count_columns},
    chunksize=chunk_size,
):
    movie_count_parts.append(chunk["movieId"].value_counts())
    user_count_parts.append(chunk["userId"].value_counts())

# Sum the per-chunk counts once at the end. This keeps the counts as integers
# (Series.add with fill_value=0 would turn them into floats) and avoids
# re-aligning an ever-growing index on every chunk.
movie_counts = pd.concat(movie_count_parts).groupby(level=0).sum()
user_counts = pd.concat(user_count_parts).groupby(level=0).sum()

filter_movies = movie_counts[movie_counts >= min_ratings_per_movie].index
filter_users = user_counts[user_counts >= min_ratings_per_user].index

# NOTE: both thresholds are applied to counts taken from the unfiltered file.
# Because movies and users are filtered jointly afterwards, a kept movie (or
# user) can end up with fewer ratings inside df_filtered than its threshold.

# --- Pass 2: keep only ratings of kept movies made by kept users ----------
chunks_list = []
for chunk in pd.read_csv(ratings_path, chunksize=chunk_size, dtype=dtypes):
    keep_row = chunk["movieId"].isin(filter_movies) & chunk["userId"].isin(filter_users)
    chunks_list.append(chunk[keep_row])

# Concatenate the list of filtered chunks into one DataFrame
df_filtered = pd.concat(chunks_list, ignore_index=True)

In [9]:
# Display the filtered ratings; as the last expression in the cell, the
# notebook renders it as a (truncated) table.
df_filtered

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
29199598,330975,8340,2.0,1091583256
29199599,330975,8493,2.5,1091585709
29199600,330975,8622,4.0,1091581777
29199601,330975,8665,3.0,1091581765


In [6]:
# Exploratory summary of the filtered ratings (df_filtered).

# Minimum number of ratings a movie needs before its average rating is ranked;
# keeps movies with only a few ratings out of the "highest rated" list.
min_ratings_for_ranking = 300

# Descriptive statistics for the numeric columns
print(df_filtered.describe())

# Count of unique movies and users
unique_movies = df_filtered['movieId'].nunique()
unique_users = df_filtered['userId'].nunique()
print(f'Number of unique movies: {unique_movies}')
print(f'Number of unique users: {unique_users}')

# Distribution of ratings (sorted by rating value, not by frequency)
rating_counts = df_filtered['rating'].value_counts().sort_index()
print(rating_counts)

# Ratings per movie, computed once and reused below
ratings_per_movie = df_filtered['movieId'].value_counts()

# Most rated movies
most_rated_movies = ratings_per_movie.head(10)
print(most_rated_movies)

# Highest rated movies (with at least `min_ratings_for_ranking` ratings)
average_ratings = df_filtered.groupby('movieId')['rating'].mean()
# value_counts() is ordered by count while the groupby result is ordered by
# movieId, so align the counts to the movieId index explicitly before
# building the boolean mask.
has_enough_ratings = (
    ratings_per_movie.reindex(average_ratings.index) >= min_ratings_for_ranking
)
average_ratings = average_ratings[has_enough_ratings]
highest_rated_movies = average_ratings.sort_values(ascending=False).head(10)
print(highest_rated_movies)

# Ratings per user
ratings_per_user = df_filtered['userId'].value_counts()
print(ratings_per_user.describe())

             userId       movieId        rating     timestamp
count  2.528930e+07  2.528930e+07  2.528930e+07  2.528930e+07
mean   1.653248e+05  2.680837e+04  3.511902e+00  1.279097e+09
std    9.538847e+04  4.674314e+04  1.045412e+00  2.452736e+08
min    7.000000e+00  1.000000e+00  5.000000e-01  7.896520e+08
25%    8.280300e+04  1.283000e+03  3.000000e+00  1.069125e+09
50%    1.658530e+05  3.481000e+03  3.500000e+00  1.266036e+09
75%    2.474510e+05  3.772700e+04  4.000000e+00  1.499614e+09
max    3.309750e+05  2.868970e+05  5.000000e+00  1.689842e+09
Number of unique movies: 7876
Number of unique users: 82050
rating
0.5     400193
1.0     742567
1.5     438838
2.0    1656541
2.5    1420831
3.0    4783396
3.5    3612543
4.0    6637324
4.5    2362291
5.0    3234776
Name: count, dtype: int64
movieId
356     58969
2571    58204
296     57283
318     55640
593     53478
260     53212
1196    49941
2959    48505
480     48252
4993    47952
Name: count, dtype: int64
movieId
171011    4.44463