# Creation of a subsample from the MovieLens 25M dataset

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch_geometric.data import download_url, extract_zip



In [3]:
# Fetch the MovieLens 25M archive into the current directory, then unpack it there.
ml_25m_url = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
zip_path = download_url(ml_25m_url, '.')  # path of the downloaded archive
extract_zip(zip_path, '.')

Using existing file ml-25m.zip
Extracting .\ml-25m.zip


In [4]:
# Load the raw ratings table from the extracted archive.
rating_path = './ml-25m/ratings.csv'
ratings = pd.read_csv(filepath_or_buffer=rating_path)

In [5]:
# Total number of ratings (rows) in the full dataset.
ratings.shape[0]

25000095

In [6]:
# The raw `timestamp` column holds seconds since the Unix epoch
# (midnight UTC, January 1, 1970); turn it into readable datetimes.
epoch_seconds = ratings['timestamp']
ratings['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')

In [7]:
# Derive calendar year and month columns from the converted timestamp,
# then preview the first rows.
timestamp_parts = ratings['timestamp'].dt
ratings['year'] = timestamp_parts.year
ratings['month'] = timestamp_parts.month
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,year,month
0,1,296,5.0,2006-05-17 15:34:04,2006,5
1,1,306,3.5,2006-05-17 12:26:57,2006,5
2,1,307,5.0,2006-05-17 12:27:08,2006,5
3,1,665,5.0,2006-05-17 15:13:40,2006,5
4,1,899,3.5,2006-05-17 12:21:50,2006,5


In [8]:
# Optional check, currently disabled: histogram of the `year` column
# (i.e. how the ratings are distributed over the years).
#plt.hist(ratings['year'])

In [9]:
# Count missing values per column (isna is the alias of isnull).
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
year         0
month        0
dtype: int64

In [10]:
# Number of fully duplicated rows (every occurrence after the first counts as a duplicate).
ratings.duplicated(keep='first').sum()

0

In [11]:
# Keep only ratings given in 2018 or later (the "2018-2019" sample),
# as an independent copy of the selected rows.
is_recent = ratings['year'].ge(2018)
ratings_2018_2019 = ratings.loc[is_recent].copy()
# Show the sample size next to the size of the full dataset.
(ratings_2018_2019.shape[0], ratings.shape[0])

(2511395, 25000095)

In [12]:
# Number of ratings per user in the 2018-2019 sample, and how many users
# fall below a few activity thresholds.
user_counts = ratings_2018_2019['userId'].value_counts()
print('Total users in sample 2018-2019: ', len(user_counts))
for threshold in (3, 5, 10, 20):
    n_sparse_users = int((user_counts < threshold).sum())
    print(f'Users with less than {threshold} ratings: ', n_sparse_users)

Total users in sample 2018-2019:  18512
Users with less than 3 ratings:  481
Users with less than 5 ratings:  824
Users with less than 10 ratings:  1507
Users with less than 20 ratings:  2448


In [13]:
# Number of ratings per movie in the 2018-2019 sample, and how many movies
# fall below a few popularity thresholds.
movie_counts = ratings_2018_2019['movieId'].value_counts()
print('Total movies in sample 2018-2019: ', len(movie_counts))
for threshold in (3, 5, 10, 20):
    n_sparse_movies = int((movie_counts < threshold).sum())
    print(f'Movies with less than {threshold} ratings: ', n_sparse_movies)

Total movies in sample 2018-2019:  49427
Movies with less than 3 ratings:  21967
Movies with less than 5 ratings:  30024
Movies with less than 10 ratings:  37115
Movies with less than 20 ratings:  41210


In [14]:
# We do not remove users or movies with few ratings: in real-world settings, sparsely rated users and movies are common,
# and we want to see how the models perform in such scenarios.
# In addition, data pruning should be done carefully and is generally best avoided in recommender systems: https://ceur-ws.org/Vol-2431/paper6.pdf

In [15]:
# Draw a reproducible random sample of 1000 distinct users from those
# who rated at least one movie in the 2018-2019 sample.
userids = ratings_2018_2019['userId'].unique()
np.random.seed(0)  # fixed seed so the same users are drawn on every run
sample_1000 = np.random.choice(userids, 1000, replace=False)
n_active_users = userids.size
n_sampled_users = sample_1000.size
print(f'There are {n_active_users} users active in 2018-2019. We randomly sample {n_sampled_users} of them.')

There are 18512 users active in 2018-2019. We randomly sample 1000 of them.


In [16]:
# Restrict the 2018-2019 ratings to the sampled users.
is_sampled_user = ratings_2018_2019['userId'].isin(sample_1000)
subsample = ratings_2018_2019.loc[is_sampled_user]
n_ratings = subsample.shape[0]
n_movies = subsample['movieId'].unique().size
print(f'After sampling 1000 random users. We have {n_ratings} ratings and {n_movies} unique movies.')

After sampling 1000 random users. We have 126687 ratings and 13694 unique movies.


In [17]:
# Range of ratings per user and per movie in the subsample
# (the per-column counts are computed once and reused).
ratings_per_user = subsample['userId'].value_counts()
ratings_per_movie = subsample['movieId'].value_counts()
print('Minimum number of ratings a user has in this subsample: ', ratings_per_user.min())
print('Maximum number of ratings a user has in this subsample: ', ratings_per_user.max())
print('Minimum number of ratings a movie has in this subsample: ', ratings_per_movie.min())
print('Maximum number of ratings a movie has in this subsample: ', ratings_per_movie.max())

Minimum number of ratings a user has in this subsample:  1
Maximum number of ratings a user has in this subsample:  2941
Minimum number of ratings a movie has in this subsample:  1
Maximum number of ratings a movie has in this subsample:  443


In [18]:
# Activity profile of the sampled users: how many are very sparse and how many are heavy raters.
user_counts = subsample['userId'].value_counts()
print('Total users: ', len(user_counts))
for threshold in (3, 10):
    n_sparse_users = int((user_counts < threshold).sum())
    print(f'Users with less than {threshold} ratings: ', n_sparse_users)
n_heavy_users = int((user_counts > 50).sum())
print('Users with more than 50 ratings: ', n_heavy_users)

Total users:  1000
Users with less than 3 ratings:  21
Users with less than 10 ratings:  71
Users with more than 50 ratings:  573


In [19]:
# Popularity profile of the movies in the subsample.
movie_counts = subsample['movieId'].value_counts()
print('Total movies: ', len(movie_counts))
for threshold in (3, 5, 10):
    n_sparse_movies = int((movie_counts < threshold).sum())
    print(f'Movies with less than {threshold} ratings: ', n_sparse_movies)
n_popular_movies = int((movie_counts > 50).sum())
print('Movies with more than 50 ratings: ', n_popular_movies)
# Many movies have been rated by only one or a handful of users.

Total movies:  13694
Movies with less than 3 ratings:  8369
Movies with less than 5 ratings:  10016
Movies with less than 10 ratings:  11394
Movies with more than 50 ratings:  573


In [20]:
# Persist the subsample as CSV (without the DataFrame index).
# `to_csv` does not create missing parent directories, so make sure ./data
# exists first; otherwise a fresh checkout fails here with an OSError.
output_path = Path('./data/ratings_subsample_120K.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
subsample.to_csv(output_path, index=False)