In [1]:
import pandas as pd
import dask.dataframe as dd

# Define file paths
ratings_path = '/content/ratings.csv'
movies_path = '/content/movies.csv'

# Load the dataset with specified data types, handling missing values appropriately
ratings = dd.read_csv(ratings_path, dtype={'timestamp': 'float64'})
movies = dd.read_csv(movies_path, on_bad_lines='skip')

In [2]:
# Compute and show the first few rows to understand the data structure
print("Ratings Sample:")
ratings.head()

Ratings Sample:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880000.0
1,1,306,3.5,1147869000.0
2,1,307,5.0,1147869000.0
3,1,665,5.0,1147879000.0
4,1,899,3.5,1147869000.0


In [3]:
print("Movies Sample:")
movies.head()

Movies Sample:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
print("Movies Description:")
print(movies.describe(include='all').compute())  # Include 'all' to get statistics for non-numeric columns as well

Movies Description:
              movieId     title genres
unique            NaN     62325   1639
count    62423.000000     62423  62423
top               NaN  9 (2009)  Drama
freq              NaN         2   9056
mean    122220.387646      <NA>   <NA>
std      63264.744844      <NA>   <NA>
min          1.000000      <NA>   <NA>
25%      82146.500000      <NA>   <NA>
50%     138022.000000      <NA>   <NA>
75%     173222.000000      <NA>   <NA>
max     209171.000000      <NA>   <NA>


In [5]:
# General statistics
print("Ratings Description:")
print(ratings.describe().compute())

Ratings Description:
              userId        movieId         rating     timestamp
count  169209.000000  169209.000000  169209.000000  1.692080e+05
mean      624.687983   20015.823502       3.571042  1.196380e+09
std       341.479634   37736.739836       1.043787  2.334153e+08
min         1.000000       1.000000       0.500000  8.280963e+08
25%       333.000000    1090.000000       3.000000  9.807988e+08
50%       639.000000    2712.000000       4.000000  1.162684e+09
75%       904.000000    7371.000000       4.000000  1.439794e+09
max      1203.000000  208002.000000       5.000000  1.574254e+09


In [6]:
# Merge the datasets on the 'movieId' column
merged_df = dd.merge(movies, ratings, on='movieId', how='inner')

In [7]:
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492500.0


In [8]:
# Calculate total number of ratings for each movie
ratings_count = merged_df.groupby('movieId').size()

# Convert Series to DataFrame without using the 'name' parameter in reset_index
ratings_count_df = ratings_count.reset_index()
ratings_count_df.columns = ['movieId', 'totalRatingCount']  # Manually renaming the columns

# Merge this count back into the original merged DataFrame
merged_df = dd.merge(merged_df, ratings_count_df, on='movieId', how='left')

# Verify the changes
print(merged_df.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating     timestamp  totalRatingCount  
0       2     3.5  1.141416e+09               414  
1       3     4.0  1.439472e+09               414  
2       4     3.0  1.573944e+09               414  
3       5     4.0  8.586259e+08               414  
4       8     4.0  8.904925e+08               414  


In [9]:
# Filter to keep only those movieIds with at least 50 ratings
filtered_movieIds = ratings_count[ratings_count >= 50].index

# Compute the Dask object to a Pandas object
computed_ids = filtered_movieIds.compute()
filtered_df = merged_df[merged_df['movieId'].isin(computed_ids)]

# Verify the changes
print(filtered_df.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating     timestamp  totalRatingCount  
0       2     3.5  1.141416e+09               414  
1       3     4.0  1.439472e+09               414  
2       4     3.0  1.573944e+09               414  
3       5     4.0  8.586259e+08               414  
4       8     4.0  8.904925e+08               414  


In [10]:
from scipy.sparse import csr_matrix

# Convert 'userId' to a categorical data type with known categories
filtered_df['userId'] = filtered_df['userId'].astype('category').cat.as_known()

# Remove duplicates
filtered_df = filtered_df.drop_duplicates(subset=['userId', 'title'])

# Create a pivot table
pivot_table = filtered_df.pivot_table(index='title', columns='userId', values='rating')

# Convert the pivot table to a sparse matrix
sparse_matrix = csr_matrix(pivot_table.values)

# Print the pivot table and the sparse matrix shape
print(pivot_table.head())
print('Shape of the sparse matrix:', sparse_matrix.shape)

userId                                              1   2   3   4   5    6  \
title                                                                        
(500) Days of Summer (2009)                       NaN NaN NaN NaN NaN  NaN   
10 Things I Hate About You (1999)                 NaN NaN NaN NaN NaN  NaN   
101 Dalmatians (1996)                             NaN NaN NaN NaN NaN  NaN   
101 Dalmatians (One Hundred and One Dalmatians... NaN NaN NaN NaN NaN  NaN   
12 Angry Men (1957)                               NaN NaN NaN NaN NaN  5.0   

userId                                              7    8    9   10  ...  \
title                                                                 ...   
(500) Days of Summer (2009)                       NaN  NaN  NaN  NaN  ...   
10 Things I Hate About You (1999)                 NaN  NaN  NaN  3.0  ...   
101 Dalmatians (1996)                             NaN  NaN  5.0  NaN  ...   
101 Dalmatians (One Hundred and One Dalmatians... NaN  NaN  NaN  2.0

In [11]:
# Replace all NaN values in the DataFrame with zero and assign back to the same DataFrame
pivot_table = pivot_table.fillna(0)

print(pivot_table.head())

userId                                               1    2    3    4    5  \
title                                                                        
(500) Days of Summer (2009)                        0.0  0.0  0.0  0.0  0.0   
10 Things I Hate About You (1999)                  0.0  0.0  0.0  0.0  0.0   
101 Dalmatians (1996)                              0.0  0.0  0.0  0.0  0.0   
101 Dalmatians (One Hundred and One Dalmatians...  0.0  0.0  0.0  0.0  0.0   
12 Angry Men (1957)                                0.0  0.0  0.0  0.0  0.0   

userId                                               6    7    8    9   10  \
title                                                                        
(500) Days of Summer (2009)                        0.0  0.0  0.0  0.0  0.0   
10 Things I Hate About You (1999)                  0.0  0.0  0.0  0.0  3.0   
101 Dalmatians (1996)                              0.0  0.0  0.0  5.0  0.0   
101 Dalmatians (One Hundred and One Dalmatians...  0.0  0.0  0.

In [79]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

# Assuming 'ratings_df' is a Pandas DataFrame loaded with 'userId', 'itemId', and 'rating'
# Count the number of ratings each item has received
ratings_count = filtered_df['title'].value_counts()

# Create a set of popular items that have been rated by at least 50 users
popular_items_set = set(ratings_count[ratings_count >= 50].index)

# Filter the ratings DataFrame to include only items in the popular items set
filtered_ratings_df = filtered_df[filtered_df['title'].apply(lambda x: x in popular_items_set)]

# Load the filtered dataset into Surprise
reader = Reader(rating_scale=(filtered_ratings_df['rating'].min(), filtered_ratings_df['rating'].max()))
data = Dataset.load_from_df(filtered_ratings_df[['userId', 'title', 'rating']], reader)

# Prepare the data and train the model
trainset = data.build_full_trainset()
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('title', 'bool'))



Computing the cosine similarity matrix...
Done computing similarity matrix.


In [None]:
# Get predictions for a specific user
user_id = 1196
items_rated_by_user = filtered_ratings_df[filtered_ratings_df['userId'] == user_id]['title'].unique()
testset = [[user_id, item_id, 0] for item_id in items_rated_by_user if item_id in popular_items_set]
predictions = algo.test(testset)

In [80]:
top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]

# Display the top 10 predictions
for prediction in top_predictions:
    print(f"Item: {prediction.iid}, Predicted rating: {prediction.est}")

Item: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964), Predicted rating: 4.213946454385449
Item: No Country for Old Men (2007), Predicted rating: 4.176157100614318
Item: Toy Story (1995), Predicted rating: 4.163341237341798
Item: Star Wars: Episode V - The Empire Strikes Back (1980), Predicted rating: 4.149940335836967
Item: Shaun of the Dead (2004), Predicted rating: 4.149199822341015
Item: Aladdin (1992), Predicted rating: 4.137745943428514
Item: Star Wars: Episode IV - A New Hope (1977), Predicted rating: 4.124899011172997
Item: WALL·E (2008), Predicted rating: 4.113824670408587
Item: Fargo (1996), Predicted rating: 4.1134463925279565
Item: 2001: A Space Odyssey (1968), Predicted rating: 4.112915396952663
