In [6]:
import json
import pandas as pd

# Load movies data
with open('movies.json', 'r') as file:
    movies_data = json.load(file)

# Convert movies data to a DataFrame
movies_df = pd.DataFrame(movies_data)

# Load reviews data
with open('reviews.json', 'r') as file:
    reviews_data = json.load(file)

# Convert reviews data to a DataFrame
reviews_df = pd.DataFrame(reviews_data)

# Merge the two DataFrames on 'item_id'
combined_df = pd.merge(reviews_df, movies_df, how='left', left_on='item_id', right_on='item_id')

# Display the first few rows of the combined dataset
print(combined_df.head())


   user_id  item_id  rating     timestamp                            title
0        0       50       5  881250949000                 Star Wars (1977)
1        0      172       5  881250949000  Empire Strikes Back, The (1980)
2        0      133       1  881250949000        Gone with the Wind (1939)
3      196      242       3  881250949000                     Kolya (1996)
4      186      302       3  891717742000         L.A. Confidential (1997)


In [7]:
import pandas as pd
from sklearn.cluster import MeanShift
import numpy as np

# Convert the DataFrame into a matrix of user ratings
user_movie_ratings = combined_df.pivot_table(index='user_id', columns='title', values='rating').fillna(0)

# Apply Mean Shift clustering
ms = MeanShift(bandwidth=2)  # bandwidth is a hyperparameter
ms.fit(user_movie_ratings)

# Get cluster labels for each user
labels = ms.labels_

# Number of clusters
n_clusters_ = len(np.unique(labels))

# Assign cluster labels to users
user_movie_ratings['cluster'] = labels

# Finding the cluster of user 186
user_186_cluster = user_movie_ratings.loc[186, 'cluster']

# Movies rated by user 186
rated_movies_by_186 = combined_df[combined_df['user_id'] == 186]['title']

# Filter for movies in the same cluster and not rated by user 186
recommendation_pool = user_movie_ratings[user_movie_ratings['cluster'] == user_186_cluster]
unwatched_movies = recommendation_pool.loc[:, recommendation_pool.columns != 'cluster'].mean().sort_values(ascending=False)

# Exclude the movies already watched by user 186
unwatched_movies = unwatched_movies.drop(index=rated_movies_by_186)

# Recommend the highest rated movie in the cluster that user 186 hasn't watched
recommended_movie = unwatched_movies.idxmax()

print(f"Number of estimated clusters : {n_clusters_}")
print(f"Recommended Movie for user 186: {recommended_movie}")


Number of estimated clusters : 942
Recommended Movie for user 186: Lord of Illusions (1995)
