In [11]:
import pandas as pd

# Step 1: Load the data (adjust path if needed)
data_path = 'C:/Users/kelet/Downloads/ml-32m/'

ratings = pd.read_csv(data_path + 'ratings.csv')

print("Original ratings shape:", ratings.shape)

# Step 2: Select top 10,000 users and movies by number of ratings
top_users = ratings['userId'].value_counts().nlargest(10000).index
top_movies = ratings['movieId'].value_counts().nlargest(10000).index

# Step 3: Filter the ratings DataFrame to only include these users and movies
ratings_subset = ratings[(ratings['userId'].isin(top_users)) & (ratings['movieId'].isin(top_movies))]

print("Filtered ratings shape:", ratings_subset.shape)

# Step 4: Create the user-item matrix (pivot table), filling missing ratings with 0
user_movie_matrix = ratings_subset.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

print("User-Item matrix shape:", user_movie_matrix.shape)


Original ratings shape: (32000204, 4)
Filtered ratings shape: (9763311, 4)
User-Item matrix shape: (10000, 10000)


In [12]:
print(user_movie_matrix.index[:10])  # print first 10 user IDs in matrix


Index([10, 28, 70, 109, 187, 188, 228, 239, 240, 245], dtype='int64', name='userId')


In [13]:
valid_user_id = 10  # or 28, 70, etc.

top_similar_users = get_top_similar_users(valid_user_id, user_similarity, user_index_map, top_n=5)
print(f"Top 5 similar users to user {valid_user_id} (by matrix index):", top_similar_users)


Top 5 similar users to user 10 (by matrix index): [9580, 9599, 6330, 7363, 8007]


In [14]:
def recommend_movies(user_id, user_movie_matrix, user_similarity, user_index_map, top_n=10):
    user_idx = user_index_map[user_id]
    
    # Get similarity scores for the target user with all others
    sim_scores = user_similarity[user_idx]
    
    # Get the target user's ratings
    user_ratings = user_movie_matrix.iloc[user_idx]
    
    # Find movies user hasn't rated (rating = 0)
    unrated_movies = user_ratings[user_ratings == 0].index
    
    # Calculate weighted ratings from similar users
    weighted_scores = {}
    for movie in unrated_movies:
        # Get ratings of other users for this movie
        movie_ratings = user_movie_matrix[movie]
        
        # Compute weighted average of ratings using similarity scores
        weighted_sum = np.dot(sim_scores, movie_ratings)
        sim_sum = np.sum(sim_scores)
        
        if sim_sum > 0:
            weighted_scores[movie] = weighted_sum / sim_sum
        else:
            weighted_scores[movie] = 0
    
    # Sort movies by weighted score
    sorted_movies = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Return top n movie IDs as recommendations
    return [movie for movie, score in sorted_movies[:top_n]]

# Example usage
recommended_movies = recommend_movies(valid_user_id, user_movie_matrix, user_similarity, user_index_map, top_n=10)
print("Recommended movie IDs:", recommended_movies)


Recommended movie IDs: [2858, 1265, 1704, 608, 1136, 293, 1732, 4886, 1213, 3147]


In [15]:
import pandas as pd

# Load movies data (adjust path as needed)
movies = pd.read_csv('C:/Users/kelet/Downloads/ml-32m/movies.csv')

# Your recommended movie IDs (example)
recommended_movies = [2858, 1265, 1704, 608, 1136, 293, 1732, 4886, 1213, 3147]

# Get movie titles
recommended_titles = movies[movies['movieId'].isin(recommended_movies)]['title'].values

print("Recommended movies for user", valid_user_id)
for title in recommended_titles:
    print(title)


Recommended movies for user 10
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Fargo (1996)
Monty Python and the Holy Grail (1975)
Goodfellas (1990)
Groundhog Day (1993)
Good Will Hunting (1997)
Big Lebowski, The (1998)
American Beauty (1999)
Green Mile, The (1999)
Monsters, Inc. (2001)
