In [2]:
import pandas as pd
from scipy.stats import pearsonr

# Step (a): Data Download and Understanding
df = pd.read_csv('ratings.csv')
print(df.head)
num_ratings = len(df)
print(f'Total number of ratings: {num_ratings}')

# Step (b): Implement User-based Collaborative Filtering with Pearson Correlation


def pearson_similarity(user1, user2):
    common_movies = set(user1.keys()) & set(user2.keys())
    if len(common_movies) < 2:
        return 0

    ratings_user1 = [user1[movie] for movie in common_movies]
    ratings_user2 = [user2[movie] for movie in common_movies]
    correlation, _ = pearsonr(ratings_user1, ratings_user2)
    return correlation

# Step (c): Implement Movie Score Prediction


def predict_score(user, movie, user_similarities):
    similar_users = [sim_user for sim_user, sim_score in sorted(user_similarities.items(
    ), key=lambda x: x[1], reverse=True) if movie in df[df['userId'] == sim_user]['movieId'].values]

    if not similar_users:
        return None  # No similar users to base prediction on

    numerator = sum(user.get(other_user, 0) * user_similarities[other_user]
                    for other_user in similar_users)
    denominator = sum(abs(user_similarities[other_user])
                      for other_user in similar_users)

    if denominator == 0:
        return None  # Avoid division by zero

    predicted_score = numerator / denominator
    return predicted_score


# Step (d): Generate Recommendations for a User
selected_user = 1
similarities = {user: pearson_similarity(df[df['userId'] == selected_user].set_index('movieId').to_dict(
)['rating'], df[df['userId'] == user].set_index('movieId').to_dict()['rating']) for user in df['userId'].unique()}
top_similar_users = [user for user, sim in sorted(similarities.items(
), key=lambda x: x[1], reverse=True) if user != selected_user][:10]

recommendations = {}
for movie in df['movieId'].unique():
    if movie not in df[df['userId'] == selected_user]['movieId'].values:
        predicted_score = predict_score(df[df['userId'] == selected_user].set_index(
            'movieId').to_dict()['rating'], movie, similarities)
        if predicted_score is not None:
            recommendations[movie] = predicted_score

top_recommendations = sorted(
    recommendations.items(), key=lambda x: x[1], reverse=True)[:10]

# Step (e): Design and Implement New Similarity Function


def custom_similarity(user1, user2):
    # Implement your custom similarity function here
    # Make sure to explain why it is useful for collaborative filtering
    pass


# Print results
print(f"Top 10 similar users for user {selected_user}: {top_similar_users}")
print(f"Top 10 recommended movies for user {selected_user}: {top_recommendations}")


<bound method NDFrame.head of         userId  movieId  rating   timestamp
0            1        1     4.0   964982703
1            1        3     4.0   964981247
2            1        6     4.0   964982224
3            1       47     5.0   964983815
4            1       50     5.0   964982931
...        ...      ...     ...         ...
100831     610   166534     4.0  1493848402
100832     610   168248     5.0  1493850091
100833     610   168250     5.0  1494273047
100834     610   168252     5.0  1493846352
100835     610   170875     3.0  1493846415

[100836 rows x 4 columns]>
Total number of ratings: 100836




Top 10 similar users for user 1: [2, 106, 146, 9, 401, 154, 157, 49, 162, 44]
Top 10 recommended movies for user 1: [(318, nan), (1704, nan), (6874, nan), (8798, nan), (46970, nan), (48516, nan), (58559, nan), (60756, nan), (68157, nan), (71535, nan)]
