In [26]:
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Load the datasets
ratings = pd.read_csv('../input/movies/ratings.csv')
movies = pd.read_csv('../input/movies/movies.csv')

# Display first few rows of ratings and movies datasets
print("Ratings dataset:")
print(ratings.head())

print("\nMovies dataset:")
print(movies.head())

Ratings dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [27]:
# Merge the datasets on 'movieId'
data = pd.merge(ratings, movies, on='movieId')

# Display the first few rows of the merged dataset
print("Merged dataset:")
print(data.head())

Merged dataset:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [28]:
# Create a user-item matrix
user_item_matrix = data.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
print(user_item_matrix.head())
#print(user_item_matrix.shape)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [29]:
# Step 4: User-Based Collaborative Filtering
# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
user_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [30]:
# Function to get user-based recommendations
def get_user_based_recommendations(user_id, num_recommendations=5):
    # Find the most similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]
    similar_users_ratings = user_item_matrix.loc[similar_users.index]
    
    # Calculate weighted average of ratings by similar users
    recommendations = similar_users_ratings.mean(axis=0).sort_values(ascending=False)
    
    # Set a name for the recommendations Series to avoid the ValueError
    recommendations.name = "rating"
    
    # Get the top recommendations and merge with movies DataFrame to get titles
    top_recommendations = recommendations.head(num_recommendations).index
    recommended_movies = movies[movies['movieId'].isin(top_recommendations)]
    return recommended_movies.set_index('movieId').join(recommendations, how='inner').sort_values(by='rating', ascending=False)

In [31]:
# Step 5: Item-Based Collaborative Filtering
# Transpose to get item-item matrix
item_item_matrix = user_item_matrix.T

# Calculate cosine similarity between items
item_similarity = cosine_similarity(item_item_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=item_item_matrix.index, columns=item_item_matrix.index)
print(item_similarity_df)

# Function to get item-based recommendations
def get_item_based_recommendations(movie_id, num_recommendations=5):
    # Get most similar items (movies)
    similar_items = item_similarity_df[movie_id].sort_values(ascending=False)[1:num_recommendations + 1]
    
    # Get the top recommendations and merge with movies DataFrame to get titles
    recommended_movies = movies[movies['movieId'].isin(similar_items.index)]
    return recommended_movies.set_index('movieId').join(similar_items, how='inner').rename(columns={movie_id: 'similarity'}).sort_values(by='similarity', ascending=False)


movieId    1         2         3         4         5         6         7       \
movieId                                                                         
1        1.000000  0.410562  0.296917  0.035573  0.308762  0.376316  0.277491   
2        0.410562  1.000000  0.282438  0.106415  0.287795  0.297009  0.228576   
3        0.296917  0.282438  1.000000  0.092406  0.417802  0.284257  0.402831   
4        0.035573  0.106415  0.092406  1.000000  0.188376  0.089685  0.275035   
5        0.308762  0.287795  0.417802  0.188376  1.000000  0.298969  0.474002   
...           ...       ...       ...       ...       ...       ...       ...   
193581   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
193583   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
193585   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
193587   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
193609   0.000000  0.000000 

In [32]:
# Sample Usage:
user_id = 77  # Change as per your testing needs
movie_id = 10  # Change as per your testing needs

print("User-Based Recommendations for User", user_id)
print(get_user_based_recommendations(user_id))

print("\nItem-Based Recommendations for Movie:", movie_id)
print(get_item_based_recommendations(movie_id))

User-Based Recommendations for User 77
                                    title                       genres  \
movieId                                                                  
318      Shawshank Redemption, The (1994)                  Crime|Drama   
356                   Forrest Gump (1994)     Comedy|Drama|Romance|War   
296                   Pulp Fiction (1994)  Comedy|Crime|Drama|Thriller   
593      Silence of the Lambs, The (1991)        Crime|Horror|Thriller   
2571                   Matrix, The (1999)       Action|Sci-Fi|Thriller   

           rating  
movieId            
318      2.305419  
356      2.249589  
296      2.115764  
593      1.906404  
2571     1.905583  

Item-Based Recommendations for Movie: 10
                                     title  \
movieId                                      
165      Die Hard: With a Vengeance (1995)   
380                       True Lies (1994)   
349        Clear and Present Danger (1994)   
377                           

In [33]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Helper function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluation function for User-Based Collaborative Filtering
def evaluate_user_based_cf(user_id, user_item_matrix, user_similarity_df, threshold=1.0):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]
    similar_users_ratings = user_item_matrix.loc[similar_users.index]
    recommendations = similar_users_ratings.mean(axis=0)

    # Extract the target user's ratings (actual ratings)
    actual_ratings = user_item_matrix.loc[user_id]
    
    # Predict ratings only for movies the user has actually rated
    predicted_ratings = recommendations[actual_ratings > 0]
    actual_ratings = actual_ratings[actual_ratings > 0]
    
    # Compute MAE and RMSE
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse_value = rmse(actual_ratings, predicted_ratings)

    # Calculate True Positives and False Positives based on the threshold
    tp = sum((predicted_ratings >= threshold) & (actual_ratings >= threshold))
    fp = sum((predicted_ratings >= threshold) & (actual_ratings < threshold))
    
    print(f"User-Based CF Evaluation for User {user_id}:")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse_value}")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")

# Example Usage
evaluate_user_based_cf(user_id=10, user_item_matrix=user_item_matrix, user_similarity_df=user_similarity_df)


User-Based CF Evaluation for User 10:
MAE: 3.0347525216983344
RMSE: 3.243925726910369
True Positives: 10
False Positives: 3


In [35]:
# Evaluation function for Item-Based Collaborative Filtering
def evaluate_item_based_cf(user_id, user_item_matrix, item_similarity_df, threshold=1.0):
    # Get the user's actual ratings
    actual_ratings = user_item_matrix.loc[user_id]
    
    predicted_ratings = []
    actuals = []

    # Loop through each rated item to predict ratings based on item similarity
    for movie_id, actual_rating in actual_ratings[actual_ratings > 0].items():
        similar_items = item_similarity_df[movie_id].sort_values(ascending=False)[1:]
        similar_items_ratings = user_item_matrix[similar_items.index].loc[user_id]
        predicted_rating = similar_items_ratings.mean()
        
        predicted_ratings.append(predicted_rating)
        actuals.append(actual_rating)
    
    # Compute MAE and RMSE
    mae = mean_absolute_error(actuals, predicted_ratings)
    rmse_value = rmse(actuals, predicted_ratings)

    # Calculate True Positives and False Positives based on the threshold
    tp = sum((np.array(predicted_ratings) >= threshold) & (np.array(actuals) >= threshold))
    fp = sum((np.array(predicted_ratings) >= threshold) & (np.array(actuals) < threshold))
    
    print(f"Item-Based CF Evaluation for User {user_id}:")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse_value}")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")

# Example Usage
evaluate_item_based_cf(user_id=10, user_item_matrix=user_item_matrix, item_similarity_df=item_similarity_df)


Item-Based CF Evaluation for User 10:
MAE: 3.2317050146192385
RMSE: 3.4375250971292957
True Positives: 0
False Positives: 0


In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def user_based_CF(ratings_file, target_user, top_n=10):
    # Load the ratings data
    ratings_df = pd.read_csv(ratings_file)
    
    # Create user-item matrix
    user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    
    # Calculate cosine similarity between users
    user_similarity = cosine_similarity(user_item_matrix)
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
    
    # Find the most similar users to the target user
    similar_users = user_similarity_df[target_user].sort_values(ascending=False)[1:top_n+1].index
    similar_users_ratings = user_item_matrix.loc[similar_users]
    
    # Identify items rated by the target user
    target_user_rated_items = user_item_matrix.loc[target_user]
    rated_items = target_user_rated_items[target_user_rated_items > 0].index
    
    # Get recommended items by excluding already rated items
    recommended_items = set(similar_users_ratings.columns[similar_users_ratings.sum(axis=0) > 0]) - set(rated_items)
    
    # Calculate True Positives and False Positives
    tp = 0
    fp = 0
    for item in recommended_items:
        if any(user_item_matrix.loc[user, item] > 0 for user in similar_users):
            tp += 1  # item was also rated by similar users
        else:
            fp += 1  # item was not rated by any similar user

    print(f"True Positives (User-Based): {tp}")
    print(f"False Positives (User-Based): {fp}")


def item_based_CF(ratings_file, target_movie, top_n=10):
    # Load the ratings data
    ratings_df = pd.read_csv(ratings_file)
    
    # Create item-user matrix (transpose of user-item)
    item_user_matrix = ratings_df.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    
    # Calculate cosine similarity between items
    item_similarity = cosine_similarity(item_user_matrix)
    item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)
    
    # Find the most similar items to the target movie
    similar_items = item_similarity_df[target_movie].sort_values(ascending=False)[1:top_n+1].index
    similar_items_ratings = item_user_matrix.loc[similar_items]
    
    # Find users who watched the target movie
    users_watched_target_movie = item_user_matrix.columns[item_user_matrix.loc[target_movie] > 0]
    
    # Calculate True Positives and False Positives
    tp = 0
    fp = 0
    for item in similar_items:
        if any(item_user_matrix.loc[item, user] > 0 for user in users_watched_target_movie):
            tp += 1  # item was also rated by users who watched target movie
        else:
            fp += 1  # item was not rated by users who watched target movie

    print(f"True Positives (Item-Based): {tp}")
    print(f"False Positives (Item-Based): {fp}")

# Example usage
ratings_file = '../input/movies/ratings.csv'
user_id = 1   # Target user
movie_id = 10 # Target movie

# User-based collaborative filtering evaluation
user_based_CF(ratings_file, user_id)

# Item-based collaborative filtering evaluation
item_based_CF(ratings_file, movie_id)


True Positives (User-Based): 1679
False Positives (User-Based): 0
True Positives (Item-Based): 10
False Positives (Item-Based): 0
