# Basic Movie Recommender

This notebook applies the material from Chapter 9 of *Mining of Massive Datasets* to implement a mini movie recommendation system.

## Prerequisite

Download the MovieLens 100K dataset and unzip it under the `data` folder, so that the files are available under `./data/ml-100k/`.

## Goal

* Implement user-user collaborative filtering and item-item collaborative filtering, and compare them.
* Calculate metrics: Precision@10, Recall@10, NDCG
* Use 5-fold cross-validation while developing the model.
* Use `ua` and `ub` data for final testing.

In [83]:
# install necessary packages
%pip install pandas scikit-learn numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [84]:
import pandas as pd

def load_rating_data(file_path):
    """Read a tab-separated rating file into a DataFrame.

    The file is read without a header row; its four fields are labelled
    ``user_id``, ``item_id``, ``rating`` and ``timestamp`` in that order.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            rating file.

    Returns:
        A DataFrame with one row per line of the file.
    """
    rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # header=None is what pandas already does when `names` is supplied;
    # spelling it out makes it obvious that the first line is data.
    return pd.read_csv(file_path, sep='\t', header=None, names=rating_columns)

def load_user_data(file_path):
    """Read a pipe-separated user file into a DataFrame.

    The file is read without a header row; its five fields are labelled
    ``user_id``, ``age``, ``gender``, ``occupation`` and ``zip_code``.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            user file.

    Returns:
        A DataFrame with one row per line of the file.
    """
    user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    # header=None matches the pandas default when `names` is supplied.
    return pd.read_csv(file_path, sep='|', header=None, names=user_columns)

def load_item_data(file_path):
    """Read a pipe-separated, Latin-1 encoded item file into a DataFrame.

    The file is read without a header row. The first five fields are
    descriptive (id, title, dates, URL); the remaining nineteen are named
    after genres (presumably 0/1 membership flags -- confirm against the
    dataset README).

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            item file.

    Returns:
        A DataFrame with 24 columns and one row per line of the file.
    """
    descriptive_columns = [
        'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    ]
    genre_columns = [
        'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]
    return pd.read_csv(
        file_path,
        sep='|',
        encoding='latin-1',
        header=None,  # same as the pandas default when `names` is supplied
        names=descriptive_columns + genre_columns,
    )

# User and item metadata tables, read from the unzipped dataset folder.
user_df = load_user_data('./data/ml-100k/u.user')
item_df = load_item_data('./data/ml-100k/u.item')

# Training ratings used by the quick experiments in the following cells.
rating_file = './data/ml-100k/u1.base'
rating_df = load_rating_data(rating_file)

# Show the first rows as a sanity check that the columns were parsed.
print(rating_df.head())

   user_id  item_id  rating  timestamp
0        1        1       5  874965758
1        1        2       3  876893171
2        1        3       4  878542960
3        1        4       3  876893119
4        1        5       3  889751712


In [85]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class UserColaborativeFiltering:
    """User-user collaborative filtering on a dense user x item matrix.

    Attributes:
        user_item_matrix: DataFrame indexed by ``user_id`` with one column per
            ``item_id``; a cell holds the rating, or 0 when the user has not
            rated the item.
        similarity_matrix: Square DataFrame (users x users) of cosine
            similarities between the rows of ``user_item_matrix``.
    """

    def __init__(self, rating_df):
        """Build the rating matrix and the user-user similarity matrix.

        Args:
            rating_df: DataFrame with columns ``user_id``, ``item_id`` and
                ``rating``.
        """
        self.user_item_matrix = self.create_user_item_matrix(rating_df)
        self.similarity_matrix = self.user_user_similarity_matrix(self.user_item_matrix)

    def normalize_ratings(self, user_item_matrix, mean_user_rating):
        """Subtract a per-row value from every entry of that row.

        Not called by the other methods of this class; kept as a helper.

        Args:
            user_item_matrix: DataFrame of ratings, one row per user.
            mean_user_rating: Values aligned with the row index that are
                subtracted from each row.

        Returns:
            A new DataFrame with the row-wise offsets removed.
        """
        normalized_matrix = user_item_matrix.subtract(mean_user_rating, axis=0)
        return normalized_matrix

    def create_user_item_matrix(self, rating_df):
        """Pivot long-format ratings into a user x item matrix.

        Missing (user, item) combinations are filled with 0, which the rest of
        the class interprets as "not rated".

        Raises:
            ValueError: Raised by ``DataFrame.pivot`` if the same
                (user_id, item_id) pair occurs more than once.
        """
        user_item_matrix = rating_df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
        return user_item_matrix

    def user_user_similarity_matrix(self, user_item_matrix):
        """Return the cosine similarity between every pair of user rows."""
        similarity_matrix = cosine_similarity(user_item_matrix)
        similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)
        return similarity_df

    def predict_rating(self, X, k=30):
        """Predict ratings for the given user-item pairs.

        For each pair, the ``k`` users most similar to the target user are
        selected (the target user is excluded by label) and the prediction is
        the rounded mean of the non-zero ratings those neighbours gave to the
        item.

        Args:
            X: DataFrame with columns ['user_id', 'item_id'].
            k: Number of nearest neighbours to consult (default 30).

        Returns:
            A list with one prediction per row of ``X``. The prediction is 0
            when the user or the item is unknown to the model, or when none
            of the selected neighbours rated the item.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")

        known_items = self.user_item_matrix.columns
        known_users = self.similarity_matrix.columns

        # The neighbour set depends only on the user, so compute it once per
        # user instead of once per (user, item) row.
        neighbours_by_user = {}

        predictions = []
        for user_id, item_id in zip(X['user_id'], X['item_id']):
            # Cold start: nothing is known about this user or this item.
            if item_id not in known_items or user_id not in known_users:
                predictions.append(0)
                continue

            neighbours = neighbours_by_user.get(user_id)
            if neighbours is None:
                user_similarities = self.similarity_matrix[user_id]
                # Exclude the user by label. Dropping the first entry of
                # nlargest() by position is wrong whenever another user ties
                # with the self-similarity and sorts first.
                others = user_similarities[user_similarities.index != user_id]
                neighbours = others.nlargest(k).index
                neighbours_by_user[user_id] = neighbours

            ratings_of_top_k_users = self.user_item_matrix.loc[neighbours, item_id]
            observed = ratings_of_top_k_users[ratings_of_top_k_users != 0]
            if observed.empty:
                predictions.append(0)
                continue

            # need to round because the ratings are integers
            predictions.append(round(observed.mean(), 0))

        return predictions



In [86]:
# Fit the user-user model on the training ratings loaded earlier.
ucf = UserColaborativeFiltering(rating_df)

# Held-out ratings that pair with the u1.base training file.
test_rating_file = './data/ml-100k/u1.test'
test_rating_df = load_rating_data(test_rating_file)
# Only the first five (user, item) pairs are used: this is a quick smoke test,
# not a full evaluation.
test_samples = test_rating_df[['user_id', 'item_id']].head(5)

def evaluate_rmse(true_ratings, predicted_ratings):
    """Compute the root-mean-square error between two rating sequences.

    Args:
        true_ratings: Array-like of observed ratings.
        predicted_ratings: Array-like of predictions, aligned position by
            position with ``true_ratings``.

    Returns:
        The square root of the mean squared difference.
    """
    residuals = np.array(true_ratings) - np.array(predicted_ratings)
    return np.sqrt(np.mean(np.square(residuals)))

# Score the user-user model on the five sampled pairs; the true ratings are
# the first five rows of the same test frame, so they line up with test_samples.
predictions = ucf.predict_rating(test_samples)
rmse = evaluate_rmse(test_rating_df['rating'].head(5), predictions)
print(f'RMSE on test samples: {rmse}')

RMSE on test samples: 2.32379000772445


In [87]:
class ItemColaborativeFiltering:
    """Item-item collaborative filtering on a dense user x item matrix.

    Attributes:
        user_item_matrix: DataFrame indexed by ``user_id`` with one column per
            ``item_id``; a cell holds the rating, or 0 when the user has not
            rated the item.
        similarity_matrix: Square DataFrame (items x items) of cosine
            similarities between the columns of ``user_item_matrix``.
    """

    def __init__(self, rating_df):
        """Build the rating matrix and the item-item similarity matrix.

        Args:
            rating_df: DataFrame with columns ``user_id``, ``item_id`` and
                ``rating``.
        """
        self.user_item_matrix = self.create_user_item_matrix(rating_df)
        self.similarity_matrix = self.item_item_similarity_matrix(self.user_item_matrix)

    def create_user_item_matrix(self, rating_df):
        """Pivot long-format ratings into a user x item matrix.

        Missing (user, item) combinations are filled with 0, which the rest of
        the class interprets as "not rated".

        Raises:
            ValueError: Raised by ``DataFrame.pivot`` if the same
                (user_id, item_id) pair occurs more than once.
        """
        user_item_matrix = rating_df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
        return user_item_matrix

    def item_item_similarity_matrix(self, user_item_matrix):
        """Return the cosine similarity between every pair of item columns."""
        similarity_matrix = cosine_similarity(user_item_matrix.T)
        similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)
        return similarity_df

    def predict_rating(self, X, k=30):
        """Predict ratings for the given user-item pairs.

        For each pair, the ``k`` items most similar to the target item are
        selected (the target item is excluded by label) and the prediction is
        the rounded mean of the non-zero ratings the user gave to those items.

        Args:
            X: DataFrame with columns ['user_id', 'item_id'].
            k: Number of nearest neighbours to consult (default 30).

        Returns:
            A list with one prediction per row of ``X``. The prediction is 0
            when the user or the item is unknown to the model, or when the
            user rated none of the selected neighbour items.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")

        known_items = self.user_item_matrix.columns
        known_users = self.user_item_matrix.index

        # The neighbour set depends only on the item, so compute it once per
        # item instead of once per (user, item) row.
        neighbours_by_item = {}

        predictions = []
        for user_id, item_id in zip(X['user_id'], X['item_id']):
            # Cold start: nothing is known about this user or this item.
            if item_id not in known_items or user_id not in known_users:
                predictions.append(0)
                continue

            neighbours = neighbours_by_item.get(item_id)
            if neighbours is None:
                item_similarities = self.similarity_matrix[item_id]
                # Exclude the item by label. Dropping the first entry of
                # nlargest() by position is wrong whenever another item ties
                # with the self-similarity and sorts first (e.g. two items
                # with identical rating columns).
                others = item_similarities[item_similarities.index != item_id]
                neighbours = others.nlargest(k).index
                neighbours_by_item[item_id] = neighbours

            ratings_of_top_k_items = self.user_item_matrix.loc[user_id, neighbours]
            observed = ratings_of_top_k_items[ratings_of_top_k_items != 0]
            if observed.empty:
                predictions.append(0)
                continue

            # need to round because the ratings are integers
            predictions.append(round(observed.mean(), 0))

        return predictions

In [88]:
# Fit the item-item model on the same training ratings and score it on the
# same five sampled pairs (test_samples / test_rating_df come from an earlier
# cell), so the printed RMSE is directly comparable with the user-user one.
icf = ItemColaborativeFiltering(rating_df)
predictions = icf.predict_rating(test_samples)
rmse = evaluate_rmse(test_rating_df['rating'].head(5), predictions)
print(f'RMSE on test samples: {rmse}')

RMSE on test samples: 0.6324555320336759


In [89]:
def validate(train_file, test_file, model_class):
    """Fit a model on one rating file and report its RMSE on another.

    Args:
        train_file: Path of the rating file used to build the model.
        test_file: Path of the rating file whose ratings are predicted.
        model_class: Class that is constructed from a rating DataFrame and
            exposes ``predict_rating(X)``.

    Returns:
        The RMSE between the ratings in ``test_file`` and the predictions.
    """
    fitted_model = model_class(load_rating_data(train_file))
    held_out = load_rating_data(test_file)
    predicted = fitted_model.predict_rating(held_out[['user_id', 'item_id']])
    return evaluate_rmse(held_out['rating'], predicted)
    

# Final evaluation on the two predefined splits (ua.* -> "A", ub.* -> "B").
# Both splits go through exactly the same steps, so they share one loop
# instead of two copy-pasted sections.
for split_label, split_prefix in (('A', 'ua'), ('B', 'ub')):
    # run <split_label> test
    train_file = f'./data/ml-100k/{split_prefix}.base'
    test_file = f'./data/ml-100k/{split_prefix}.test'
    rmse_ucf = validate(train_file, test_file, UserColaborativeFiltering)
    rmse_icf = validate(train_file, test_file, ItemColaborativeFiltering)
    print(f'UserCF RMSE on {split_label} test: {rmse_ucf}')
    print(f'ItemCF RMSE on {split_label} test: {rmse_icf}')

    # Lower RMSE wins; an exact tie is reported as ItemCF.
    if rmse_ucf < rmse_icf:
        print(f"UserCF performs better on {split_label} test")
    else:
        print(f"ItemCF performs better on {split_label} test")

UserCF RMSE on A test: 1.25231492961109
ItemCF RMSE on A test: 1.2804509412458698
UserCF performs better on A test
UserCF RMSE on B test: 1.2442444376297772
ItemCF RMSE on B test: 1.275721574453031
UserCF performs better on B test
