# Collaborative Filtering Baseline Model

In this notebook we build a collaborative filtering model to serve as a baseline

### Imports

In [56]:
import json
import os
import random
import numpy as np
import pandas as pd

random.seed(42)
np.random.seed(42)

### Load Training Data

In [57]:
OUTPUT_DATA_DIR = "./output_data/"

train_df = pd.read_csv(OUTPUT_DATA_DIR+"interactions_training.csv")

In [58]:
pd.set_option('display.max_columns', None)

### Load Validation Data

In [59]:
val_df = pd.read_csv(OUTPUT_DATA_DIR+"interactions_validation.csv")

### Collaborative Filtering - Item to Item Similarity

The predicted rating will be the average of the average rating for the most similar books.

We will be using kNN and so the predicted rating for a book will be the average rating for the `k` closest books

In [60]:
train_df['book_id'] = train_df['book_id'].astype("category")
train_df['user_id'] = train_df['user_id'].astype("category")

In [61]:
import scipy.sparse as sp

item_matrix = train_df.pivot(index='book_id', columns='user_id', values='rating').fillna(0)
item_train_matrix = sp.csr_matrix(item_matrix.values)

We now fit a few KNN models for various values of `k`. Note that there are way more users than books and so we will keep `k` relatively small. We will try `k = [1, 2, 5, 10]` initially.

In [62]:
from sklearn.neighbors import NearestNeighbors

train_item_avg = train_df.groupby(train_df['book_id'], as_index=False)['rating'].mean()
train_item_avg.columns = ['book_id', 'book_average']
train_item_avg = train_item_avg.set_index('book_id')

In [63]:
def build_knn_model(train_matrix, k):
    """Builds a kNN model on `train_matrix` with `k` neighbours.
    
    Parameters
    ----------
    train_matrix: sp.csr_matrix
        The sparse matrix used to build the kNN model.
    k: int
        The number of neighbours to use in the kNN model.
    
    Returns
    -------
    NearestNeighbors
        A NearestNeighbors model fit to `train_matrix`.
    
    """
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k)
    model_knn.fit(train_matrix)
    return model_knn

In [64]:
def get_item_preds_from_knn(knn_model, train_matrix, items, item_avgs):
    """Gets the kNN predictions for the items in `items`.
    
    This assumes that every item in items was fit on the
    knn_model. This is just a precomputation step to get
    the predictions for items in the training set.
    
    Parameters
    ----------
    knn_model: NearestNeighbors
        A NearestNeighbors model that has been fit.
    train_matrix: sp.csr_matrix
        The sparse matrix representing the training data.
    items: np.array
        An array of item indices for items in `knn_model`.
    item_avgs: pd.DataFrame
        A pandas dataframe containing the average rating for
        each item in `items`.
    
    Returns
    -------
    pd.DataFrame
        A DataFrame containing the predicted rating for each item
        in `items`.
    
    """
    item_neighbors = np.asarray(knn_model.kneighbors(train_matrix, return_distance=False))
    knn_avgs = np.zeros(len(item_neighbors))   # this is more efficient than appending multiple times (no resizing)
    for i in range(len(item_neighbors)):
        knn_avgs[i] = item_avgs['book_average'][items[item_neighbors[i]]].mean()    # average of average ratings for neighbors
    return pd.concat([pd.DataFrame(items, columns=['book_id']),
                      pd.DataFrame(knn_avgs, columns=['book_rating'])],
                    axis=1)

In [77]:
def predict_ratings(X, item_preds, default_val, merge_col):
    """Predicts the item ratings for the items in `X`.
    
    Parameters
    ----------
    X: pd.DataFrame
        The DataFrame of features.
    item_preds: pd.DataFrame
        The DataFrame of predicted ratings for the items.
    default_val: float
        A default rating used for unseen items.
    merge_col: str
        The column to merge on.
    
    Returns
    -------
    pd.DataFrame
        A DataFrame containing the predicted item ratings for
        the records in `X`.
    
    """
    id_col = "{}_id".format(merge_col)
    rating_col = "{}_rating".format(merge_col)
    df_item = pd.merge(X, item_preds, how='left', on=[id_col])
    df_item[rating_col] = df_item[rating_col].fillna(default_val)
    df_item.index = X.index
    return df_item[rating_col].apply(lambda x: 1 if x > 3 else 0)

In [81]:
def get_item_knn_train_validation_preds(train_df, val_df, train_matrix, k, items, item_avgs):
    """Gets predictions on `train_df` and `val_df` from a kNN model.
    
    Parameters
    ----------
    train_df: pd.DataFrame
        A DataFrame of the training data.
    val_df: pd.DataFrame
        A DataFrame of the validation data.
    train_matrix: sp.csr_matrix
        The sparse matrix used to train the kNN model.
    k: int
        The number of neighbours in the kNN model.
    items: np.array
        An array of strings representing the ids of the
        items used in training.
    item_avgs: pd.DataFrame
        A DataFrame containing the average rating for the
        items in `items`.
    
    Returns
    -------
    np.array, np.array
        Arrays of predictions on the training and validation sets, respectively.
    
    """
    knn_model = build_knn_model(train_matrix, k)
    knn_preds = get_item_preds_from_knn(knn_model, train_matrix, items, item_avgs)
    
    # prediction for a new book
    new_book_vec = np.zeros(train_matrix.shape[1])
    new_book_neighbours = knn_model.kneighbors(new_book_vec.reshape(1, -1), return_distance=False)
    new_book_pred = item_avgs['book_average'][items[new_book_neighbours[0]]].mean()
    
    train_pred = predict_ratings(train_df, knn_preds, new_book_pred, "book")
    val_pred = predict_ratings(val_df, knn_preds, new_book_pred, "book")
    return train_pred, val_pred

In [122]:
from sklearn.metrics import roc_auc_score

k_vals = [1, 2, 5, 10]
train_MSEs = [None for _ in range(4)]
val_MSEs = [None for _ in range(4)]

for i in range(len(k_vals)):
    k = k_vals[i]
    print("kNN with k = {}".format(k))
    print("---------------")
    train_preds, val_preds = get_item_knn_train_validation_preds(
        train_df, val_df, item_train_matrix, k, item_matrix.index, train_item_avg)
    train_MSEs[i] = roc_auc_score(train_preds, train_df['recommended'])
    val_MSEs[i] = roc_auc_score(val_preds, val_df['recommended'])
    print("Training AUC: {}".format(train_MSEs[i]))
    print("Validation AUC: {}".format(val_MSEs[i]))
    print()

kNN with k = 1
---------------
Training AUC: 0.6773718489378283
Validation AUC: 0.6615090992379128

kNN with k = 2
---------------
Training AUC: 0.6394047663192072
Validation AUC: 0.6330733011470107

kNN with k = 5
---------------
Training AUC: 0.6450947447633038
Validation AUC: 0.6410409395836281

kNN with k = 10
---------------
Training AUC: 0.655245438999922
Validation AUC: 0.6552284090631739



The different values of k don't seem to make too much difference

In [123]:
RESULTS_DIR = './results/'

if not os.path.exists(RESULTS_DIR):
    os.makedirs(RESULTS_DIR)

In [127]:
item_item_cf = pd.DataFrame({'k': k_vals,
                             'trainMSE': train_MSEs,
                             'valMSE': val_MSEs})
item_item_cf.to_csv(RESULTS_DIR+"itemToItemCF.csv", index=False)

### Collaborative Filtering - User to User Similarity

The predicted rating will be the average of the average rating for the most similar users.

We will be using kNN and so the predicted rating for a user will be the average rating for the `k` closest users

Using the full dataset there are too many users. So we subsample instead.

In [128]:
sub_train_df = train_df.sample(frac=0.2)

In [129]:
user_matrix = sub_train_df.pivot(index='user_id', columns='book_id', values='rating').fillna(0)
user_train_matrix = sp.csr_matrix(user_matrix.values)

We now fit a few KNN models for various values of `k`. Note that there are way more users than books and so we can afford a larger value of `k`. We will try `k = [1, 2, 5, 10, 20, 50]` initially.

In [130]:
train_user_avg = sub_train_df.groupby(sub_train_df['user_id'], as_index=False)['rating'].mean()
train_user_avg.columns = ['user_id', 'user_average']
train_user_avg = train_user_avg.set_index('user_id')

In [131]:
def get_user_preds_from_knn(knn_model, train_matrix, users, user_avgs):
    """Gets the kNN predictions for the user in `users`.
    
    This assumes that `knn_model` was fit on every user in 
    `users`. This is just a precomputation step to get
    the predictions for users in the training set.
    
    Parameters
    ----------
    knn_model: NearestNeighbors
        A NearestNeighbors model that has been fit.
    train_matrix: sp.csr_matrix
        The sparse matrix representing the training data.
    users: np.array
        An array of user ids for users in `knn_model`.
    user_avgs: pd.DataFrame
        A pandas dataframe containing the average rating for
        each user in `users`.
    
    Returns
    -------
    pd.DataFrame
        A DataFrame containing the predicted rating for each user
        in `users`.
    
    """
    user_neighbors = np.asarray(knn_model.kneighbors(train_matrix, return_distance=False))
    knn_avgs = np.zeros(len(user_neighbors))   # this is more efficient than appending multiple times (no resizing)
    for i in range(len(user_neighbors)):
        knn_avgs[i] = user_avgs['user_average'][users[user_neighbors[i]]].mean()    # average of average ratings for neighbors
    return pd.concat([pd.DataFrame(users, columns=['user_id']),
                      pd.DataFrame(knn_avgs, columns=['user_rating'])],
                    axis=1)

In [132]:
def get_user_knn_train_validation_preds(train_df, val_df, train_matrix, k, users, user_avgs):
    """Gets predictions on `train_df` and `val_df` from a kNN model.
    
    Parameters
    ----------
    train_df: pd.DataFrame
        A DataFrame of the training data.
    val_df: pd.DataFrame
        A DataFrame of the validation data.
    train_matrix: sp.csr_matrix
        The sparse matrix used to train the kNN model.
    k: int
        The number of neighbours in the kNN model.
    users: np.array
        An array of strings representing the ids of the
        users used in training.
    user_avgs: pd.DataFrame
        A DataFrame containing the average rating for the
        users in `users`.
    
    Returns
    -------
    np.array, np.array
        Arrays of predictions on the training and validation sets, respectively.
    
    """
    knn_model = build_knn_model(train_matrix, k)
    knn_preds = get_user_preds_from_knn(knn_model, train_matrix, users, user_avgs)
    
    # prediction for a new user
    new_user_vec = np.zeros(train_matrix.shape[1])
    new_user_neighbours = knn_model.kneighbors(new_user_vec.reshape(1, -1), return_distance=False)
    new_user_pred = user_avgs['user_average'][users[new_user_neighbours[0]]].mean()
    
    train_pred = predict_ratings(train_df, knn_preds, new_user_pred, "user")
    val_pred = predict_ratings(val_df, knn_preds, new_user_pred, "user")
    return train_pred, val_pred

In [None]:
k_vals = [1, 2, 5, 10]
train_MSEs = [None for _ in range(4)]
val_MSEs = [None for _ in range(4)]

for i in range(len(k_vals)):
    k = k_vals[i]
    print("kNN with k = {}".format(k))
    print("---------------")
    train_preds, val_preds = get_user_knn_train_validation_preds(
        sub_train_df, val_df, user_train_matrix, k, user_matrix.index, train_user_avg)
    train_MSEs[i] = roc_auc_score(train_preds, sub_train_df['recommended'])
    val_MSEs[i] = roc_auc_score(val_preds, val_df['recommended'])
    print("Training AUC: {}".format(train_MSEs[i]))
    print("Validation AUC: {}".format(val_MSEs[i]))
    print()

kNN with k = 1
---------------


In [None]:
user_user_cf = pd.DataFrame({'k': k_vals,
                             'trainMSE': train_MSEs,
                             'valMSE': val_MSEs})

user_user_cf.to_csv(RESULTS_DIR+"userToUserCF.csv", index=False)