# Content-based Filtering

In [2]:
import pandas as pd

# The .dat files have no header row, so column labels are supplied via `names`.
# The "::" separator is longer than one character, which requires pandas'
# Python parsing engine (keyword `engine`, not `engines`).
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    delimiter='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
users = pd.read_csv(
    'ml-1m/users.dat',
    delimiter='::',
    engine='python',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
)
movies = pd.read_csv(
    'ml-1m/movies.dat',
    delimiter='::',
    engine='python',
    names=['movieId', 'title', 'genres'],
    encoding='ISO-8859-1',
)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the ratings for evaluation; the fixed random_state keeps the
# split identical across re-runs.
train_ratings, test_ratings = train_test_split(ratings, test_size=0.15, random_state=21)

In [32]:
# Display the training split produced by train_test_split.
train_ratings

Unnamed: 0,userId,movieId,rating,timestamp
139957,897,1357,4,975222483
86052,563,2643,1,976046791
383739,2242,3175,5,974596291
988025,5964,1449,4,956997834
806119,4824,300,4,962916212
...,...,...,...,...
81968,543,387,5,976121214
664324,3999,2002,5,965573381
202552,1244,2194,4,974831042
857295,5153,926,3,961970391


In [4]:
# Distinct user ids in the full user table and in the held-out split.
all_users = users["userId"].unique()
users_in_validation = test_ratings["userId"].unique()

print(
    f"There are {len(users_in_validation)} users in test set.",
    f"Total number of users: {len(all_users)}",
    sep="\n",
)

There are 6019 users in test set.
Total number of users: 6040


In [5]:
# Map each movieId to its positional row number in `movies`.
movie_index_by_id = {
    movie_id: position for position, movie_id in enumerate(movies["movieId"])
}

In [6]:
# Genre vocabulary; a genre's position in this list is the index stored for it
# in genre_index_by_name.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
genre_index_by_name = dict(zip(genres, range(len(genres))))

import numpy as np
# Multi-hot genre matrix: one row per movie, one column per genre, set to 1
# wherever the movie's "|"-separated genre string names that genre.
movie_features = np.zeros((len(movies), len(genres)))
for row, genre_string in enumerate(movies["genres"]):
    genre_columns = [genre_index_by_name[name] for name in genre_string.split("|")]
    movie_features[row, genre_columns] = 1

In [7]:
# Number of distinct movie ids that were assigned a row index.
len(movie_index_by_id)

3883

In [8]:
# Genre indicator vector of the first movie in the table.
movie_features[0]

array([0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

## Model

Ridge regression works well when all features contribute a little to the output, while Lasso is preferred when you believe that only a subset of the features is important.

In [9]:
from sklearn.linear_model import Ridge


def train_user_model(user_id, alpha=0.1):
    """Fit a Ridge regression predicting one user's ratings from movie genres.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column of ``train_ratings``.
    alpha : float, default 0.1
        Regularization strength handed to ``Ridge``.

    Returns
    -------
    Ridge
        The model fitted on the genre vectors of the movies this user rated
        in the training split, with the ratings as targets.

    Raises
    ------
    ValueError
        If the user has no rows in ``train_ratings``.
    KeyError
        If a rated movie id is missing from ``movie_index_by_id``.
    """
    user_ratings = train_ratings[train_ratings["userId"] == user_id]
    if user_ratings.empty:
        # Fail with a clear message instead of an opaque error from inside fit().
        raise ValueError(f"user {user_id!r} has no ratings in the training split")

    movie_indexes = [
        movie_index_by_id[movie_id] for movie_id in user_ratings["movieId"]
    ]
    train_data = movie_features[movie_indexes]  # genre rows of the rated movies
    train_label = user_ratings["rating"]  # the ratings this user gave them

    model = Ridge(alpha=alpha)
    model.fit(train_data, train_label)
    return model


# Fit one genre-preference model per user, keyed by user id.
user_model_dict = {
    uid: train_user_model(uid) for uid in users["userId"].unique()
}

In [28]:
def predict(user_id, movie_id):
    """Predict ``user_id``'s rating for ``movie_id``, limited to the 1-5 range.

    Looks up the movie's genre vector, feeds it to the user's model from
    ``user_model_dict`` and returns the scalar prediction; values below 1 or
    above 5 are replaced by 1 and 5 respectively.
    """
    feature_row = movie_features[movie_index_by_id[movie_id]]
    # The model is called with a 2-D batch, so present the vector as one row.
    raw_score = user_model_dict[user_id].predict(feature_row[None, :]).item()

    if raw_score < 1:
        return 1
    if raw_score > 5:
        return 5
    return raw_score

In [34]:
from sklearn.metrics import root_mean_squared_error

def eval_rmse(ratings):
    """Return the RMSE of ``predict`` over every row of ``ratings``.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain ``userId``, ``movieId`` and ``rating`` columns. Columns
        are read by name, so their order in the frame does not matter.

    Returns
    -------
    float
        Root mean squared error between the ``rating`` column and the
        predictions, as a plain Python float.
    """
    user_movie_pairs = zip(ratings["userId"], ratings["movieId"])
    predictions = np.fromiter(
        (predict(user_id, movie_id) for user_id, movie_id in user_movie_pairs),
        dtype=float,
        count=len(ratings),
    )
    rmse = root_mean_squared_error(ratings["rating"], predictions)
    return float(rmse)
    
# Report the error on the data the models were fitted on, then on the held-out split.
train_rmse = eval_rmse(train_ratings)
print(f"RMSE train: {train_rmse}")
validation_rmse = eval_rmse(test_ratings)
print(f"RMSE validation: {validation_rmse}")

RMSE train: 0.9269734276303346
RMSE validation: 1.045546918972278


In [35]:
# Show the learned per-genre weights of a single user's model.
user_id = 1
row_template = "{:15s}: {:.3f}"
genre_weights = zip(genres, user_model_dict[user_id].coef_)
for genre_name, weight in genre_weights:
    print(row_template.format(genre_name, weight))

Action         : 0.563
Adventure      : -0.374
Animation      : -0.219
Children's     : 0.310
Comedy         : 0.347
Crime          : -0.062
Documentary    : 0.000
Drama          : 0.621
Fantasy        : -0.989
Film-Noir      : 0.000
Horror         : 0.000
Musical        : 0.152
Mystery        : 0.000
Romance        : -0.773
Sci-Fi         : 0.763
Thriller       : -0.269
War            : 0.364
Western        : 0.000


In [37]:
user_model_dict[user_id].intercept_.item()  # intercept (bias term) of this user's fitted model

3.715134139612832

Some of these coefficients are high, which suggests that we should recommend movies in those genres to this user. For other users, however, some coefficients may be low purely by chance, and the system will then always avoid suggesting movies in those genres. This is clearly unreasonable: such a value is essentially random, so the system never really learns the user's interest in these movies.