# Recommender with Naïve Bayes

## Test with a toy data

In [1]:
import numpy as np

X_train = np.array([[0, 1, 1],
                    [0, 0, 1],
                    [0, 0, 0],
                    [1, 1, 0]])
y_train = ['Y', 'N', 'Y', 'Y']
X_test = np.array([[1, 1, 0]])

In [2]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, y_train)

BernoulliNB()

In [4]:
pred_prob = clf.predict_proba(X_test)
print('Predicted probabilites:\n', pred_prob)

Predicted probabilites:
 [[0.07896399 0.92103601]]


In [5]:
pred = clf.predict(X_test)
print('Prediction:', *pred)

Prediction: Y


## Get the data

In [6]:
import os
from collections import defaultdict
data_path = "./datasets/ml-1m"
rating_data = os.path.join(data_path, 'ratings.dat')
n_users = 6040
n_movies = 3952

In [10]:
def load_rating_data(data_path, n_users, n_movies):
    """
    Load rating data from file and also return the number of 
    ratings for each movie and movie_id index mapping
    @param data_path: path of the rating data file
    @param n_users: number of users
    @param n_movies: number of movies that have ratings
    @return: rating data in the numpy array of [user, movie];
             movie_n_rating, {movie_id: number of ratings};
             movie_id_mapping, {movie_id: column index in rating data}
    """
    data = np.zeros([n_users, n_movies], dtype=np.float32)
    movie_id_mapping = {}
    movie_n_rating = defaultdict(int)
    with open(data_path, 'r') as file:
        for line in file.readlines()[1:]:
            user_id, movie_id, rating, _ = line.split("::")
            user_id = int(user_id) - 1
            
            if movie_id not in movie_id_mapping:
                movie_id_mapping[movie_id] = len(movie_id_mapping)
            rating = int(rating)
            data[user_id, movie_id_mapping[movie_id]] = rating
            
            if rating > 0: 
                movie_n_rating[movie_id] += 1
    return data, movie_n_rating, movie_id_mapping

In [14]:
data, movie_n_rating, movie_id_mapping = load_rating_data(rating_data, n_users, n_movies)

In [15]:
def display_distribution(data):
    values, counts = np.unique(data, return_counts=True)
    for value, count in zip(values, counts):
        print(f"Number of rating {int(value)}: {count}")

In [16]:
display_distribution(data)

Number of rating 0: 21384032
Number of rating 1: 56174
Number of rating 2: 107557
Number of rating 3: 261197
Number of rating 4: 348971
Number of rating 5: 226309


we take the movie with the most known ratings as 
our target movie:

In [22]:
movie_id_most, n_rating_most = sorted(movie_n_rating.items(), key=lambda d: d[1], reverse=True)[0]
print(f"Movie ID {movie_id_most} has {n_rating_most} ratings.")

Movie ID 2858 has 3428 ratings.


In [23]:
X_raw = np.delete(data, movie_id_mapping[movie_id_most], axis=1)
y_raw = data[:, movie_id_mapping[movie_id_most]]

discard samples without a rating in movie ID 2858

In [29]:
X = X_raw[y_raw > 0]
y = y_raw[y_raw > 0]
print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

Shape of X: (3428, 3705)
Shape of y: (3428,)


In [32]:
display_distribution(X)

Number of rating 0: 11986488
Number of rating 1: 39995
Number of rating 2: 79832
Number of rating 3: 189550
Number of rating 4: 250231
Number of rating 5: 154644


Consider movies with ratings greater than 3 as being liked (being 
recommended)

In [35]:
recommend = 3
y[y <= recommend] = 0
y[y > recommend] = 1
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
print(f"{n_pos} positive samples and {n_neg} negative samples.")

2853 positive samples and 575 negative samples.


In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
len(X_train), len(X_test), len(y_train), len(y_test)

(2742, 686, 2742, 686)

Notice that the 
values of the input features are from 0 to 5, as opposed to 0 or 1 in our toy example. 
Hence, we use the MultinomialNB module

In [39]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [41]:
prediction_prob = clf.predict_proba(X_test)
prediction_prob[:10]

array([[7.50487439e-23, 1.00000000e+00],
       [1.01806208e-01, 8.98193792e-01],
       [3.57740570e-10, 1.00000000e+00],
       [1.00000000e+00, 2.94095407e-16],
       [1.00000000e+00, 2.49760836e-25],
       [7.62630220e-01, 2.37369780e-01],
       [3.47479627e-05, 9.99965252e-01],
       [2.66075292e-11, 1.00000000e+00],
       [5.88493563e-10, 9.99999999e-01],
       [9.71326867e-09, 9.99999990e-01]])

In [51]:
prob = clf.predict(X_test)
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.]], dtype=float32)

In [47]:
accuracy = clf.score(X_test, y_test)
print(f"the accuracy is {accuracy*100:.1f}%")

the accuracy is 71.6%
