In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
%matplotlib inline

In [3]:
# Recreate the data from the book

data = [
    [1, 7, 6, 7, 4, 5, 4],
    [2, 6, 7, np.NaN, 4, 3, 4],
    [3, np.NaN, 3, 3, 1, 1, np.NaN],
    [4, 1, 2, 2, 3, 3, 4],
    [5, 1, np.NaN, 1, 2, 3, 3]
]

In [8]:
ratings = pd.DataFrame(data, columns=['userId', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6'])
ratings = ratings.set_index('userId')
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,7.0,6.0,7.0,4.0,5.0,4.0
2.0,6.0,7.0,,4.0,3.0,4.0
3.0,,3.0,3.0,1.0,1.0,
4.0,1.0,2.0,2.0,3.0,3.0,4.0
5.0,1.0,,1.0,2.0,3.0,3.0


In [9]:
# Calculate average ratings for all users

users_avg_rating = ratings.mean(axis=1)
users_avg_rating

userId
1.0    5.5
2.0    4.8
3.0    2.0
4.0    2.5
5.0    2.0
dtype: float64

In [10]:
# Normalize the ratings (make the mean value 0); this is optional but according to the book, predictions made
# with zeroed mean value tend to be a bit better

normalized_ratings = ratings.apply(lambda x: x - users_avg_rating[x.name], axis=1)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,1.5,0.5,1.5,-1.5,-0.5,-1.5
2.0,1.2,2.2,,-0.8,-1.8,-0.8
3.0,,1.0,1.0,-1.0,-1.0,
4.0,-1.5,-0.5,-0.5,0.5,0.5,1.5
5.0,-1.0,,-1.0,0.0,1.0,1.0


In [11]:
normalized_ratings = normalized_ratings.fillna(0)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,1.5,0.5,1.5,-1.5,-0.5,-1.5
2.0,1.2,2.2,0.0,-0.8,-1.8,-0.8
3.0,0.0,1.0,1.0,-1.0,-1.0,0.0
4.0,-1.5,-0.5,-0.5,0.5,0.5,1.5
5.0,-1.0,0.0,-1.0,0.0,1.0,1.0


In [13]:
ratings_matrix = normalized_ratings.as_matrix()
ratings_matrix

  """Entry point for launching an IPython kernel.


array([[ 1.5,  0.5,  1.5, -1.5, -0.5, -1.5],
       [ 1.2,  2.2,  0. , -0.8, -1.8, -0.8],
       [ 0. ,  1. ,  1. , -1. , -1. ,  0. ],
       [-1.5, -0.5, -0.5,  0.5,  0.5,  1.5],
       [-1. ,  0. , -1. ,  0. ,  1. ,  1. ]])

In [14]:
from scipy.sparse.linalg import svds

In [30]:
U, sigma, Vt = svds(ratings_matrix, k = 4)

In [31]:
sigma = np.diag(sigma)
sigma

array([[0.97148121, 0.        , 0.        , 0.        ],
       [0.        , 1.47932936, 0.        , 0.        ],
       [0.        , 0.        , 2.21057331, 0.        ],
       [0.        , 0.        , 0.        , 5.07669879]])

In [32]:
np.dot(np.dot(U, sigma), Vt) + np.reshape(users_avg_rating.values, (-1, 1))

array([[6.97828694, 6.01525884, 7.00784887, 4.01189351, 5.00955447,
        3.97715738],
       [5.98679062, 7.00928288, 4.80477494, 4.00723554, 3.00581256,
        3.98610344],
       [2.02370259, 2.98334302, 2.99143195, 0.98701671, 0.98957007,
        2.02493565],
       [0.96395184, 2.02533283, 2.01303074, 3.01974568, 3.01586239,
        3.96207653],
       [1.0082263 , 1.99421897, 0.99702634, 1.99549398, 2.99638015,
        3.00865425]])

In [28]:
data

array([[ 1.,  7.,  6.,  7.,  4.,  5.,  4.],
       [ 2.,  6.,  7., nan,  4.,  3.,  4.],
       [ 3., nan,  3.,  3.,  1.,  1., nan],
       [ 4.,  1.,  2.,  2.,  3.,  3.,  4.],
       [ 5.,  1., nan,  1.,  2.,  3.,  3.]])

In [29]:
# Well, seems to be pretty much recreted. Let's try predicting now ...

In [66]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [133]:
ratings.reset_index()['']

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,7.0,6.0,7.0,4.0,5.0,4.0
2.0,6.0,7.0,,4.0,3.0,4.0
3.0,,3.0,3.0,1.0,1.0,
4.0,1.0,2.0,2.0,3.0,3.0,4.0
5.0,1.0,,1.0,2.0,3.0,3.0


In [37]:
ratings.reset_index()

Unnamed: 0,userId,m_1,m_2,m_3,m_4,m_5,m_6
0,1.0,7.0,6.0,7.0,4.0,5.0,4.0
1,2.0,6.0,7.0,,4.0,3.0,4.0
2,3.0,,3.0,3.0,1.0,1.0,
3,4.0,1.0,2.0,2.0,3.0,3.0,4.0
4,5.0,1.0,,1.0,2.0,3.0,3.0


In [55]:
ratings_long = pd.melt(ratings.reset_index(), id_vars=['userId'], var_name='movieId', value_name='rating').dropna()
ratings_long.head(5)

Unnamed: 0,userId,movieId,rating
0,1.0,m_1,7.0
1,2.0,m_1,6.0
3,4.0,m_1,1.0
4,5.0,m_1,1.0
5,1.0,m_2,6.0


In [128]:
# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings_long[['userId', 'movieId', 'rating']], Reader())

In [129]:
svd = SVD()

In [130]:
cross_validate(SVD(), data, cv=2)

{'test_rmse': array([1.80013462, 1.76322644]),
 'test_mae': array([1.39103203, 1.36700501]),
 'fit_time': (0.0012319087982177734, 0.001222848892211914),
 'test_time': (0.00016307830810546875, 0.00016427040100097656)}

In [131]:
trainset = data.build_full_trainset()

In [132]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x127666400>

In [134]:
svd.predict(3.0, 'm_1')

Prediction(uid=3.0, iid='m_1', r_ui=None, est=3.0512330356394424, details={'was_impossible': False})

In [135]:
svd.predict(3, 'm_1')

Prediction(uid=3, iid='m_1', r_ui=None, est=3.0512330356394424, details={'was_impossible': False})

In [136]:
svd.predict('3', 'm_1')

Prediction(uid='3', iid='m_1', r_ui=None, est=3.520084941585588, details={'was_impossible': False})

In [137]:
svd.predict('3.0', 'm_1')

Prediction(uid='3.0', iid='m_1', r_ui=None, est=3.520084941585588, details={'was_impossible': False})

In [138]:
svd.predict('3.0', 'm_2')

Prediction(uid='3.0', iid='m_2', r_ui=None, est=3.7192391895157115, details={'was_impossible': False})

In [145]:
svd.predict(1, 'm_1')

Prediction(uid=1, iid='m_1', r_ui=None, est=5, details={'was_impossible': False})

In [95]:
svd.predict(3, 2, verbose=True)

user: 33         item: 2          r_ui = None   est = 3.46   {'was_impossible': False}


Prediction(uid=33, iid=2, r_ui=None, est=3.4615384615384617, details={'was_impossible': False})

In [101]:
trainset.all_ratings()

<generator object Trainset.all_ratings at 0x12372b048>

In [102]:
ratings_long[['userId', 'movieId', 'rating']]

Unnamed: 0,userId,movieId,rating
0,1.0,m_1,7.0
1,2.0,m_1,6.0
3,4.0,m_1,1.0
4,5.0,m_1,1.0
5,1.0,m_2,6.0
6,2.0,m_2,7.0
7,3.0,m_2,3.0
8,4.0,m_2,2.0
10,1.0,m_3,7.0
12,3.0,m_3,3.0


In [108]:
trainset.ur

defaultdict(list,
            {0: [(0, 7.0), (1, 6.0), (2, 7.0), (3, 4.0), (4, 5.0), (5, 4.0)],
             1: [(0, 6.0), (1, 7.0), (3, 4.0), (4, 3.0), (5, 4.0)],
             2: [(0, 1.0), (1, 2.0), (2, 2.0), (3, 3.0), (4, 3.0), (5, 4.0)],
             3: [(0, 1.0), (2, 1.0), (3, 2.0), (4, 3.0), (5, 3.0)],
             4: [(1, 3.0), (2, 3.0), (3, 1.0), (4, 1.0)]})

In [112]:
trainset.all_users()

range(0, 5)

In [115]:
trainset.knows_user(4)

True

In [118]:
data = Dataset.load_builtin('ml-100k')

In [119]:
data

<surprise.dataset.DatasetAutoFolds at 0x1275ceb38>

In [120]:
trainset = data.build_full_trainset()

In [123]:
trainset.all_ratings()

TypeError: 'generator' object is not subscriptable

In [147]:
from surprise import prediction_algorithms

In [148]:
algo = prediction_algorithms.knns.KNNBasic()

In [149]:
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x129f8c0f0>

In [154]:
algo.predict(3, 'm_2')

Prediction(uid=3, iid='m_2', r_ui=None, est=3.2143100898646346, details={'actual_k': 4, 'was_impossible': False})