In [1]:
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



In [165]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1', parse_dates=True) 

r_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

m_cols = ['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

movie_ratings = pd.merge(movies, ratings)
df = pd.merge(movie_ratings, users)

df.head(2)

Unnamed: 0,item_id,title,release_date,video_release_date,imdb_url,user_id,rating,timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,308,5,887737890,60,M,retired,95076


In [166]:
df.drop(df.columns[[3,4,7]], axis=1, inplace=True)
ratings.drop( 'timestamp', inplace = True, axis = 1 ) 
movies.drop(movies.columns[[3,4]], inplace = True, axis = 1 )

# User-based

In [167]:
ratings_pivot = ratings.pivot_table(index=['item_id'],columns=['user_id'],values='rating').reset_index(drop=True)
ratings_pivot.fillna( 0, inplace = True )
ratings_pivot.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
movie_similarity = 1 - pairwise_distances( ratings_pivot.as_matrix(), metric="cosine" )
ratings_matrix = pd.DataFrame( movie_similarity )
ratings_matrix.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
1,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
2,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
3,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
4,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


In [169]:
user_inp="Star Trek: The Wrath of Khan (1982)"
inp=movies[movies['title']==user_inp].index.tolist()
inp=inp[0]

movies['similarity'] = ratings_matrix.iloc[inp]
movies.columns = ['movie_id', 'title', 'release_date','similarity']
movies.head(2)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.547205
1,2,GoldenEye (1995),01-Jan-1995,0.546141


In [170]:
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False )[1:10])

Recommended movies based on your choice of  Star Trek: The Wrath of Khan (1982) : 
      movie_id                                          title release_date  \
228       229     Star Trek III: The Search for Spock (1984)  01-Jan-1984   
229       230           Star Trek IV: The Voyage Home (1986)  01-Jan-1986   
226       227  Star Trek VI: The Undiscovered Country (1991)  01-Jan-1991   
171       172                Empire Strikes Back, The (1980)  01-Jan-1980   
175       176                                  Aliens (1986)  01-Jan-1986   
173       174                 Raiders of the Lost Ark (1981)  01-Jan-1981   
194       195                         Terminator, The (1984)  01-Jan-1984   
81         82                           Jurassic Park (1993)  01-Jan-1993   
95         96              Terminator 2: Judgment Day (1991)  01-Jan-1991   

     similarity  
228    0.794698  
229    0.781926  
226    0.742411  
171    0.700934  
175    0.697722  
173    0.695526  
194    0.691278  
8

Recommendations seem to work well

# Item-based

In [171]:
ratings_pivot = ratings.pivot_table(index=['user_id'],columns=['item_id'],values='rating').reset_index(drop=True)
ratings_pivot.fillna( 0, inplace = True )
ratings_pivot.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
movie_similarity = 1 - pairwise_distances( ratings_pivot.as_matrix(), metric="cosine" )
ratings_matrix = pd.DataFrame( movie_similarity )
ratings_matrix.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
2,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [173]:
user_inp="Star Trek: The Wrath of Khan (1982)"
inp=movies[movies['title']==user_inp].index.tolist()
inp=inp[0]

movies['similarity'] = ratings_matrix.iloc[inp]
movies.columns = ['movie_id', 'title', 'release_date','similarity']
movies.head(2)

Unnamed: 0,movie_id,title,release_date,similarity
0,1,Toy Story (1995),01-Jan-1995,0.059926
1,2,GoldenEye (1995),01-Jan-1995,0.201674


In [174]:
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False )[1:10])

Recommended movies based on your choice of  Star Trek: The Wrath of Khan (1982) : 
      movie_id                                title release_date  similarity
855       856                Night on Earth (1991)  01-Jan-1991    0.374034
104       105                    Sgt. Bilko (1996)  29-Mar-1996    0.369914
625       626           So Dear to My Heart (1949)  01-Jan-1949    0.363280
240       241     Last of the Mohicans, The (1992)  01-Jan-1992    0.330089
257       258                       Contact (1997)  11-Jul-1997    0.326877
645       646  Once Upon a Time in the West (1969)  01-Jan-1969    0.324176
409       410                       Kingpin (1996)  12-Jul-1996    0.323503
844       845            That Thing You Do! (1996)  28-Sep-1996    0.321609
826       827                      Daylight (1996)  06-Dec-1996    0.319978


For some reason I could not get the item based recommendations working on my own.  
For this reason I fell back to some example code I found on github.

# Second attempt
using mostly the example code from [here](https://github.com/georgezoto/recommender-systems-in-python/blob/master/Implementing%20your%20own%20recommender%20systems%20in%20Python%20by%20Agnes%20Johannsdottir/Implementing%20your%20own%20recommender%20systems%20in%20Python.ipynb)

In [201]:
# Split data
train_data, test_data = cv.train_test_split(df, test_size=0.20)

In [202]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
train_data_matrix = np.zeros((n_users, n_items))
for i, line in train_data.iterrows():
    train_data_matrix[line['user_id']-1, line['item_id']-1] = line['rating']

test_data_matrix = np.zeros((n_users, n_items))
for i, line in test_data.iterrows():
    test_data_matrix[line['user_id']-1, line['item_id']-1] = line['rating']

In [203]:
train_data_matrix

array([[5., 3., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [204]:
n_items

1682

In [205]:
n_users

943

## Evalute performance

In [206]:
%%timeit x = range(10)
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')

40.9 ms ± 1.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [207]:
%%timeit x = range(10)
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

67.6 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Item-based is obviously slower than user-based because there are more items than users which increases the amount of comparisons necessary. 

In [208]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [209]:
pd.DataFrame(user_similarity).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,0.0,0.839556,0.945595,0.943054,0.723745,0.651838,0.629787,0.734427,0.906726,0.699859,...,0.735665,0.90953,0.758396,0.848561,0.832246,0.862319,0.716768,0.866536,0.820158,0.702575
1,0.839556,0.0,0.871041,0.886615,0.928335,0.776843,0.91128,0.907327,0.843702,0.873067,...,0.86387,0.689123,0.673745,0.628225,0.7008,0.842241,0.800647,0.881222,0.826506,0.930278
2,0.945595,0.871041,0.0,0.701709,0.973851,0.930224,0.940335,0.975111,0.979335,0.936385,...,0.959974,0.951022,0.856191,0.920577,0.914409,0.985468,0.845749,0.95475,0.859718,0.980653
3,0.943054,0.886615,0.701709,0.0,0.985892,0.962414,0.920516,0.887525,1.0,0.976887,...,0.957343,0.951065,0.918696,0.826864,0.871725,0.956441,0.841992,0.918621,0.853894,0.965206
4,0.723745,0.928335,0.973851,0.985892,0.0,0.788711,0.719025,0.799994,0.93708,0.827309,...,0.727977,0.937036,0.918993,0.954404,0.909823,0.94887,0.769782,0.894373,0.843532,0.761754


In [210]:
pd.DataFrame(item_similarity).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.668322,0.746073,0.604142,0.771113,0.894057,0.510057,0.617167,0.599482,0.741485,...,0.960656,1.0,1.0,1.0,0.960656,1.0,1.0,1.0,0.947541,0.947541
1,0.668322,0.0,0.757908,0.613534,0.741697,0.899999,0.700067,0.678672,0.775398,0.874193,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.914943
2,0.746073,0.757908,0.0,0.730299,0.810908,0.995739,0.744542,0.828969,0.760239,0.877498,...,1.0,1.0,1.0,1.0,0.96255,1.0,1.0,1.0,1.0,0.887649
3,0.604142,0.613534,0.730299,0.0,0.746437,0.889564,0.634678,0.554478,0.646668,0.788033,...,1.0,1.0,1.0,0.895629,0.958252,1.0,1.0,1.0,0.937378,1.0
4,0.771113,0.741697,0.810908,0.746437,0.0,0.983349,0.724513,0.765387,0.772266,0.950046,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.890236


# Prediction

In [211]:
def predictUser(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pred

def predictItem(ratings, similarity):
    return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [212]:
user_prediction = predictUser(train_data_matrix, user_similarity)
item_prediction = predictItem(train_data_matrix, item_similarity)

In [213]:
ratings_matrix_user = pd.DataFrame(user_prediction)
ratings_matrix_user.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.689873,0.620365,0.50686,0.856012,0.510688,0.384106,1.462456,0.948436,1.206832,0.601101,...,0.317902,0.318578,0.314783,0.316783,0.317461,0.316023,0.318502,0.317263,0.316957,0.31716
1,1.42652,0.33321,0.159022,0.607316,0.174322,0.020384,1.196285,0.692325,0.895691,0.255816,...,-0.056135,-0.054681,-0.059412,-0.057394,-0.056263,-0.058548,-0.056819,-0.057683,-0.056102,-0.056054
2,1.436097,0.277867,0.11513,0.557062,0.121465,-0.024717,1.172713,0.647133,0.882332,0.222201,...,-0.105942,-0.10472,-0.109296,-0.107519,-0.106147,-0.108532,-0.107004,-0.107768,-0.105889,-0.105909
3,1.383434,0.239222,0.082409,0.516002,0.086488,-0.057223,1.131267,0.60231,0.843746,0.18925,...,-0.138007,-0.1369,-0.141197,-0.13911,-0.137932,-0.140382,-0.138752,-0.139567,-0.138066,-0.137824
4,1.478996,0.390506,0.274415,0.64152,0.276326,0.154798,1.264433,0.727614,1.014178,0.380928,...,0.078415,0.079102,0.075288,0.077399,0.078319,0.076521,0.078987,0.077754,0.07789,0.078024


In [214]:
ratings_matrix_item = pd.DataFrame(item_prediction)
ratings_matrix_item.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.386586,0.404578,0.431096,0.387222,0.434435,0.441169,0.39121,0.39951,0.403085,0.424726,...,0.472619,0.470224,0.475312,0.473416,0.469851,0.478005,0.478005,0.478005,0.465649,0.466073
1,0.101338,0.119332,0.114449,0.113043,0.118928,0.116512,0.103407,0.112353,0.104908,0.110321,...,0.120969,0.122171,0.121951,0.119997,0.120676,0.119329,0.119329,0.119329,0.121829,0.121719
2,0.069592,0.073603,0.071344,0.071713,0.072595,0.0724,0.067377,0.072835,0.069253,0.071161,...,0.072415,0.072947,0.072576,0.069566,0.071976,0.0696,0.0696,0.0696,0.073297,0.073159
3,0.036339,0.039093,0.038284,0.03836,0.038648,0.039312,0.03612,0.038546,0.037209,0.038667,...,0.038822,0.039022,0.039262,0.03886,0.039034,0.036297,0.036297,0.036297,0.038828,0.039582
4,0.194287,0.19489,0.215582,0.193402,0.215204,0.236679,0.202134,0.19995,0.213777,0.22303,...,0.238619,0.237533,0.239738,0.239751,0.238839,0.24184,0.24184,0.24184,0.235692,0.236549


# Evaluate

In [215]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [216]:
print('User-based CF RMSE:', rmse(user_prediction, test_data_matrix))
print('Item-based CF RMSE:', rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 3.092080050853013
Item-based CF RMSE: 3.436479839248711


Too me an error of 3 and above seems really high.

# Model-based

In [217]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

## Evalutate performance

In [218]:
%%timeit x = range(10)
u, s, vt = svds(train_data_matrix, k = 20)

83.9 ms ± 9.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


SVD is even slower than the previous techniques.

In [219]:
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('SVD MSE:', rmse(X_pred, test_data_matrix))

SVD MSE: 2.6455781985602567


It seems to perform better than collaborative filtering.

# Movielens 1M

In [225]:
users_1M = pd.read_csv('ml-1m/users.dat', sep='::', names=['user_id', 'sex', 'age','occupation', 'zip_code'],
                    encoding='latin-1', parse_dates=True) 
ratings_1M = pd.read_csv('ml-1m/ratings.dat', sep='::', names=r_cols,
                      encoding='latin-1')
movies_1M = pd.read_csv('ml-1m/movies.dat', sep='::', names=['item_id', 'title','genre'], usecols=range(5),
                     encoding='latin-1')

movie_ratings_1M = pd.merge(movies_1M, ratings_1M)
df_1M = pd.merge(movie_ratings_1M, users_1M)

df_1M.head(2)

  
  after removing the cwd from sys.path.
  


Unnamed: 0,item_id,title,genre,user_id,rating,timestamp,sex,age,occupation,zip_code
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1,5,978824351,F,1,10,48067


In [242]:
df_1M.item_id.unique().shape[0]

3706

In [248]:
df_1M.sort_values('item_id').tail(10)

Unnamed: 0,item_id,title,user_id,rating,sex,age,occupation,zip_code
634218,3952,"Contender, The (2000)",319,3,F,50,6,33436
977801,3952,"Contender, The (2000)",1653,2,M,18,0,94044
977776,3952,"Contender, The (2000)",1625,4,M,45,0,4330
946754,3952,"Contender, The (2000)",2473,5,M,25,12,98105
196447,3952,"Contender, The (2000)",1943,3,M,18,4,91501
699068,3952,"Contender, The (2000)",1701,4,F,25,4,97233
920605,3952,"Contender, The (2000)",850,5,M,35,0,60640
935385,3952,"Contender, The (2000)",1197,5,M,35,1,44077
805228,3952,"Contender, The (2000)",1626,1,M,18,4,10010
928294,3952,"Contender, The (2000)",1436,4,F,50,6,97005


The 1M dataset does not have strictly ascending item_ids. This will not cause any problems at all...

In [230]:
df_1M.drop(df_1M.columns[[2,5]], axis=1, inplace=True)
ratings_1M.drop( 'timestamp', inplace = True, axis = 1 ) 
movies_1M.drop('genre', inplace = True, axis = 1 )

In [241]:
n_users = df_1M.user_id.unique().shape[0]
n_items = 3952
data_matrix = np.zeros((n_users, n_items))
for i, line in df_1M.iterrows():
    data_matrix[line['user_id']-1, line['item_id']-1] = line['rating']

In [249]:
data_matrix.shape

(6040, 3952)

In [250]:
%%timeit x = range(10)
user_similarity = pairwise_distances(data_matrix, metric='cosine')

1.61 s ± 52.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [251]:
%%timeit x = range(10)
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

65.5 ms ± 2.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


As expected now that there are more users than items, the item based filter is faster.

In [252]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
print('User-based CF RMSE:', rmse(user_prediction, data_matrix))
print('Item-based CF RMSE:', rmse(item_prediction, data_matrix))

IndexError: index 943 is out of bounds for axis 0 with size 943

My disappointment is immeasurable, and my day is ruined.

In [253]:
%%timeit x = range(10)
u, s, vt = svds(data_matrix, k = 20)

1.27 s ± 95.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The model based approach seems to scale rather well.

# Sources
* www.salemmarafi.com/code/collaborative-filtering-with-python/
* https://medium.com/@tomar.ankur287/item-item-collaborative-filtering-recommender-system-in-python-cf3c945fae1e
* https://medium.com/@tomar.ankur287/user-user-collaborative-filtering-recommender-system-51f568489727
* https://www.datacamp.com/community/tutorials/recommender-systems-python
* https://github.com/topics/collaborative-filtering
* https://github.com/georgezoto/recommender-systems-in-python
* http://www.cs.carleton.edu/cs_comps/0607/recommend/recommender/itembased.html
* https://surprise.readthedocs.io/en/stable/getting_started.html#getting-started