In [1]:
while True:
  try:
    import pandas as pd
    import numpy as np
    from implicit.als import AlternatingLeastSquares
    from implicit.nearest_neighbours import bm25_weight
    from scipy.sparse import csr_matrix

    break
  except ModuleNotFoundError:
    %pip install implicit

In [2]:
df = pd.read_csv('vodclickstream_uk_movies_03.csv', encoding='utf-8', index_col=0)

In [3]:
df.head()

Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


# Duration

In [4]:
df = df[df.duration > 0]
df.head()

Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
58779,2017-01-01 19:43:06,4903.0,The Water Diviner,"Drama, History, War",2014-12-26,7165c2fc94,8e1be40e32
58781,2017-01-01 19:46:24,3845.0,Ratter,"Drama, Horror, Thriller",2016-02-12,c39aae36c3,cff8ea652a
58784,2017-01-01 20:55:46,6175.0,28 Days,"Comedy, Drama",2000-04-14,584bffaf5f,759ae2eac9


In [5]:
users, movies = df.user_id.nunique(), df.movie_id.nunique()
print('Number of users: {}, Number of movies: {}'.format(users, movies))

Number of users: 109761, Number of movies: 7634


In [6]:
grouped = df.groupby(['user_id', 'movie_id']).size().reset_index(name='count')
user_movie_matrix = grouped.pivot(index='user_id', columns='movie_id', values='count').fillna(0)

In [7]:
user_item_matrix = user_movie_matrix.values

In [8]:
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
sparse_user_item = csr_matrix(user_item_matrix)
sparse_user_item = bm25_weight(sparse_user_item, K1=100, B=0.8)
sparse_user_item = csr_matrix(sparse_user_item)

In [11]:
model = AlternatingLeastSquares(factors=100, regularization=0.01, iterations=50)

In [12]:
model.fit(sparse_user_item)

100%|██████████| 50/50 [04:09<00:00,  5.00s/it]


In [16]:
model.save('netflix_als_model')

In [23]:
movie_ids, scores = model.recommend(0, sparse_user_item[0], N=10, filter_already_liked_items=True)

# Suggested Movies

In [24]:
suggested_movies = df.iloc[movie_ids].copy()
suggested_movies['scores'] = scores
suggested_movies

Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id,scores
69538,2017-02-01 15:48:41,21896.0,Ex Machina,"Drama, Mystery, Sci-Fi",2015-04-24,eddd003dbb,9ac1efa3de,0.23793
63455,2017-01-11 16:25:11,21248.0,The Hunger Games: Catching Fire,"Action, Adventure, Mystery, Sci-Fi, Thriller",2013-11-22,16d373f59e,0587802324,0.21542
69644,2017-02-02 09:34:28,878.0,The Rebound,"Comedy, Romance",2009-09-16,fce3d2c448,ae265e8ed4,0.212309
60856,2017-01-05 20:09:24,4446.0,Killswitch,"Documentary, Drama",2014-10-18,4fca5b34a0,1658d78201,0.212091
61117,2017-01-05 18:23:54,232.0,Dance Flick,"Action, Comedy, Music",2009-05-22,58b31713f4,60455cc93f,0.175233
63547,2017-01-12 01:38:55,207.0,November Rule,Comedy,2015-02-14,336689ad43,f6a0727774,0.166588
66898,2017-01-22 13:41:10,310.0,Night at the Museum: Secret of the Tomb,"Adventure, Comedy, Family, Fantasy",2014-12-19,05cc359218,17db8bc8c7,0.151395
69465,2017-02-01 19:52:10,699.0,The Book of Life,"Animation, Adventure, Comedy, Family, Fantasy,...",2014-10-17,97183b9136,305c2599d9,0.148715
59841,2017-01-03 08:14:21,4905.0,Escorts,Documentary,2015-05-21,2dfb5d54fe,645b9904c8,0.14469
63560,2017-01-12 18:43:55,6236.0,Waffle Street,"Comedy, Drama",2016-03-15,8aa4f5e56e,dd0ff037b2,0.138185


In [25]:
# Make recommendations for the users in the dataset
userids = np.arange(10)
ids, scores = model.recommend(userids, sparse_user_item[userids], N=10, filter_already_liked_items=True)
ids, ids.shape

(array([[6898, 3048, 6963, 1386, 1561, 3100, 5273, 6856,  728, 3107],
        [5341,  867, 2590, 7194, 3222, 4714, 5305, 5741, 6092, 7498],
        [5440, 6173, 3563, 2293, 6462, 4290,  926, 1386, 2043,   96],
        [7179, 2634, 6455, 6045, 1789, 1190, 3572, 2759, 4720, 2650],
        [6084, 6963, 5810,  688,  463, 5206, 1844, 4144, 2530,  728],
        [4251,  119, 2590, 5341, 1154,  103, 7198,  259, 5471, 4177],
        [ 461,  772,  477, 5357, 1912, 5393,  728, 5233, 7614, 5014],
        [ 605, 6963, 4617, 6069, 7023, 2541, 5444, 5752, 1899, 3053],
        [3898, 4231, 6985, 6456, 5381,  368,  556, 6570, 4505, 3301],
        [4251, 1876, 1899, 1246, 2080, 6152,  766, 2026,  528, 7362]]),
 (10, 10))

In [26]:
ids, scores = model.similar_users(0, N=5, filter_users=[0])
ids, scores

(array([32268, 94732, 54987,  8532, 96678]),
 array([0.69335085, 0.6794188 , 0.6444371 , 0.5874675 , 0.5874673 ],
       dtype=float32))

In [27]:
df[df.user_id==df.iloc[0].user_id]

Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
58806,2017-01-01 18:13:17,3600.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
59748,2017-01-02 00:26:14,3816.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
60591,2017-01-04 19:01:15,7366.0,Shrek 2,"Animation, Adventure, Comedy, Family, Fantasy,...",2004-05-19,f1fccba87c,7cbcc791bf


In [28]:
df.groupby('user_id')['movie_id'].count().sort_values(ascending=False)

user_id
b15926c011    736
779343a3ea    483
89fbb087f3    278
7c53ece165    186
322f2bd4d4    179
             ... 
9783735255      1
9784da20d6      1
35ebf65066      1
97894db61d      1
a89fc145e0      1
Name: movie_id, Length: 109761, dtype: int64