Let's build a recommendation system that uses movie **genres** to recommend new movies. The approach has three steps:
1. We build a profile for each movie.
2. We build a profile for each user.
3. We compare the two profiles to come up with a recommendation.

In [1]:
import pandas as pd

In [3]:
# Load the movie catalogue; the preview below shows movieId, title and a '|'-separated genres string.
movies = pd.read_csv('../ml-latest-small/movies.csv')

In [4]:
# Preview the first rows of the movie catalogue.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [31]:
from typing import Dict, Iterable, List

def str_series_to_dict_list(series: Iterable[str]) -> List[Dict[str, int]]:
    """Turn '|'-separated genre strings into one ``{genre: 1}`` dict per entry.

    Parameters
    ----------
    series : iterable of str
        Genre strings such as ``'Comedy|Romance'`` (for example a pandas Series).

    Returns
    -------
    list of dict
        One dict per input entry, in input order, mapping each genre found in
        that entry to 1. This is the record format ``DictVectorizer`` consumes.
    """
    return [{genre: 1 for genre in entry.split('|')} for entry in series]

In [42]:
# Genre encoder: split each 'A|B|C' string into a {genre: 1} dict, then vectorize the dicts.
genre_steps = [
    ('genre-dict', FunctionTransformer(str_series_to_dict_list)),
    ('dict', DictVectorizer()),
]
ct = ColumnTransformer([('genres', Pipeline(genre_steps), 'genres')])

In [44]:
# Fit the encoder on the movie genres and show the resulting matrix.
# This call is what fits `ct`; the later `ct.transform(...)` calls rely on it.
ct.fit_transform(movies[['genres']])

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [100]:
# Load the ratings (userId, movieId, rating, timestamp) and display them.
ratings = pd.read_csv('../ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [107]:
# Centre each user's ratings on that user's own mean, so positive values mean
# "liked more than this user's average". Selecting the 'rating' column before
# transforming avoids running the lambda over movieId and timestamp as well.
# NOTE: this overwrites ratings['rating'] in place; later cells see centred values.
ratings['rating'] = ratings.groupby('userId')['rating'].transform(lambda r: r - r.mean())

In [108]:
# Attach each rated movie's genres to its rating, then keep only userId, rating and genres.
unused_columns = ['movieId', 'timestamp', 'title']
user_ratings = ratings.merge(movies, on="movieId").drop(columns=unused_columns)

In [109]:
# Encode the genres of every rated movie with the already-fitted encoder (one row per rating).
X = ct.transform(user_ratings[['genres']])

In [110]:
# Show the shape and sparsity summary of the encoded ratings matrix.
X

<100836x20 sparse matrix of type '<class 'numpy.float64'>'
	with 274480 stored elements in Compressed Sparse Row format>

In [177]:
# Genre labels taken from the fitted DictVectorizer step; used below as DataFrame column names.
genre_names = ct.named_transformers_['genres'].named_steps['dict'].feature_names_

In [111]:
# Inspect the merged frame: one row per rating with its centred rating and genre string.
user_ratings

Unnamed: 0,userId,rating,genres
0,1,-0.366379,Adventure|Animation|Children|Comedy|Fantasy
1,5,0.363636,Adventure|Animation|Children|Comedy|Fantasy
2,7,1.269737,Adventure|Animation|Children|Comedy|Fantasy
3,15,-0.948148,Adventure|Animation|Children|Comedy|Fantasy
4,17,0.290476,Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...
100831,610,-1.188556,Action|Thriller
100832,610,0.811444,Action|Crime|Drama
100833,610,-0.688556,Action|Drama|Thriller
100834,610,-0.188556,Horror|Thriller


In [178]:
# Weight each rating's genre indicator row by its centred rating, then total per user.
rating_column = user_ratings.rating.values.reshape(-1, 1)
weighted_genres = pd.DataFrame(X.toarray() * rating_column, columns=genre_names)
score_sum_by_user = (
    pd.concat([user_ratings[['userId']], weighted_genres], axis=1)
    .groupby('userId')
    .sum()
    .reset_index()
)
score_sum_by_user

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,-3.974138,1.857759,9.375000,7.612069,-7.409483,-0.487069,0.000000,11.086207,...,0.633621,-15.228448,0.000000,6.939655,-3.594828,-1.525862,-5.655172,-12.150862,2.939655,-0.564655
1,2,0.0,0.068966,0.655172,0.000000,0.000000,0.362069,-1.482759,1.155172,-1.120690,...,0.000000,-0.948276,-0.793103,0.000000,0.103448,0.551724,-0.293103,-2.482759,0.551724,-0.448276
2,3,0.0,15.897436,3.205128,-7.743590,-9.679487,-12.923077,-3.871795,0.000000,-26.974359,...,0.000000,18.012821,0.000000,-1.935897,2.564103,-9.679487,26.461538,11.948718,-9.679487,0.000000
3,4,0.0,-5.888889,2.888889,2.666667,2.444444,-4.777778,7.000000,0.888889,-8.666667,...,1.777778,2.777778,-0.555556,7.111111,-1.777778,-10.222222,-8.666667,-0.111111,0.111111,2.444444
4,5,0.0,-4.727273,-3.090909,4.181818,4.272727,-2.545455,2.363636,0.000000,4.090909,...,0.000000,-0.636364,0.090909,3.818182,0.363636,-6.000000,-2.272727,-0.727273,-0.909091,-1.272727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,0.0,-72.267265,-22.637668,2.389238,-10.212556,-38.765022,-0.434081,0.713004,91.135426,...,1.240807,-16.184753,-9.518386,3.074439,12.176682,29.623318,-7.934529,-26.322422,8.769058,-4.175785
606,607,0.0,-4.598930,-14.374332,-2.716578,-6.935829,-25.235294,0.775401,0.000000,18.540107,...,0.000000,11.486631,1.213904,-0.930481,14.636364,-7.796791,-19.299465,20.048128,2.283422,0.427807
607,608,0.0,54.333333,15.714200,-0.879663,-59.307461,-141.132371,69.910349,-0.805054,84.930806,...,2.463297,17.984958,10.389892,-12.427798,28.741877,-26.222623,27.092659,104.248496,8.450662,-5.475933
608,609,0.0,-1.972973,-0.702703,-0.270270,-0.540541,0.108108,1.378378,-0.540541,1.864865,...,0.000000,0.459459,-0.270270,0.000000,0.000000,-0.351351,-1.351351,0.216216,0.918919,0.729730


In [179]:
# Number of rated movies per genre for every user: total the genre indicator rows per userId.
genre_indicators = pd.DataFrame(X.toarray(), columns=genre_names)
count_by_user = (
    pd.concat([user_ratings[['userId']], genre_indicators], axis=1)
    .groupby('userId')
    .sum()
    .reset_index()
)
count_by_user

Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,90.0,85.0,29.0,42.0,83.0,45.0,0.0,68.0,...,1.0,17.0,0.0,22.0,18.0,26.0,40.0,55.0,22.0,7.0
1,2,0.0,11.0,3.0,0.0,0.0,7.0,10.0,3.0,17.0,...,0.0,1.0,4.0,0.0,2.0,1.0,4.0,10.0,1.0,1.0
2,3,0.0,14.0,11.0,4.0,5.0,9.0,2.0,0.0,16.0,...,0.0,8.0,0.0,1.0,1.0,5.0,15.0,7.0,5.0,0.0
3,4,0.0,25.0,29.0,6.0,10.0,104.0,27.0,2.0,120.0,...,4.0,4.0,1.0,16.0,23.0,58.0,12.0,38.0,7.0,10.0
4,5,0.0,9.0,8.0,6.0,9.0,15.0,12.0,0.0,25.0,...,0.0,1.0,3.0,5.0,1.0,11.0,2.0,9.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,0.0,151.0,147.0,42.0,49.0,421.0,133.0,5.0,698.0,...,8.0,52.0,16.0,44.0,91.0,355.0,79.0,199.0,65.0,17.0
606,607,0.0,72.0,45.0,6.0,19.0,55.0,27.0,0.0,82.0,...,0.0,35.0,1.0,5.0,17.0,29.0,36.0,61.0,6.0,2.0
607,608,0.0,277.0,181.0,55.0,88.0,355.0,146.0,6.0,280.0,...,4.0,97.0,12.0,33.0,69.0,106.0,167.0,259.0,19.0,11.0
608,609,0.0,11.0,10.0,1.0,2.0,7.0,6.0,2.0,19.0,...,0.0,2.0,1.0,0.0,0.0,5.0,5.0,14.0,4.0,1.0


In [119]:
# Mean centred rating per genre for every user (rating sum / rating count).
# Genres a user never rated have count 0 and sum 0, so the division yields NaN there.
score_sum_by_user.drop('userId',axis=1).values / count_by_user.drop('userId',axis=1).values

  """Entry point for launching an IPython kernel.


array([[        nan, -0.04415709,  0.02185598, ..., -0.22092476,
         0.13362069, -0.08066502],
       [        nan,  0.00626959,  0.2183908 , ..., -0.24827586,
         0.55172414, -0.44827586],
       [        nan,  1.13553114,  0.29137529, ...,  1.70695971,
        -1.93589744,         nan],
       ...,
       [        nan,  0.19614922,  0.08681878, ...,  0.40250384,
         0.44477168, -0.49781206],
       [        nan, -0.17936118, -0.07027027, ...,  0.01544402,
         0.22972973,  0.72972973],
       [        nan, -0.0879758 ,  0.01743644, ..., -0.11502666,
         0.08803968,  0.05386817]])

In [180]:
# User profile = mean centred rating per genre; genres the user never rated (0/0 -> NaN) become 0.
score_sums = score_sum_by_user.drop('userId', axis=1).values
rating_counts = count_by_user.drop('userId', axis=1).values
mean_scores = pd.DataFrame(score_sums / rating_counts, columns=genre_names)
user_profile = pd.concat([count_by_user[['userId']], mean_scores], axis=1).fillna(0)
user_profile

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,-0.044157,0.021856,0.323276,0.181240,-0.089271,-0.010824,0.000000,0.163032,...,0.633621,-0.895791,0.000000,0.315439,-0.199713,-0.058687,-0.141379,-0.220925,0.133621,-0.080665
1,2,0.0,0.006270,0.218391,0.000000,0.000000,0.051724,-0.148276,0.385057,-0.065923,...,0.000000,-0.948276,-0.198276,0.000000,0.051724,0.551724,-0.073276,-0.248276,0.551724,-0.448276
2,3,0.0,1.135531,0.291375,-1.935897,-1.935897,-1.435897,-1.935897,0.000000,-1.685897,...,0.000000,2.251603,0.000000,-1.935897,2.564103,-1.935897,1.764103,1.706960,-1.935897,0.000000
3,4,0.0,-0.235556,0.099617,0.444444,0.244444,-0.045940,0.259259,0.444444,-0.072222,...,0.444444,0.694444,-0.555556,0.444444,-0.077295,-0.176245,-0.722222,-0.002924,0.015873,0.244444
4,5,0.0,-0.525253,-0.386364,0.696970,0.474747,-0.169697,0.196970,0.000000,0.163636,...,0.000000,-0.636364,0.030303,0.763636,0.363636,-0.545455,-1.136364,-0.080808,-0.303030,-0.636364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,0.0,-0.478591,-0.153998,0.056887,-0.208420,-0.092078,-0.003264,0.142601,0.130567,...,0.155101,-0.311245,-0.594899,0.069874,0.133810,0.083446,-0.100437,-0.132273,0.134909,-0.245634
606,607,0.0,-0.063874,-0.319430,-0.452763,-0.365044,-0.458824,0.028719,0.000000,0.226099,...,0.000000,0.328189,1.213904,-0.186096,0.860963,-0.268855,-0.536096,0.328658,0.380570,0.213904
607,608,0.0,0.196149,0.086819,-0.015994,-0.673948,-0.397556,0.478838,-0.134176,0.303324,...,0.615824,0.185412,0.865824,-0.376600,0.416549,-0.247383,0.162231,0.402504,0.444772,-0.497812
608,609,0.0,-0.179361,-0.070270,-0.270270,-0.270270,0.015444,0.229730,-0.270270,0.098151,...,0.000000,0.229730,-0.270270,0.000000,0.000000,-0.070270,-0.270270,0.015444,0.229730,0.729730


In [127]:
# Item profile: one genre indicator row per movie, in the row order of `movies`.
# Use transform (not fit_transform) so the columns come from the same fitted encoder
# that produced `genre_names` and the user profiles, instead of refitting it here.
item_profile = ct.transform(movies[['genres']]).toarray()

In [128]:
# Inspect the dense movie-by-genre indicator matrix.
item_profile

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [129]:
from scipy.spatial.distance import cosine

In [166]:
%%time
import numpy as np
from tqdm import trange

dist = np.zeros(len(movies))

for i in trange(len(movies)):
    dist[i] = cosine(user_profile[user_profile.userId ==3].drop('userId',axis=1).values, item_profile[i,:])
    
    

100%|██████████| 9742/9742 [00:10<00:00, 908.44it/s] 

CPU times: user 10.6 s, sys: 415 ms, total: 11 s
Wall time: 10.7 s





In [167]:
# argsort is ascending, so the last 10 positions are the movies with the LARGEST cosine
# distance, i.e. the least similar to the user profile (the best matches are at the front).
dist.argsort()[-10:]

array([7308, 5160,  786, 1369, 8983, 6230, 1390, 1545,   44,  618])

In [181]:
# Ten movies with the smallest cosine distance to the user's profile (closest genre match).
# argsort returns row positions, so index positionally with .iloc; .loc only gave the
# same rows because `movies` happens to have a default 0..n-1 index.
# NOTE: movies the user has already rated are not filtered out of this list.
movies.iloc[dist.argsort()[:10]]

Unnamed: 0,movieId,title,genres
5802,31804,Night Watch (Nochnoy dozor) (2004),Action|Fantasy|Horror|Mystery|Sci-Fi|Thriller
9689,184253,The Cloverfield Paradox (2018),Horror|Mystery|Sci-Fi|Thriller
1662,2232,Cube (1997),Horror|Mystery|Sci-Fi|Thriller
4690,7001,Invasion of the Body Snatchers (1978),Horror|Mystery|Sci-Fi|Thriller
7712,90345,"Thing, The (2011)",Horror|Mystery|Sci-Fi|Thriller
5826,32213,Cube Zero (2004),Horror|Mystery|Sci-Fi|Thriller
5980,36509,"Cave, The (2005)",Action|Adventure|Horror|Mystery|Sci-Fi|Thriller
4045,5746,Galaxy of Terror (Quest) (1981),Action|Horror|Mystery|Sci-Fi
2354,3113,End of Days (1999),Action|Fantasy|Horror|Mystery|Thriller
6034,39400,"Fog, The (2005)",Action|Horror|Mystery|Thriller


In [169]:
# Movies user 3 actually rated, highest rating first, to sanity-check the recommendations.
rated_by_user_3 = ratings.loc[ratings.userId == 3]
(
    movies
    .merge(rated_by_user_3, on="movieId")
    .sort_values('rating', ascending=False)
    .head(100)
)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
28,5181,Hangar 18 (1980),Action|Sci-Fi|Thriller,3,2.564103,1306463718
37,70946,Troll 2 (1990),Fantasy|Horror,3,2.564103,1306463815
35,7991,Death Race 2000 (1975),Action|Sci-Fi,3,2.564103,1306463684
5,849,Escape from L.A. (1996),Action|Adventure|Sci-Fi|Thriller,3,2.564103,1306463611
33,6835,Alien Contamination (1980),Action|Horror|Sci-Fi,3,2.564103,1306463670
21,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller,3,2.564103,1306463925
31,5919,Android (1982),Sci-Fi,3,2.564103,1306463825
24,3703,"Road Warrior, The (Mad Max 2) (1981)",Action|Adventure|Sci-Fi|Thriller,3,2.564103,1306463603
29,5746,Galaxy of Terror (Quest) (1981),Action|Horror|Mystery|Sci-Fi,3,2.564103,1306463708
26,4518,The Lair of the White Worm (1988),Comedy|Horror,3,2.564103,1306463770
