In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:
# Load the per-user movie ratings and preview the first five rows.
ratings = pd.read_csv(filepath_or_buffer='ml-latest-small/ratings.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Load the movie catalogue (movieId, title, genres) and preview the first five rows.
movies = pd.read_csv(filepath_or_buffer='ml-latest-small/movies.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Content features could include the director, genre, or production.


In [5]:
# Attach each movie's title and genres to its ratings by joining on movieId
# (inner join, the pandas default).
df = ratings.merge(right=movies, on='movieId')

In [7]:
# User-by-title rating matrix: one row per userId, one column per title.
# Cells with no rating are NaN; pivot_table's default aggregation (mean)
# collapses any repeated (userId, title) pair into a single value.
pivot = df.pivot_table(index='userId', columns='title', values='rating')

In [8]:
# Dimensions of the rating table as (number of users, number of titles).
pivot.shape

(671, 9064)

With user IDs as the index, this layout suits a user-based collaborative recommender.


In [9]:
# Transposed view: titles become the rows and user IDs the columns.
pivot.T

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",,,,,,,,,,,...,,,,,,,,,,
$9.99 (2008),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Neath the Arizona Skies (1934),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
"'burbs, The (1989)",,,,,,4.0,,,,,...,,,,,,,,,,
'night Mother (1986),,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer (2009),,,,,,,,,,,...,,,3.0,,,,,,,


In [10]:
# Number of non-null entries in each column of the merged ratings/movies frame.
df.count()

userId       100004
movieId      100004
rating       100004
timestamp    100004
title        100004
genres       100004
dtype: int64

In [11]:
# Item-based layout: transpose so titles are the rows and users the columns
# (leave the table untransposed for a user-based recommender).
# Unrated cells are filled with 0 before converting to CSR format.
pivot_sparse = sparse.csr_matrix(pivot.fillna(0).T)

In [12]:
# Printing a SciPy sparse matrix lists its stored entries as "(row, column) value".
print(pivot_sparse)

  (0, 206)	0.5
  (0, 556)	3.0
  (1, 131)	4.5
  (1, 467)	2.5
  (1, 487)	4.5
  (2, 133)	2.0
  (3, 206)	0.5
  (4, 206)	0.5
  (4, 546)	4.0
  (5, 133)	3.5
  (6, 206)	0.5
  (6, 563)	3.0
  (6, 604)	3.0
  (6, 655)	4.0
  (7, 5)	4.0
  (7, 55)	2.0
  (7, 57)	2.0
  (7, 133)	3.0
  (7, 267)	1.5
  (7, 284)	4.0
  (7, 285)	3.0
  (7, 293)	3.5
  (7, 344)	4.5
  (7, 387)	3.0
  (7, 397)	4.0
  :	:
  (9061, 284)	4.0
  (9061, 293)	3.0
  (9061, 305)	3.0
  (9061, 357)	1.0
  (9061, 379)	2.0
  (9061, 387)	4.0
  (9061, 407)	2.0
  (9061, 451)	2.0
  (9061, 456)	1.5
  (9061, 465)	3.0
  (9061, 467)	3.0
  (9061, 471)	3.0
  (9061, 508)	2.0
  (9061, 517)	4.0
  (9061, 563)	3.0
  (9061, 574)	3.0
  (9061, 579)	3.5
  (9061, 580)	3.5
  (9061, 620)	3.5
  (9061, 623)	3.0
  (9061, 637)	4.5
  (9061, 645)	4.0
  (9061, 653)	4.5
  (9062, 480)	4.5
  (9063, 269)	3.5


In [13]:
# Dimensions of the sparse matrix as (number of titles, number of users).
pivot_sparse.shape

(9064, 671)

The recommender will use cosine distance, whereas KNN typically uses Euclidean distance.

In [14]:
# Cosine distance between every pair of rows (titles) of the sparse rating matrix.
distances = pairwise_distances(X=pivot_sparse, metric='cosine')

In [15]:
# Dimensions of the pairwise distance matrix (square: one row and one column per title).
distances.shape

(9064, 9064)

In [17]:
# Label both axes of the title-to-title distance matrix with the movie titles
# so distances can be looked up by name.
distances_df = pd.DataFrame(
    data=distances,
    index=pivot.columns,
    columns=pivot.columns,
)

In [19]:
# Preview the first three rows of the labelled title-to-title distance table.
distances_df.head(3)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,1.0,1.0,0.835601,0.979609,1.0,0.985954,1.0,1.0,0.996834,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
$9.99 (2008),1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.920526,1.0,0.84367,...,1.0,1.0,1.0,1.0,1.0,0.986101,1.0,0.941782,1.0,1.0
'Hellboy': The Seeds of Creation (2004),1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.782643,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


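***Cosine distance (1 − cosine similarity), which lies between 0 and 1 for non-negative ratings: 0 means identical rating patterns, 1 means no overlap***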

In [26]:
# Ten nearest titles to the query, by ascending cosine distance.
# The query title is removed by label rather than by slicing off the first
# sorted row: other titles can tie with it at distance 0.0, and sort_values()
# does not guarantee which tied row comes first, so `[1:]` could keep the
# query itself and silently discard a genuine neighbour.
query_title = "'Hellboy': The Seeds of Creation (2004)"
distances_df[query_title].drop(query_title).sort_values().head(10)

title
'Salem's Lot (2004)       0.000000
1984 (1956)               0.000000
Wrong Turn (2003)         0.298354
Fido (2006)               0.331035
1941 (1979)               0.352850
Changeling, The (1980)    0.360398
Temple Grandin (2010)     0.446628
Wild at Heart (1990)      0.476000
1408 (2007)               0.504926
Descent, The (2005)       0.521087
Name: 'Hellboy': The Seeds of Creation (2004), dtype: float64

## User-based recommender

A pivot needs three things: rows (the index), columns, and values.

In [27]:
# User-based layout: keep users as rows (no transpose), fill unrated cells
# with 0, and convert to CSR format.
user_sparse_pivot = sparse.csr_matrix(pivot.fillna(value=0))

# Cosine distance between every pair of users.
user_distances = pairwise_distances(X=user_sparse_pivot, metric='cosine')

# Display the matrix dimensions (one row and one column per user).
user_distances.shape



(671, 671)

In [29]:
# Label both axes of the user-to-user distance matrix with the user IDs.
user_df = pd.DataFrame(
    data=user_distances,
    index=pivot.index,
    columns=pivot.index,
)

In [30]:
# Preview the first five rows of the user-to-user distance table.
user_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,1.0,0.925518,0.983182,1.0,0.916116,1.0,0.987157,1.0,...,1.0,1.0,0.985519,0.956281,1.0,1.0,1.0,0.937083,1.0,0.982534
2,1.0,0.0,0.875705,0.881179,0.896354,1.0,0.787015,0.88681,0.886667,0.956787,...,0.522694,0.936798,0.922216,0.835838,0.533719,0.574538,0.915354,0.97586,0.829405,0.886825
3,1.0,0.875705,0.0,0.91836,0.848469,0.939309,0.845286,0.750219,0.865525,0.885328,...,0.838795,0.935802,0.823778,0.841643,0.822902,0.875438,0.875089,0.919016,0.863394,0.829807
4,0.925518,0.881179,0.91836,0.0,0.869351,0.920352,0.680255,0.808987,0.969583,0.862814,...,0.885681,0.952772,0.863353,0.74597,0.878095,0.911265,0.931517,0.895691,0.945488,0.788391
5,0.983182,0.896354,0.848469,0.869351,0.0,0.936204,0.904112,0.834288,0.913384,0.96763,...,0.808971,0.978858,0.853754,0.775755,0.860279,0.941748,0.957074,0.961642,0.937358,0.774914
