In [1]:
import pandas as pd

# Load Movies Metadata
books_df = pd.read_csv('books/books.csv', low_memory=False)

In [2]:
books_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l,Summary,Language,Category,city,state,country
0,0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
2,2,11400,"ottawa, ontario, canada",49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],ottawa,ontario,canada
3,3,11676,"n/a, n/a, n/a",34.7439,2005018,8,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],,,
4,4,41385,"sudbury, ontario, canada",34.7439,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],sudbury,ontario,canada


In [3]:
# create a smaller selection since calculating is costly (and takes forever)
user_ratings = books_df[['user_id', 'rating', 'book_title']].drop_duplicates(['user_id', 'book_title'])

In [4]:
# one way to create smaller selection is by looking at average voting by a user

# count total reviews
avg_user = user_ratings.groupby(['user_id']).size().reset_index(name='counts')

# set index for later
avg_user = avg_user.set_index('user_id')

# select x-number of reviews
avg_user = avg_user[avg_user['counts'] > 50]

# user the index to create an list of ids that have more than x-number 
user_index = avg_user.index

# iloc (so look for the user id which is int) and make a smaller selection now including the review
#smaller_selection = user_ratings[user_ratings['user_id'] == users_index]

smaller_selection = user_ratings.loc[user_ratings['user_id'].isin(user_index)]

# previous steps could be easily chained together requiring less lines of code. But that does not help the explainability of the code


In [5]:
#user_item matrix
user_ratings_pivot = smaller_selection.pivot(index='user_id', columns='book_title', values='rating')
user_ratings_pivot.loc[278633].sort_values(ascending=False).head(20)

book_title
I Know This Much Is True                                                      10.0
Christmas Box (Christmas Box Trilogy)                                         10.0
Plantation: A Lowcountry Tale                                                 10.0
Sole Survivor                                                                  9.0
The Lovely Bones: A Novel                                                      9.0
Inner Hunger: A Young Woman's Struggle Through Anorexia and Bulimia            9.0
Jay's Journal                                                                  9.0
Cut                                                                            9.0
Cold Sassy Tree                                                                9.0
Tell Me Your Dreams                                                            9.0
Timepiece (Christmas Box Trilogy)                                              9.0
Summer Sisters                                                              

## Dealing with missing values
how do we deal with the NaN values?
neither droping nor filling with zero is a good idea
dropping means getting rid of almost all the data, because this is sparse matrix and the nan values are actually at the core of any RS
filling with zero also would create a rating 0 (dislike) for an item that the user has not rated
instead we center each users rating around 0 by deducting the row average and then filling the missing values by 0
this way the missing value is replaced with neutral scores. It's not a perfect solution we lose some interpretabilty, but works for now when comparing between users

In [7]:
#
avg_ratings = user_ratings_pivot.mean(axis=1)
#save for later
user_ratings_table = user_ratings_pivot
user_ratings_pivot = user_ratings_pivot.sub(avg_ratings, axis=0)
#print(user_ratings_pivot)

In [8]:
user_ratings_pivot = user_ratings_pivot.fillna(0)
user_ratings_pivot.loc[49154].sort_values(ascending=False).head(20)

book_title
Three Weeks in Paris                                                                                                                                                     8.651786
Exclusive                                                                                                                                                                8.651786
Christmas Box (Christmas Box Trilogy)                                                                                                                                    8.651786
Family Scrapbooks: Yesterday, Today, and Tomorrow                                                                                                                        8.651786
Timepiece (Christmas Box Trilogy)                                                                                                                                        8.651786
The Handmaid's Tale                                                                                

In [57]:
user_ratings_pivot.loc[278633]

book_title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    0.0
 Always Have Popsicles                                                                                        0.0
 Apple Magic (The Collector's series)                                                                         0.0
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                         0.0
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                   0.0
                                                                                                             ... 
Ã?Â?berraschung am Valentinstag.                                                                              0.0
Ã?Â?lpiraten.                                                                                                 0.0
Ã?Â?stlich der Berge.                                                        

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
cosine_similarity(user_ratings_pivot.loc[278633, :].values.reshape(1,-1), user_ratings_pivot.loc[278188, :].values.reshape(1,-1))

array([[0.00544855]])

In [12]:
#if we repeat with very different users we might get negative values
#calcualting similarity matrix for the whole table
user_similarities = cosine_similarity(user_ratings_pivot)
cosine_similarity_df = pd.DataFrame(user_similarities,index=user_ratings_pivot.index,columns=user_ratings_pivot.index)

In [13]:
cosine_similarity_df.head()

user_id,243,254,507,638,643,741,882,929,1211,1424,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,1.0,-0.000671,0.003246,-0.009924,0.0,0.0,-0.01506,0.0,0.0,0.036062,...,-0.051133,0.01837,0.009457,0.0,0.0,-0.018649,-0.007926,0.0,0.007396,-0.021889
254,-0.000671,1.0,0.044545,-0.004259,0.0,0.001746,-0.011413,0.002926,0.004008,-0.001426,...,-0.001663,-0.004668,0.0,-0.005396,0.0,0.001122,-6.4e-05,-0.004629,0.005229,0.029656
507,0.003246,0.044545,1.0,0.01064,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.007842,-0.003252,0.0,0.0,0.0,0.000362,-0.000737,0.0,0.0
638,-0.009924,-0.004259,0.01064,1.0,0.0,0.0,0.024917,-0.006664,0.010012,0.007294,...,-0.000405,-0.01284,-0.011493,0.000273,0.0,-0.002556,0.00649,0.0,0.014291,-0.007407
643,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
display(user_similarities)

array([[ 1.00000000e+00, -6.70516017e-04,  3.24583954e-03, ...,
         0.00000000e+00,  7.39603389e-03, -2.18887291e-02],
       [-6.70516017e-04,  1.00000000e+00,  4.45454429e-02, ...,
        -4.62923768e-03,  5.22887461e-03,  2.96564467e-02],
       [ 3.24583954e-03,  4.45454429e-02,  1.00000000e+00, ...,
        -7.36886750e-04,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00, -4.62923768e-03, -7.36886750e-04, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 7.39603389e-03,  5.22887461e-03,  0.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  5.60741336e-03],
       [-2.18887291e-02,  2.96564467e-02,  0.00000000e+00, ...,
         0.00000000e+00,  5.60741336e-03,  1.00000000e+00]])

In [15]:
#tiding it up in a dataframe with the row and column names as the Ids of the users
cosine_similarity_df = pd.DataFrame(user_similarities, index=user_ratings_pivot.index,columns=user_ratings_pivot.index).fillna(0)

In [16]:
cosine_similarity_df.head()

user_id,243,254,507,638,643,741,882,929,1211,1424,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,1.0,-0.000671,0.003246,-0.009924,0.0,0.0,-0.01506,0.0,0.0,0.036062,...,-0.051133,0.01837,0.009457,0.0,0.0,-0.018649,-0.007926,0.0,0.007396,-0.021889
254,-0.000671,1.0,0.044545,-0.004259,0.0,0.001746,-0.011413,0.002926,0.004008,-0.001426,...,-0.001663,-0.004668,0.0,-0.005396,0.0,0.001122,-6.4e-05,-0.004629,0.005229,0.029656
507,0.003246,0.044545,1.0,0.01064,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.007842,-0.003252,0.0,0.0,0.0,0.000362,-0.000737,0.0,0.0
638,-0.009924,-0.004259,0.01064,1.0,0.0,0.0,0.024917,-0.006664,0.010012,0.007294,...,-0.000405,-0.01284,-0.011493,0.000273,0.0,-0.002556,0.00649,0.0,0.014291,-0.007407
643,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
cosine_similarity_series = cosine_similarity_df.loc[278633]

In [18]:
display(cosine_similarity_series)

user_id
243       0.007396
254       0.005229
507       0.000000
638       0.014291
643       0.000000
            ...   
278188    0.005449
278418   -0.000554
278582    0.000000
278633    1.000000
278843    0.005607
Name: 278633, Length: 3009, dtype: float64

In [19]:
#we order it
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

In [20]:
display(ordered_similarities)

user_id
278633    1.000000
222220    0.102248
49154     0.084150
190459    0.080884
258482    0.077058
            ...   
225617   -0.066955
58612    -0.072206
240035   -0.079142
6242     -0.084357
14326    -0.099551
Name: 278633, Length: 3009, dtype: float64

## KNN: find the most similar k neighbors to the usre in question, (here say k= 3) and then averages 
the ratings those usres gave to the item we are trying to get a rating for..
this gives us a predicted rating or how a user might feel about an item they haven't seen before

In [21]:
#pick the first 3
nearest_neighbors = ordered_similarities[1:4]

In [22]:
nearest_neighbors

user_id
222220    0.102248
49154     0.084150
190459    0.080884
Name: 278633, dtype: float64

In [23]:
neighbor_ratings = user_ratings_table.reindex(nearest_neighbors.index)

In [24]:
neighbor_ratings


book_title,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",Always Have Popsicles,Apple Magic (The Collector's series),"Ask Lily (Young Women of Faith: Lily Series, Book 5)",Beyond IBM: Leadership Marketing and Finance for the 1990s,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),Dark Justice,Deceived,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",Final Fantasy Anthology: Official Strategy Guide (Brady Games),...,Ã?Â?ber den ProzeÃ?Â? der Zivilisation 2.,Ã?Â?ber die Freiheit.,Ã?Â?ber die Pflicht zum Ungehorsam gegen den Staat.,Ã?Â?berallnie. AusgewÃ?Â¤hlte Gedichte 1928 - 1965.,Ã?Â?bermorgen.,Ã?Â?berraschung am Valentinstag.,Ã?Â?lpiraten.,Ã?Â?stlich der Berge.,Ã?Â?thique en toc,Ärger mit Produkt X. Roman.
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
222220,,,,,,,,,,,...,,,,,,,,,,
49154,,,,,,,,,,,...,,,,,,,,,,
190459,,,,,,,,,,,...,,,,,,,,,,


In [25]:
neighbor_ratings_non_empty = neighbor_ratings.fillna(0)
display(neighbor_ratings_non_empty)

book_title,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",Always Have Popsicles,Apple Magic (The Collector's series),"Ask Lily (Young Women of Faith: Lily Series, Book 5)",Beyond IBM: Leadership Marketing and Finance for the 1990s,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),Dark Justice,Deceived,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",Final Fantasy Anthology: Official Strategy Guide (Brady Games),...,Ã?Â?ber den ProzeÃ?Â? der Zivilisation 2.,Ã?Â?ber die Freiheit.,Ã?Â?ber die Pflicht zum Ungehorsam gegen den Staat.,Ã?Â?berallnie. AusgewÃ?Â¤hlte Gedichte 1928 - 1965.,Ã?Â?bermorgen.,Ã?Â?berraschung am Valentinstag.,Ã?Â?lpiraten.,Ã?Â?stlich der Berge.,Ã?Â?thique en toc,Ärger mit Produkt X. Roman.
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
222220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
def get_recommendations(user,title):
    cosine_similarity_series = cosine_similarity_df.loc[user]
    #we order it
    ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
    #pick the first 3
    #KNN: find the most similar k neighbors to the usre in question, (here say k= 3) and then averages 
    #the ratings those usres gave to the item we are trying to get a rating for..
    #this gives us a predicted rating or how a user might feel about an item they haven't seen before
    nearest_neighbors = ordered_similarities[1:4]
    neighbor_ratings = user_ratings_table.reindex(nearest_neighbors.index)
    return neighbor_ratings[title].mean()

In [58]:
user_ratings_table.loc[49154,'Jewel']

nan

In [59]:
get_recommendations(49154,'Jewel')

3.7260273972602738

In [60]:
get_recommendations(278633,'Cut')

nan

In [30]:
get_recommendations(278633,'Jewel')

nan

In [31]:
user_ratings_pivot.drop("Jewel", axis=1, inplace=True)

In [32]:
target_user = 49154

In [33]:
target_user_x = user_ratings_pivot.loc[[target_user]]

In [34]:
print(target_user_x)

book_title   A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)  \
user_id                                                                                                                  
49154                                                     0.0                                                            

book_title   Always Have Popsicles   Apple Magic (The Collector's series)  \
user_id                                                                     
49154                          0.0                                    0.0   

book_title   Ask Lily (Young Women of Faith: Lily Series, Book 5)  \
user_id                                                             
49154                                                     0.0       

book_title   Beyond IBM: Leadership Marketing and Finance for the 1990s  \
user_id                                                                   
49154                                      

In [62]:
#origina raw ratings for the item we are predicting
other_users_y = user_ratings_table["Jewel"]

In [49]:
print(other_users_y)

user_id
243            NaN
254            NaN
507            NaN
638            NaN
643            NaN
            ...   
278188         NaN
278418         NaN
278582         NaN
278633    3.726027
278843         NaN
Name: Jewel, Length: 3009, dtype: float64


In [63]:
#now we only care about the users that have rated the book so we drop the rest before making the similar neiboughr list 
other_users_x = user_ratings_pivot[other_users_y.notnull()]
print(other_users_x)

book_title   A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)  \
user_id                                                                                                                  
4017                                                      0.0                                                            
6251                                                      0.0                                                            
7158                                                      0.0                                                            
8245                                                      0.0                                                            
8487                                                      0.0                                                            
...                                                       ...                                                            
269566                  

In [64]:
#similary drop the rows in the ratings that are empty
other_users_y.dropna(inplace=True)
print(other_users_y)

user_id
4017      3.397129
6251     -2.146028
7158      6.976864
8245      4.113043
8487     -0.379032
            ...   
269566   -0.828924
269719   -0.464824
271448    3.788382
273979   -1.083333
278633    3.726027
Name: Jewel, Length: 130, dtype: float64


In [40]:
from sklearn.neighbors import KNeighborsRegressor

In [65]:
user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=3)

In [66]:
#we fit it the same way we fit any model running it on our users and the raitngs for the specific book 
user_knn.fit(other_users_x, other_users_y)

KNeighborsRegressor(metric='cosine', n_neighbors=3)

In [67]:
user_user_pred = user_knn.predict(target_user_x)

In [68]:
print(user_user_pred)

[-0.88877237]


In [69]:
get_recommendations(49154,'Jewel')

3.7260273972602738