In [2]:
import pandas as pd


df_books = pd.read_csv(
    "Books.csv",
    usecols=['ISBN', 'Book_Title', 'Book_Author'],
    dtype={'ISBN': 'str', 'Book_Title': 'str', 'Book_Author': 'str'})

In [53]:
df_books

Unnamed: 0,ISBN,Book_Title,Book_Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271355,440400988,There's a Bat in Bunk Five,Paula Danziger
271356,525447644,From One to One Hundred,Teri Sloat
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271358,192126040,Republic (World's Classics),Plato


In [3]:
df_ratings=pd.read_csv("Ratings.csv",
                      dtype={'User-ID': 'int32', 'ISBN': 'str', 'BookRating': 'float32'})

df_ratings

Unnamed: 0,User-ID,ISBN,Book_Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6
...,...,...,...
1048570,250764,451410777,0
1048571,250764,452264464,8
1048572,250764,048623715X,0
1048573,250764,486256588,0


In [4]:
df_books.head()

Unnamed: 0,ISBN,Book_Title,Book_Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book_Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [6]:
df_books.isnull().sum()

ISBN           0
Book_Title     0
Book_Author    1
dtype: int64

In [7]:
df_ratings.isnull().sum()

User-ID        0
ISBN           0
Book_Rating    0
dtype: int64

In [8]:
df_books.dropna(inplace=True)

In [9]:
df_books.isnull().sum()

ISBN           0
Book_Title     0
Book_Author    0
dtype: int64

In [10]:
df_ratings.shape

(1048575, 3)

In [11]:
ratings=df_ratings['User-ID'].value_counts()

In [12]:
ratings.sort_values(ascending=False).head()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
Name: User-ID, dtype: int64

In [13]:
len(ratings[ratings<200])

94693

In [14]:
df_ratings['User-ID'].isin(ratings[ratings<200].index).sum()

564847

In [15]:
df_ratings_rm = df_ratings[
  ~df_ratings['User-ID'].isin(ratings[ratings < 200].index)
]
df_ratings_rm.shape

(483728, 3)

## Remove books with less than 100 ratings

In [16]:
ratings = df_ratings['ISBN'].value_counts() # we have to use the original df_ratings to pass the challenge
ratings.sort_values(ascending=False).head()

971880107    2264
316666343    1164
385504209     813
312195516     668
60928336      662
Name: ISBN, dtype: int64

In [17]:
len(ratings[ratings < 100])

321846

In [18]:
df_books['ISBN'].isin(ratings[ratings < 100].index).sum()

257190

In [19]:
df_ratings_rm = df_ratings_rm[
  ~df_ratings_rm['ISBN'].isin(ratings[ratings < 100].index)
]
df_ratings_rm.shape

(41225, 3)

In [20]:
# These should exist
books = ["Where the Heart Is (Oprah's Book Club (Paperback))",
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True"]

for book in books:
  print(df_ratings_rm.ISBN.isin(df_books[df_books.Book_Title == book].ISBN).sum())

165
66
44
51
70


## Prepare Dataset For KNN

In [21]:
df = df_ratings_rm.pivot_table(index=['User-ID'],columns=['ISBN'],values='Book_Rating').fillna(0).T
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,249628,249862,249894,250184,250405,250764,277427,277478,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006099486X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006101351X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
014023313X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df.index = df.join(df_books.set_index('ISBN'))['Book_Title']

In [23]:
df = df.sort_index()
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,249628,249862,249894,250184,250405,250764,277427,277478,277639,278418
Book_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
df.loc["The Queen of the Damned (Vampire Chronicles (Paperback))"][:5]

User-ID
254     0.0
2276    0.0
2766    0.0
2977    0.0
3363    0.0
Name: The Queen of the Damned (Vampire Chronicles (Paperback)), dtype: float64

## Build Model

In [25]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric='cosine')
model.fit(df.values)

NearestNeighbors(metric='cosine')

In [26]:
df.iloc[0].shape

(804,)

In [27]:
title = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
df.loc[title].shape

(804,)

In [28]:
distance, indice = model.kneighbors([df.loc[title].values], n_neighbors=6)

print(distance)
print(indice)

[[0.         0.50632715 0.59162423 0.6914091  0.76277579 0.76747403]]
[[529 566 557 237 573 470]]


In [29]:
df.iloc[indice[0]].index.values

array(['The Queen of the Damned (Vampire Chronicles (Paperback))',
       'The Vampire Lestat (Vampire Chronicles, Book II)',
       'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
       'Interview with the Vampire',
       'The Witching Hour (Lives of the Mayfair Witches)',
       'The Gunslinger (The Dark Tower, Book 1)'], dtype=object)

In [30]:
pd.DataFrame({
    'title'   : df.iloc[indice[0]].index.values,
    'distance': distance[0]
}) \
.sort_values(by='distance', ascending=False)

Unnamed: 0,title,distance
5,"The Gunslinger (The Dark Tower, Book 1)",0.767474
4,The Witching Hour (Lives of the Mayfair Witches),0.762776
3,Interview with the Vampire,0.691409
2,The Tale of the Body Thief (Vampire Chronicles...,0.591624
1,"The Vampire Lestat (Vampire Chronicles, Book II)",0.506327
0,The Queen of the Damned (Vampire Chronicles (P...,0.0


In [53]:
# function to return recommended books - this will be tested
def get_recommends(title = ""):
  try:
    book = df.loc[title]
  except KeyError as e:
    print('The given book', e, 'does not exist')
    return

  distance, indice = model.kneighbors([book.values], n_neighbors=7)

  recommended_books = pd.DataFrame({
      'title'   : df.iloc[indice[0]].index.values,
      'distance': distance[0]
    }) \
    .sort_values(by='distance', ascending=False) \
    .head(5).values

  return [recommended_books]

In [67]:
rec=get_recommends("The Vampire Lestat (Vampire Chronicles, Book II)")
rec

[array([['The Gunslinger (The Dark Tower, Book 1)', 0.7372387572544652],
        ['The Weight of Water : A Novel Tag: Author of Resistance and Strange Fits of Passion',
         0.7320593249978466],
        ['The Witching Hour (Lives of the Mayfair Witches)',
         0.6847876834340636],
        ['Interview with the Vampire', 0.5883247297704758],
        ['The Queen of the Damned (Vampire Chronicles (Paperback))',
         0.5063271472931488]], dtype=object)]

In [77]:
str(rec[0][0][0])

'The Gunslinger (The Dark Tower, Book 1)'

## 

In [78]:
rec[0][1]

array(['The Weight of Water : A Novel Tag: Author of Resistance and Strange Fits of Passion',
       0.7320593249978466], dtype=object)

In [73]:
rec[0][2]

'The Witching Hour (Lives of the Mayfair Witches)'

In [74]:
rec[0][3][0]

'Interview with the Vampire'

In [75]:
rec[0][4][0]

'The Queen of the Damned (Vampire Chronicles (Paperback))'