In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

import warnings 
warnings.filterwarnings('ignore')

<h1>Data Preprocessing</h1>

<h2>Books</h2>

In [2]:
books_df = pd.read_csv('./csv/Books.csv')
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
# check columns
books_df.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [4]:
# Remove unwanted columns
books_df = books_df[['ISBN', 'Book-Title', 'Book-Author']]
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [41]:
books_df.isna().sum()

ISBN           0
Book-Title     0
Book-Author    0
dtype: int64

<h2>Ratings</h2>

In [5]:
ratings_df = pd.read_csv('./csv/Ratings.csv')
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [40]:
ratings_df.shape

(1149780, 3)

In [39]:
ratings_df['Book-Rating'].value_counts()

Book-Rating
0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: count, dtype: int64

<h2>check and remove NaN values</h2>

In [6]:
books_df.dropna(inplace=True)

In [7]:
books_df.isna().sum()

ISBN           0
Book-Title     0
Book-Author    0
dtype: int64

In [8]:
ratings_df.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [9]:
# Shape of data
books_df.shape, ratings_df.shape

((271358, 3), (1149780, 3))

<h2>sort ratings</h2>

In [42]:
# Calc ratings given by each user and store it in 'ratings' variable
ratings = ratings_df['User-ID'].value_counts()
ratings.sort_values(ascending=False).head()
ratings

User-ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

<h2>Check users with less than 200 ratings</h2>

In [11]:
len(ratings[ratings < 200])

104378

In [12]:
ratings_df['User-ID'].isin(ratings[ratings < 200].index).sum()

622224

In [13]:
# Updated/removed ratings df
rm_ratings_df = ratings_df[~ratings_df['User-ID'].isin(ratings[ratings < 200].index)]
rm_ratings_df.shape

(527556, 3)

<h2>Check books with less than 100 ratings</h2>

In [14]:
ratings = ratings_df.ISBN.value_counts()
ratings.sort_values(ascending=False).head()

ISBN
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: count, dtype: int64

In [15]:
len(ratings[ratings < 100])

339825

In [16]:
books_df.ISBN.isin(ratings[ratings < 100].index).sum()

269422

In [17]:
rm_ratings_df = rm_ratings_df[~rm_ratings_df.ISBN.isin(ratings[ratings < 100].index)]
rm_ratings_df.head(), rm_ratings_df.shape

(      User-ID        ISBN  Book-Rating
 1456   277427  002542730X           10
 1469   277427  0060930535            0
 1471   277427  0060934417            0
 1474   277427  0061009059            9
 1484   277427  0140067477            0,
 (49781, 3))

In [18]:
# checking some books
books = ["Where the Heart Is (Oprah's Book Club (Paperback))",
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True"]
for book in books:
    print(rm_ratings_df.ISBN.isin(books_df[books_df['Book-Title'] == book]['ISBN']).sum())

183
75
49
57
77


<h1>Preprocess data for ML</h1>

In [19]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [20]:
rm_ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1469,277427,0060930535,0
1471,277427,0060934417,0
1474,277427,0061009059,9
1484,277427,0140067477,0


In [21]:
df = rm_ratings_df.pivot_table(index=['User-ID'], columns=['ISBN'], values='Book-Rating').fillna(0).T
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df.index = df.join(books_df.set_index('ISBN'))['Book-Title']

In [23]:
df = df.sort_index()
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
df.to_csv('./csv/df.csv')

User-ID
254     0.0
2276    0.0
2766    0.0
2977    0.0
3363    0.0
Name: The Queen of the Damned (Vampire Chronicles (Paperback)), dtype: float64

<h1>Create Modle KNN</h1>

In [26]:
model = NearestNeighbors(metric='cosine')
model.fit(df.values)

In [27]:
df.iloc[0].shape

(888,)

In [28]:
title = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
df.loc[title].shape

(888,)

In [29]:
distance, indice = model.kneighbors([df.loc[title].values], n_neighbors=6)

print(distance, indice)

[[1.11022302e-16 5.17841186e-01 5.37633845e-01 7.34506886e-01
  7.44865700e-01 7.93983542e-01]] [[612 660 648 272 667 110]]


In [30]:
df.iloc[indice[0]].index.values

array(['The Queen of the Damned (Vampire Chronicles (Paperback))',
       'The Vampire Lestat (Vampire Chronicles, Book II)',
       'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
       'Interview with the Vampire',
       'The Witching Hour (Lives of the Mayfair Witches)', 'Catch 22'],
      dtype=object)

In [31]:
pd.DataFrame({
    'title': df.iloc[indice[0]].index.values, 
    'distance': distance[0]}
            ).sort_values(by='distance', ascending=False)

Unnamed: 0,title,distance
5,Catch 22,0.7939835
4,The Witching Hour (Lives of the Mayfair Witches),0.7448657
3,Interview with the Vampire,0.7345069
2,The Tale of the Body Thief (Vampire Chronicles...,0.5376338
1,"The Vampire Lestat (Vampire Chronicles, Book II)",0.5178412
0,The Queen of the Damned (Vampire Chronicles (P...,1.110223e-16


<h2>Create recommender method</h2>

In [32]:
def recommend(title=''):
    try:
        book = df.loc[title]
    except KeyError as e:
        print(f'The given book {e} does not exist')
        return
        
    distance, indice = model.kneighbors([book.values], n_neighbors=6)

    recommended_books = pd.DataFrame({
        'title': df.iloc[indice[0]].index.values,
        'distance': distance[0]
    }).sort_values(by='distance').head(5).values

    return [title, recommended_books]

In [33]:
books = recommend('The Queen of the Damned (Vampire Chronicles (Paperback))')
books

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 array([['The Queen of the Damned (Vampire Chronicles (Paperback))',
         1.1102230246251565e-16],
        ['The Vampire Lestat (Vampire Chronicles, Book II)',
         0.5178411864186413],
        ['The Tale of the Body Thief (Vampire Chronicles (Paperback))',
         0.5376338446489461],
        ['Interview with the Vampire', 0.7345068863988313],
        ['The Witching Hour (Lives of the Mayfair Witches)',
         0.7448657003312193]], dtype=object)]

In [34]:
recommend('Harry potter')

The given book 'Harry potter' does not exist


In [35]:
recommend('into the wild')

The given book 'into the wild' does not exist


In [36]:
recommend('Dances With Wolves')

['Dances With Wolves',
 array([['Dances With Wolves', 0.0],
        ['The Thorn Birds', 0.760686293431829],
        ['The Sum of All Fears', 0.7759316810206138],
        ['The Street Lawyer', 0.8038553023777428],
        ["I'll Be Seeing You", 0.8139950802978972]], dtype=object)]