In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### 1.Load Datasets

In [2]:
books_df=pd.read_excel(r"C:\Users\SIREESHA\Desktop\br1.xlsx")
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
user_df=pd.read_excel(r"C:\Users\SIREESHA\Desktop\users.xlsx")
user_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
ratings_df=pd.read_excel(r"E:\rating.xlsx")
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


### 2.Data Preprocessing

In [5]:
books_df.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              1
Image-URL-S            1
Image-URL-M            1
Image-URL-L            1
dtype: int64

In [7]:
user_df.isnull().sum()

User-ID          0
Location         1
Age         110762
dtype: int64

In [8]:
ratings_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [14]:
# Merging data into one dataframe
book_ratings_df= ratings_df.merge(books_df, on='ISBN')
user_book_ratings_df =book_ratings_df.merge(user_df, on='User-ID')

In [15]:
# First look at data
user_book_ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
0,276746,425115801,0,Lightning,Dean R. Koontz,1996,Berkley Publishing Group,http://images.amazon.com/images/P/0425115801.0...,http://images.amazon.com/images/P/0425115801.0...,http://images.amazon.com/images/P/0425115801.0...,"fort worth, ,",
1,276746,449006522,0,Manhattan Hunt Club,JOHN SAUL,2002,Ballantine Books,http://images.amazon.com/images/P/0449006522.0...,http://images.amazon.com/images/P/0449006522.0...,http://images.amazon.com/images/P/0449006522.0...,"fort worth, ,",
2,276746,553561618,0,Dark Paradise,TAMI HOAG,1994,Bantam,http://images.amazon.com/images/P/0553561618.0...,http://images.amazon.com/images/P/0553561618.0...,http://images.amazon.com/images/P/0553561618.0...,"fort worth, ,",
3,276746,055356451X,0,Night Sins,TAMI HOAG,1995,Bantam,http://images.amazon.com/images/P/055356451X.0...,http://images.amazon.com/images/P/055356451X.0...,http://images.amazon.com/images/P/055356451X.0...,"fort worth, ,",
4,277427,425115801,0,Lightning,Dean R. Koontz,1996,Berkley Publishing Group,http://images.amazon.com/images/P/0425115801.0...,http://images.amazon.com/images/P/0425115801.0...,http://images.amazon.com/images/P/0425115801.0...,"gilbert, arizona, usa",48.0


In [21]:
# Creating a dictionary of unique Book-Titles and creating a new column
book_2_id_dict ={}
for idx, book in enumerate(user_book_ratings_df['Book-Title'].unique()):
    book_2_id_dict[book] = idx
user_book_ratings_df['Book-ID'] = user_book_ratings_df['Book-Title'].map(book_2_id_dict)

In [22]:
# Creating a dictionary of unique User-IDs and creating a new column
user_dict = {}
for idx, user in enumerate(user_book_ratings_df['User-ID'].unique()):
    user_dict[user] = idx
user_book_ratings_df['New-User-ID'] = user_book_ratings_df['User-ID'].map(user_dict)

In [23]:
#Creating Final DF to use
final_df = user_book_ratings_df[['New-User-ID', 'Book-Title', 'Book-Rating']]

In [24]:
# Final DF look
final_df = final_df.rename(columns={'New-User-ID': 'User-ID'})
final_df.head()

Unnamed: 0,User-ID,Book-Title,Book-Rating
0,0,Lightning,0
1,0,Manhattan Hunt Club,0
2,0,Dark Paradise,0
3,0,Night Sins,0
4,1,Lightning,0


### 3,Data Preparation

In [25]:
# Filtering out books with < 200 num of ratings
num_ratings = final_df.groupby('Book-Title').count()['Book-Rating'].to_frame()
num_ratings.columns = ['Num-Ratings']
num_ratings.reset_index(inplace=True)
num_ratings = num_ratings[num_ratings['Num-Ratings'] > 200]

In [27]:
# Creating a pivot table and filling na values with 0
pivot_df = merged_df.pivot(index='Book-Title', columns='User-ID',
               values='Book-Rating').fillna(0)

In [28]:
pivot_df

User-ID,1,5,6,7,8,10,11,12,13,14,...,24930,24931,24932,24933,24934,24935,24936,24937,24938,24939
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Heartbreaking Work of Staggering Genius,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Painted House,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Prayer for Owen Meany,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Time to Kill,0.0,0.0,0.0,9.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Where the Heart Is (Oprah's Book Club (Paperback)),0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
While I Was Gone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
White Oleander : A Novel,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,...,8.0,7.0,10.0,9.0,0.0,9.0,0.0,9.0,0.0,0.0
Wicked: The Life and Times of the Wicked Witch of the West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.Model Building

In [29]:
# Creating a cosine similarity martix
similarity_df = cosine_similarity(pivot_df)

In [30]:
# Initializing Nearest Neighbours model
model = NearestNeighbors(n_neighbors=6,algorithm='brute')

In [31]:
# Fitting model
model.fit(similarity_df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                 radius=1.0)

In [32]:
# Fitting model
sample_title = np.random.choice(pivot_df.index.values)
sample_title_index = pivot_df.index.values.tolist().index(sample_title)
print(f"Sample Title: {sample_title}")
print(f"Sample Title Index: {sample_title_index}")

Sample Title: A Time to Kill
Sample Title Index: 4


In [33]:
# Retrieving suggestions and distances
distance, suggestions = model.kneighbors(similarity_df[sample_title_index, :].reshape(1, -1))

In [34]:
# Function to recommend
def recommend(book_title):
    list_of_books = pivot_df.index.values.tolist()
    if book_title in list_of_books:
        title_index = list_of_books.index(book_title)
        distance, suggestions = model.kneighbors(similarity_df[title_index, :].reshape(1, -1))
        distance, suggestions = distance[0][1:].tolist(), suggestions[0][1:].tolist()
        book_names = [list_of_books[i] for i in suggestions]
        recommendation_df = pd.DataFrame({'Book-Title': book_names,
                                         'Similarity-Score': distance})

        recommendation_df['Similarity-Score'] = recommendation_df['Similarity-Score'].apply(lambda x: 2.0 - x if x > 1 else x)
        recommendation_df.sort_values('Similarity-Score', ascending=False, inplace=True)
        print("-"*50)
        print(f"Here are top 5 recommendation for the book title : {book_title}")
        print("-"*50)
        return recommendation_df
    else:
        print("ERROR: Couldn't find the Book Title in the database")
        print("-"*50)
        suggest_books = []
        print("You can try with the below titles:")
        print("-"*50)
        for i in range(5):
            print(np.random.choice(list_of_books))

In [35]:
recommend("The Girls' Guide to Hunting and Fishing")

--------------------------------------------------
Here are top 5 recommendation for the book title : The Girls' Guide to Hunting and Fishing
--------------------------------------------------


Unnamed: 0,Book-Title,Similarity-Score
0,Good in Bed,0.674752
1,Confessions of a Shopaholic (Summer Display Op...,0.673185
2,The Nanny Diaries: A Novel,0.670402
3,The Poisonwood Bible: A Novel,0.656205
4,Here on Earth,0.651041
