Book Recommendation engine by *MAVERICK_GR*

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import random
print('Setup complete')

Setup complete


In [2]:
#Read the books dataset
df = pd.read_csv('C:\MLCourse\Books.csv')
df.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [3]:
#Check the size of the dataframe
df.shape

(271360, 8)

In [4]:
#Check for any duplicate ISBNs
df['ISBN'].duplicated().value_counts()

False    271360
Name: ISBN, dtype: int64

In [5]:
#Drop unnecessary features
df = df.drop(['ISBN','Year-Of-Publication','Image-URL-S', 'Image-URL-M', 'Image-URL-L'],axis=1)

In [6]:
#Reducing the size of the dataframe so that we dont face any memory issues later
#Randomly choosing 10% of the original data
df_new = df.sample(frac=0.1)
df_new.shape

(27136, 3)

In [7]:
#Drop any null values
df_new.isna().sum()

Book-Title     0
Book-Author    0
Publisher      0
dtype: int64

In [8]:
#Drop the existing index and create a new for the current subset of dataframe
df_new = df_new.reset_index(drop=True)
df_new['Book_id'] = np.arange(df_new.shape[0])

In [9]:
#Grab the necessary features 
def collect_features(data):
    features = []
    for i in range(0,data.shape[0]):
        features.append(str(data['Book-Title'][i])+' '+str(data['Book-Author'][i])+' '+str(data['Publisher'][i]))
    return features

In [10]:
#Create a new column with accumilated features
df_new['feature'] = collect_features(df_new)
df_new.head()


Unnamed: 0,Book-Title,Book-Author,Publisher,Book_id,feature
0,Just Annoying!,Andy Griffiths,Pan Books Ltd,0,Just Annoying! Andy Griffiths Pan Books Ltd
1,The New Low-Country Cooking: 125 Recipes for C...,Marvin Woods,Morrow Cookbooks,1,The New Low-Country Cooking: 125 Recipes for C...
2,Kiss of the Highlander,Karen Marie Moning,Dell Publishing,2,Kiss of the Highlander Karen Marie Moning Dell...
3,The New Essential Guide to Characters (Star Wars),Daniel Wallace,Del Rey Books,3,The New Essential Guide to Characters (Star Wa...
4,A Game of Chance (Zebra Regency Romance),Lynn Collum,Zebra Books,4,A Game of Chance (Zebra Regency Romance) Lynn ...


In [11]:
#Apply the countvectorizer to the feature column
cv = CountVectorizer().fit_transform(df_new['feature'])

In [12]:
#Find the cosine similarity
cs = cosine_similarity(cv)

In [13]:
#Print the cosine similarity
print(cs)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.0745356  ... 0.06454972 0.07071068 0.        ]
 [0.         0.0745356  1.         ... 0.09622504 0.10540926 0.23570226]
 ...
 [0.         0.06454972 0.09622504 ... 1.         0.09128709 0.        ]
 [0.         0.07071068 0.10540926 ... 0.09128709 1.         0.        ]
 [0.         0.         0.23570226 ... 0.         0.         1.        ]]


In [14]:
#Giving user choices to choose from 10 random books
#Randomly printing 10 different books eachtime
books = df_new['Book-Title']
for i in range(0,9):
    book = random.choice(books)
    book_id = df_new.loc[df_new['Book-Title'] == book, 'Book_id'].iloc[0]
    print(book_id, book)
    i = i +1

8672 Bad Girls of the Bible : And What We Can Learn from Them
12270 Goodbye Dear Friend: Coming to Terms With the Death of a Pet
4070 AT DEATHS DOOR
5263 Les jolies choses: Roman
14737 God Came Near
26051 Harry and Hortense at Hormone High (Charlotte Zolotow Book)
7943 The Imagination of the Heart
25177 YOU BELONG TO ME LARGE PRINT EDITION
5893 Dream Date


In [15]:
#Ask user to chose a book and enter the corresponding book-id
Book_id = input('Enter the book-ID of your choice:')

Enter the book-ID of your choice:5893


In [16]:
#Find the cosine similarity score for the corresponding book
scores = list(enumerate(cs[int(Book_id)]))

In [17]:
#Sort the scores in descending order and remove the book itself from the list
sorted_scores = sorted(scores, key = lambda x:x[1], reverse=True)
sorted_scores = sorted_scores[1:]

In [18]:
#Printing the list of 05 most recommended books which are similar to user's choice
j=0
print('The 5 most recommended books similar to your choice are:::\n')
for i in sorted_scores:
    book_title = df_new[df_new['Book_id'] == i[0]]['Book-Title'].values[0]
    print(j+1 , book_title)
    j = j + 1
    if (j>=5):
        break

The 5 most recommended books similar to your choice are:::

1 The Waitress (Point)
2 Hiawatha
3 Smith: Killers of the Dream Revised (Paper)
4 Dream Team 1996 Scrapbook
5 I Have a Dream
