# Libraries and Utilities

In [2]:
import re
import nltk
import pickle
import difflib
import warnings
import numpy as np
import pandas as pd

# to ignore all the warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading and Checking the data

In [3]:
# read the preprocessed ratings dataset (path is relative to the working directory)
# and preview its first three rows
books = pd.read_csv('dataset/Preprocessed_data.csv')
books.head(n=3)

Unnamed: 0.1,Unnamed: 0,user_id,location,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l,Summary,Language,Category,city,state,country
0,0,2,"stockton, california, usa",18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
1,1,8,"timmins, ontario, canada",34.7439,2005018,5,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],timmins,ontario,canada
2,2,11400,"ottawa, ontario, canada",49.0,2005018,0,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"In a small town in Canada, Clara Callan reluct...",en,['Actresses'],ottawa,ontario,canada


# Data Preprocessing

In [4]:
# work on a copy so the raw `books` frame stays untouched
df = books.copy()

# dropping null values and useless columns
df = df.dropna().drop(columns = ['Unnamed: 0', 'user_id', 'location', 'age', 'isbn', 'year_of_publication',
                                 'img_s', 'img_m', 'img_l', 'Language', 'city', 'state', 'country'])

# dropping rows with rating 0 and category 9
df = df[(df['rating'] != 0) & (df['Category'] != '9')].copy()

# formatting the category column to keep only strings
# (raw string: '\W' in a plain literal is an invalid escape sequence)
df['Category'] = df['Category'].str.replace(r'[\W_]+', ' ', regex = True).str.strip()
df = df.reset_index(drop = True)

# cutting the dataset small by removing books having 10 or fewer reviews.
# The counts are kept as a Series: wrapping value_counts() in a DataFrame and selecting
# the 'book_title' column breaks on pandas >= 2.0, where that column is named 'count'.
rating_counts = df['book_title'].value_counts()
rare_books = rating_counts[rating_counts <= 10].index
common_books = df[~df['book_title'].isin(rare_books)]

# dropping duplicates so that one book is in one row; ratings are no longer needed after this.
# reset_index() (without drop) keeps the previous row labels in an 'index' column.
book_list = (
    common_books
    .drop_duplicates(subset = ['book_title'])
    .drop(columns = ['rating'])
    .reset_index()
)
book_list.head()

Unnamed: 0,index,book_title,book_author,publisher,Summary,Category
0,15,The Kitchen God's Wife,Amy Tan,Putnam Pub Group,A Chinese immigrant who is convinced she is dy...,Fiction
1,34,The Testament,John Grisham,Dell,"A suicidal billionaire, a burnt-out Washington...",Fiction
2,197,Beloved (Plume Contemporary Fiction),Toni Morrison,Plume,Staring unflinchingly into the abyss of slaver...,Fiction
3,274,Our Dumb Century: The Onion Presents 100 Years...,The Onion,Three Rivers Press,The staff of The Onion presents a satirical co...,Humor
4,296,Wild Animus,Rich Shapero,Too Far,"Wild animus is a search for the primordial, a ...",Fiction


# Finding similarity of the books with each other

In [5]:
# filtering the summary for common english words and removing them
# NOTE: stopwords.words() and nltk.word_tokenize() need the corresponding NLTK data
# to be available locally; this notebook does not download it.
english_stopwords = set(stopwords.words("english"))  # built once instead of once per token

filtered_summary = []
for summary in book_list['Summary']:
    # keep letters only, lowercase, tokenize, then drop the stopwords
    tokens = nltk.word_tokenize(re.sub("[^a-zA-Z]", " ", summary).lower())
    filtered_summary.append(" ".join(token for token in tokens if token not in english_stopwords))

book_list['Summary'] = filtered_summary

# combining all the features to use it as a single feature
book_list['combined_features'] = ''
combined_features = ['book_title', 'book_author', 'publisher', 'Category', 'Summary']

for feature in combined_features:
    book_list['combined_features'] += (book_list[feature] + ' ')

# rebuild the 'index' column so that it holds each book's position (0..n-1), which is the
# row/column of that book in the similarity matrix. errors='ignore' lets the cell run
# whether or not a stale 'index' column is present.
book_list = (
    book_list
    .drop(columns = ['index'], errors = 'ignore')
    .reset_index(drop = True)
    .reset_index()
)

# finding feature vectors using vectorizer
# NOTE(review): 'feature_vectors' is never populated or read in this notebook; it is kept
# only so the frame's columns stay unchanged and is a candidate for removal.
book_list['feature_vectors'] = ''
tf_vectorizer = TfidfVectorizer()
book_feature_vectors = tf_vectorizer.fit_transform(book_list['combined_features'])

# finding the cosine similarity for each book with every other book.
# cosine_similarity_matrix is a square matrix with shape (len(book_list), len(book_list))
cosine_similarity_matrix = cosine_similarity(book_feature_vectors)

# Recommender Class

In [52]:
class RecommenderClass:
    """Content-based book recommender backed by a precomputed similarity matrix.

    Row ``i`` of the matrix holds the similarity of book ``i`` to every other
    book, where ``i`` is the position of the book in the title list.
    """

    def __init__(self, cosine_similarity_matrix, book_titles=None):
        """Store the similarity matrix and, optionally, the matching titles.

        Parameters
        ----------
        cosine_similarity_matrix : indexable of rows
            ``cosine_similarity_matrix[i]`` must yield the scores of book ``i``
            against all books.
        book_titles : iterable of str, optional
            Titles in the same order as the matrix rows. When omitted, the
            titles are read from the global ``book_list['book_title']`` at
            call time (the original behaviour).
        """
        self.cosine_similarity_matrix = cosine_similarity_matrix
        self.book_titles = None if book_titles is None else list(book_titles)

    def _titles(self):
        """Return the title list, falling back to the notebook's ``book_list``."""
        if self.book_titles is not None:
            return self.book_titles
        return list(book_list['book_title'])

    def recommender(self, book_title):
        """Return all titles ordered from most to least similar to ``book_title``.

        The input is first resolved to the closest known title, which helps in
        case there is a spelling mistake. The resolved book itself normally
        comes first because it is compared with itself. An empty list is
        returned when no known title is close enough to the input.
        """
        titles = self._titles()

        # getting the closest matching book title with the given input
        matches = difflib.get_close_matches(book_title, titles, 1)
        if not matches:
            # nothing resembles the query; previously this raised a bare IndexError
            return []

        # position of the matched book == its row in the similarity matrix
        index = titles.index(matches[0])
        scores = self.cosine_similarity_matrix[index]

        # sorting the books in descending order of similarity (stable for ties)
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        # mapping the ranked positions back to titles, keeping the same order
        return [titles[position] for position, _ in ranked]

    # utility functions for using the recommendation algorithm in various ways
    def user_recommendation(self, book_title, top_n=9):
        """Return the ``top_n`` most similar titles, skipping the first (the match itself)."""
        return self.recommender(book_title)[1:1 + top_n]

    def search_recommendation(self, book_title):
        """Return the full ranked title list, including the matched book."""
        return self.recommender(book_title)

# Testing the model

In [53]:
# the matrix computed earlier in this notebook is named `cosine_similarity_matrix`;
# `similarity_matrix` is not defined anywhere, so it fails on a fresh kernel.
# The misspelled title is intentional: it exercises the fuzzy title matching.
recommenderObject = RecommenderClass(cosine_similarity_matrix)
recommenderObject.user_recommendation('harry potter and the prizoner of azkaban')

['Harry Potter and the Order of the Phoenix (Book 5)',
 'Harry Potter and the Goblet of Fire (Book 4)',
 "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
 'Harry Potter and the Chamber of Secrets (Book 2)',
 "Harry Potter and the Sorcerer's Stone (Book 1)",
 'Harry Potter and the Chamber of Secrets Postcard Book',
 'Harry Potter und der Stein der Weisen',
 'Harry Potter und die Kammer des Schreckens',
 'The Concrete Blonde (A Harry Bosch Novel)']

# Pickle and exporting the model

In [61]:
# the RecommenderClass object is pickled into a file named recommender_class.py
# NOTE(review): despite the .py extension the file holds binary pickle data. pickle stores
# a class by reference (module and qualified name), not its code, so loading it requires
# RecommenderClass to be defined/importable in the loading process.
with open('recommender_class.py', 'wb') as pickle_file:
    # the context manager closes the file even if dump() raises
    pickle.dump(RecommenderClass, pickle_file)

In [62]:
# the similarity matrix is pickled into a file named similarity_matrix.py
# (uses `cosine_similarity_matrix`, the name the matrix is actually assigned to;
# `similarity_matrix` is not defined in this notebook)
with open('similarity_matrix.py', 'wb') as pickle_file:
    # the context manager closes the file even if dump() raises
    pickle.dump(cosine_similarity_matrix, pickle_file)

In [63]:
# round-trip check: reload both pickles and query the rebuilt recommender.
# Only load pickle files you created yourself; unpickling untrusted data can execute code.
with open('similarity_matrix.py', 'rb') as matrix_file:
    loaded_matrix = pickle.load(matrix_file)
with open('recommender_class.py', 'rb') as class_file:
    LoadedRecommender = pickle.load(class_file)

loaded_recommender = LoadedRecommender(loaded_matrix)
loaded_recommender.user_recommendation('harry potter and the order of pheonix')[0]

'Harry Potter and the Prisoner of Azkaban (Book 3)'