# Book Recommender System 

#### by Gautam Mahto

## Import Essential Libraries

In [1]:
import numpy as np 
import pandas as pd

## Import Datasets

In [2]:
book1=pd.read_csv('C:\\Users\\GAUTAM\\Desktop\\Recommender System\\dataset\\books.csv')
book2=pd.read_csv('C:\\Users\\GAUTAM\\Desktop\\Recommender System\\dataset\\books_new.csv')

### look the shape and head of the DataFrame

In [3]:
book1.shape

(211, 5)

In [4]:
book2.shape

(211, 6)

In [5]:
book1.head()

Unnamed: 0,Title,Author,Genre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",signal_processing,228,Wiley
1,Data Smart,"Foreman, John",data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",economics,179,HarperCollins
4,Orientalism,"Said, Edward",history,197,Penguin


In [6]:
book2.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


### Here We conclude that book 1 and book 2 has same no of rows and values but book2 has one different column so for the data Preprocessing we will take book2 as a main DataFrame

In [7]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)


In [8]:
book2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     187 non-null    object
 2   Genre      211 non-null    object
 3   SubGenre   211 non-null    object
 4   Height     211 non-null    int64 
 5   Publisher  115 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.0+ KB


### we will drop that columns which are not necessary

In [9]:
book2.drop(columns=['Publisher','Height'],inplace=True)

In [10]:
book2.isnull().sum()

Title        0
Author      24
Genre        0
SubGenre     0
dtype: int64

Drop the null Values

In [11]:
book2.dropna(inplace=True)

In [12]:
book2.isnull().sum()

Title       0
Author      0
Genre       0
SubGenre    0
dtype: int64

Reset the Index

In [13]:
book2.reset_index(drop=True,inplace=True)

## here values are not in the systematic Form so we will convert into the Systematic form

In [14]:
new_lst=[]
for i in book2['Author']:
    if(',' in i):
        name = i
        parts = name.split(", ")
        new_name = "{} {}".format(parts[1], parts[0])
        new_lst.append(new_name)
    else:
        new_lst.append(i)
    

In [15]:
book2['Author']=new_lst

In [16]:
new_lst=[]
for i in book2['Title']:
    if(',' in i):
        name = i
        parts = name.split(", ")
        new_name = "{} {}".format(parts[1], parts[0])
        new_lst.append(new_name)
    else:
        new_lst.append(i)
    

In [17]:
book2['Title']=new_lst

In [18]:
book2.head()

Unnamed: 0,Title,Author,Genre,SubGenre
0,Fundamentals of Wavelets,Jaideva Goswami,tech,signal_processing
1,Data Smart,John Foreman,tech,data_science
2,God Created the Integers,Stephen Hawking,tech,mathematics
3,Superfreakonomics,Stephen Dubner,science,economics
4,Orientalism,Edward Said,nonfiction,history


In [19]:
book2

Unnamed: 0,Title,Author,Genre,SubGenre
0,Fundamentals of Wavelets,Jaideva Goswami,tech,signal_processing
1,Data Smart,John Foreman,tech,data_science
2,God Created the Integers,Stephen Hawking,tech,mathematics
3,Superfreakonomics,Stephen Dubner,science,economics
4,Orientalism,Edward Said,nonfiction,history
5,The Nature of Statistical Learning Theory,Vladimir Vapnik,tech,data_science
6,Integration of the Indian States,V P Menon,nonfiction,history
7,The Drunkard's Walk,Leonard Mlodinow,science,mathematics
8,Image Processing & Mathematical Morphology,Frank Shih,tech,signal_processing
9,How to Think Like Sherlock Holmes,Maria Konnikova,nonfiction,psychology


### Here we will Convert the string obj into the List

In [20]:
def convert(text):
    l=[text]
    return l

In [21]:
column_lst=['Author','Genre','SubGenre']

In [22]:
for i in column_lst:
    book2[i]=book2[i].apply(convert)

In [23]:
book2.head()

Unnamed: 0,Title,Author,Genre,SubGenre
0,Fundamentals of Wavelets,[Jaideva Goswami],[tech],[signal_processing]
1,Data Smart,[John Foreman],[tech],[data_science]
2,God Created the Integers,[Stephen Hawking],[tech],[mathematics]
3,Superfreakonomics,[Stephen Dubner],[science],[economics]
4,Orientalism,[Edward Said],[nonfiction],[history]


### here we will create the tag column

In [24]:
book2['Tag']=book2['Author']+book2['Genre']+book2['SubGenre']

### Here we will crearte the new dataframe with necessary  columns

In [25]:
new_df=book2.drop(columns=['Author','Genre','SubGenre'])

In [26]:
new_df['Tag'] = new_df['Tag'].apply(lambda x: " ".join(x))
new_df.head()

Unnamed: 0,Title,Tag
0,Fundamentals of Wavelets,Jaideva Goswami tech signal_processing
1,Data Smart,John Foreman tech data_science
2,God Created the Integers,Stephen Hawking tech mathematics
3,Superfreakonomics,Stephen Dubner science economics
4,Orientalism,Edward Said nonfiction history


### here we will perform CountVectorizer

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=200,stop_words='english')

In [47]:
cv.get_feature_names()



['abraham',
 'ackroyd',
 'aczel',
 'adam',
 'adolf',
 'albert',
 'aldous',
 'alex',
 'alfred',
 'allen',
 'amartya',
 'amir',
 'amitav',
 'andrew',
 'andy',
 'anthology',
 'archer',
 'arthur',
 'autobiography',
 'ayn',
 'bach',
 'baz',
 'bbc',
 'bell',
 'bertrand',
 'bob',
 'bodanis',
 'bradsky',
 'braithwaite',
 'brown',
 'camus',
 'capra',
 'carl',
 'cedric',
 'charles',
 'classic',
 'computer_science',
 'conan',
 'conway',
 'corbett',
 'cormen',
 'crichton',
 'dalrymple',
 'dan',
 'data_science',
 'david',
 'dawkins',
 'deb',
 'deshpande',
 'devlin',
 'dickens',
 'dickinson',
 'dominique',
 'dostoevsky',
 'downey',
 'doyle',
 'drew',
 'drucker',
 'dubner',
 'duda',
 'durant',
 'durrell',
 'dylan',
 'earle',
 'economics',
 'eddins',
 'edgar',
 'education',
 'edward',
 'eraly',
 'eric',
 'ernest',
 'feynman',
 'fiction',
 'fisk',
 'follett',
 'foreman',
 'forsyth',
 'frank',
 'frederick',
 'fritjof',
 'fyodor',
 'george',
 'gerald',
 'gordon',
 'hawking',
 'history',
 'jaideva',
 'jam

In [39]:
vector = cv.fit_transform(new_df['Tag']).toarray()

In [40]:
vector.shape


(187, 200)

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
similarity = cosine_similarity(vector)

In [49]:
similarity

array([[1.        , 0.28867513, 0.28867513, ..., 0.        , 0.        ,
        0.        ],
       [0.28867513, 1.        , 0.25      , ..., 0.        , 0.        ,
        0.        ],
       [0.28867513, 0.25      , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.5       ,
        0.5       ],
       [0.        , 0.        , 0.        , ..., 0.5       , 1.        ,
        0.5       ],
       [0.        , 0.        , 0.        , ..., 0.5       , 0.5       ,
        1.        ]])

In [44]:
new_df[new_df['Title'] == 'Data Smart'].index[0]

1

### here we will Predict the output

In [45]:
def recommend(book,n):
    df = pd.DataFrame(columns=['Title', 'Author'])
    index = new_df[new_df['Title'] == book].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:n]:
        a=new_df.iloc[i[0]].Title
        b=book2.iloc[i[0]].Author
        df.loc[len(df)] = [a, b]
    return df

In [46]:
recommend('Python for Data Analysis',10)

Unnamed: 0,Title,Author
0,Data Scientists at Work,[Sebastian Gutierrez]
1,Soft Computing & Intelligent Systems,[Madan Gupta]
2,Pattern Classification,[Hart Duda]
3,Neural Networks,[Simon Haykin]
4,Data Smart,[John Foreman]
5,The Nature of Statistical Learning Theory,[Vladimir Vapnik]
6,Statistical Decision Theory',[John Pratt]
7,Data Mining Handbook,[Robert Nisbet]
8,Machine Learning for Hackers,[Drew Conway]


### Import Pickle Module and Save the ML Model 

In [37]:
import pickle
