In [None]:
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install sklearn


In [None]:
!pip3 install pandas

Dataset

In [None]:
# Load the three CSV files of the book-recommendation dataset.
# Paths are relative to the notebook's working directory (folder BRS-Dataset/).
books = pd.read_csv("BRS-Dataset/Books.csv") ## 73.29 MB Dataset size
users = pd.read_csv("BRS-Dataset/Users.csv") ## 22.63 MB Dataset size
ratings = pd.read_csv("BRS-Dataset/Book-Ratings.csv") ## 11.02 MB Dataset size

In [None]:
## Total size of this Data set is 106.94 MB

print("Books Data:    ", books.shape)
print("Users Data:    ", users.shape)
print("Books-ratings: ", ratings.shape)

### Processing

### Books Dataset Processing

In [None]:
## Printing the header of the column names
print("Columns: ", list(books.columns))


In [None]:
# Exploring books dataset
books

In [None]:

# Rename the column names: the medium image URL becomes "Cover_Page" and the
# small image URL becomes "Book-URL". `rename` returns a new frame, so `books`
# itself is left unchanged.
new_books = books.rename(columns={"Image-URL-M":"Cover_Page", "Image-URL-S":"Book-URL"})

new_books.head()

In [None]:
## Drop the large-image URL column
new_books = new_books.drop(columns=['Image-URL-L'])
new_books.head()

In [None]:
## Making Book-URL Clickable and show image for Coverpage

def make_clickable(val):
    """Return an HTML anchor labelled "Amazon" that opens `val` in a new tab."""
    link_template = '<a target="_blank" href="{}">Amazon</a>'
    return link_template.format(val)

def show_image(val):
    """Return HTML showing `val` as a 50px-wide image that links to itself."""
    thumbnail_template = '<a href="{}"><img src="{}" width=50></img></a>'
    return thumbnail_template.format(val, val)


# Show the first 10 books with a clickable link and an inline cover thumbnail.
# The former mask `~new_books["ISBN"].isin(new_books)` tested each ISBN against the
# frame's column labels (iterating a DataFrame yields its columns), so it never
# excluded any row; it is removed because it only obscured the intent.
new_books.head(10).style.format({'Book-URL': make_clickable, 'Cover_Page': show_image})


In [None]:
## Checking the datatypes of the column 
new_books.info()

In [None]:
# What is the shape of this DataFrame? How many columns does each row have? How many rows are there?
new_books.info()

In [None]:
## Checking for null values
new_books.isnull().sum() 

In [None]:
## Drop the two remaining URL columns now that the styled preview has been shown
new_books = new_books.drop(columns=['Book-URL', 'Cover_Page'])
new_books.head()

In [None]:
## Checking the location for null values for Book-Author
new_books.loc[new_books['Book-Author'].isnull(),:]

In [None]:
## Checking the location for null values for Publisher
new_books.loc[new_books['Publisher'].isnull(),:]

In [None]:
# Replace missing authors/publishers with the placeholder 'Other'.
# Filling by null-mask instead of by hard-coded row labels keeps this correct when
# the CSV contains missing values at other positions than the ones looked up above.
new_books['Book-Author'] = new_books['Book-Author'].fillna('Other')
new_books['Publisher'] = new_books['Publisher'].fillna('Other')

In [None]:
# Checking for column Year-of-publication
new_books['Year-Of-Publication'].unique()

In [None]:
# Show full cell contents without truncation. `None` is the supported "no limit"
# value; the legacy `-1` was deprecated and is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
## Checking the location  for DK Publishing Inc
#investigating the rows having 'DK Publishing Inc' as yearOfPublication

new_books.loc[new_books['Year-Of-Publication'] == 'DK Publishing Inc',:]

In [None]:
new_books.loc[new_books['Year-Of-Publication'] == 'Gallimard',:]

In [None]:
# Repair the three rows whose 'Year-Of-Publication' cell holds a publisher name
# (the rows found by the two look-ups above): set publisher, year, title and author.
# Each title/author pair is written to its own row label. Previously all three pairs
# were written to row 209538, overwriting one another and leaving rows 221678 and
# 220731 with their original title/author.
new_books.at[209538, 'Publisher'] = 'DK Publishing Inc'
new_books.at[209538, 'Year-Of-Publication'] = 2000
new_books.at[209538, 'Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
new_books.at[209538, 'Book-Author'] = 'Michael Teitelbaum'

new_books.at[221678, 'Publisher'] = 'DK Publishing Inc'
new_books.at[221678, 'Year-Of-Publication'] = 2000
new_books.at[221678, 'Book-Title'] = 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'
new_books.at[221678, 'Book-Author'] = 'James Buckley'

new_books.at[220731, 'Publisher'] = 'Gallimard'
# Stored as an int like the other two repaired rows (was the string '2003')
new_books.at[220731, 'Year-Of-Publication'] = 2003
new_books.at[220731, 'Book-Title'] = 'Peuple du ciel - Suivi de Les bergers '
# NOTE(review): the author string below looks mis-encoded; it is kept unchanged
# here in case it is meant to match how the name appears elsewhere in the CSV — confirm.
new_books.at[220731, 'Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

In [None]:
#Correcting the dtypes of yearOfPublication
new_books['Year-Of-Publication'] =new_books['Year-Of-Publication'].astype(int)

In [None]:
#Now it can be seen that yearOfPublication has all values as integers

print(sorted(list(new_books['Year-Of-Publication'].unique())))

In [None]:
## Find the most frequent publication year(s)
count = Counter(new_books['Year-Of-Publication'])
highest_frequency = max(count.values(), default=None)
[year for year, frequency in count.items() if frequency == highest_frequency]

In [None]:
# Years later than 2021 or equal to 0 are overwritten with 2002
# (presumably the most frequent year reported by the previous cell — confirm).
publication_year = new_books['Year-Of-Publication']
invalid_year = (publication_year > 2021) | (publication_year == 0)
new_books.loc[invalid_year, 'Year-Of-Publication'] = 2002

In [None]:
## Uppercasing all alphabets in ISBN
new_books['ISBN'] = new_books['ISBN'].str.upper()

In [None]:
## Drop duplicate rows (keeping the last occurrence) and renumber the index
new_books = new_books.drop_duplicates(keep='last').reset_index(drop=True)

In [None]:
new_books.info()

In [None]:
new_books.head()


### User Dataset Processing

In [None]:
## Checking the columns for User Dataset
print("Columns: ", list(users.columns))
users.head()

In [None]:
users.dtypes

In [None]:
## Checking null values
print(users.isna().sum())

In [None]:
## Check for all values present in Age column
#Age column has some invalid entries like nan, 0 and very high values like 100 and above
print(sorted(list(users['Age'].unique())))

In [None]:

# Keep only rows whose age lies in the inclusive range 5..90 (missing ages are excluded)
required = users[users['Age'].between(5, 90)]

In [None]:

mean = round(required['Age'].mean())   
mean

In [None]:
# Ages above 80 or below 10 are treated as outliers and replaced by `mean`;
# missing ages are filled with `mean` too, then the column is cast to int.
# NOTE(review): these cut-offs (10 / 80) differ from the 5..90 range used to build
# `required`, from which `mean` was computed — confirm this is intended.
age_outlier = (users['Age'] > 80) | (users['Age'] < 10)
users.loc[age_outlier, 'Age'] = mean
users['Age'] = users['Age'].fillna(mean).astype(int)


In [None]:
#rechecking
print(sorted(users.Age.unique()))


In [None]:
# Split each "city, state, country" Location string into three lower-cased columns.
# A part that is absent or is one of the placeholder values below becomes 'other'.
INVALID_LOCATION_PARTS = (' ', '', 'n/a', ',')

# .str.split yields NaN (not a list) for a missing Location; treat that as "no parts"
# so the indexing below cannot fail on a float.
list_ = [parts if isinstance(parts, list) else []
         for parts in users.Location.str.split(', ')]

city = []
state = []
country = []
count_no_state = 0      # rows without a usable state
count_no_country = 0    # rows without a usable country

for parts in list_:
    # City: first part
    if len(parts) < 1 or parts[0] in INVALID_LOCATION_PARTS:
        city.append('other')
    else:
        city.append(parts[0].lower())

    # State: second part
    if len(parts) < 2 or parts[1] in INVALID_LOCATION_PARTS:
        state.append('other')
        count_no_state += 1
    else:
        state.append(parts[1].lower())

    # Country: third part. The ',' placeholder is now tested on the country part
    # itself; it was previously tested on the state part (index 1) by mistake.
    if len(parts) < 3 or parts[2] in INVALID_LOCATION_PARTS:
        country.append('other')
        count_no_country += 1
    else:
        country.append(parts[2].lower())

users = users.drop('Location', axis=1)

# Some city entries are written as "city/state"; keep only the text before the
# first '/' because the state already has its own column.
temp = [ent.split('/')[0] for ent in city]

# Build the new columns on the users index so the concatenation is row-aligned
# even if that index is not the default 0..n-1 range.
df_city = pd.DataFrame(temp, columns=['City'], index=users.index)
df_state = pd.DataFrame(state, columns=['State'], index=users.index)
df_country = pd.DataFrame(country, columns=['Country'], index=users.index)

users = pd.concat([users, df_city, df_state, df_country], axis=1)

print(count_no_country)   # number of rows that had no usable country
print(count_no_state)     # number of rows that had no usable state

In [None]:
## Drop duplicate rows (keeping the last occurrence) and renumber the index
users = users.drop_duplicates(keep='last').reset_index(drop=True)


In [None]:
users.info()

In [None]:
users.head()

In [None]:
users

### Book-Ratings Dataset Processing

In [None]:
ratings

In [None]:
## Checking the columns for Book-Ratings Dataset
print("Columns: ", list(ratings.columns))
ratings.head()

In [None]:
## Checking for null values
ratings.isnull().sum() 

In [None]:
## checking all ratings number or not
print(is_numeric_dtype(ratings['Book-Rating']))

In [None]:
## checking User-ID contains only number or not
print(is_numeric_dtype(ratings['User-ID']))

In [None]:
## Check the ISBNs: prints "True" when every ISBN is purely alphanumeric, else "False"
k = []
reg = "[^A-Za-z0-9]"   # matches any character that is not a letter or digit

flag = 1 if any(re.search(reg, isbn) for isbn in ratings['ISBN']) else 0

print("False" if flag == 1 else "True")

In [None]:
## Remove extra (non-alphanumeric) characters from a ratings ISBN when the cleaned
## value exists in the books dataset; other ISBNs are left as they are.
# A set gives constant-time membership tests; the former list was scanned once per
# offending row inside a row-by-row iterrows() loop.
bookISBN = set(new_books['ISBN'])
reg = "[^A-Za-z0-9]"

# Rows whose ISBN contains at least one character outside A-Z, a-z, 0-9
needs_cleaning = ratings['ISBN'].str.contains(reg, regex=True, na=False)
cleaned = ratings.loc[needs_cleaning, 'ISBN'].str.replace(reg, "", regex=True)

# Only write back the cleaned values that are known book ISBNs
resolvable = cleaned[cleaned.isin(bookISBN)]
ratings.loc[resolvable.index, 'ISBN'] = resolvable

In [None]:
## Uppercasing all alphabets in ISBN
ratings['ISBN'] = ratings['ISBN'].str.upper()

In [None]:
## Drop duplicate rows (keeping the last occurrence) and renumber the index
ratings = ratings.drop_duplicates(keep='last').reset_index(drop=True)

In [None]:
ratings.info()

In [None]:
ratings.head()

In [None]:
ratings

### Merging all the Tables Together

### Merging Books, Users, Ratings table 

In [None]:
# Inner-join books with ratings on ISBN, then with users on User-ID, so only
# ratings that have both a known book and a known user remain.
dataset = (
    new_books
    .merge(ratings, on='ISBN', how='inner')
    .merge(users, on='User-ID', how='inner')
)
dataset.info()

### Divide complete data on the basis of Implicit and Explicit ratings datasets

In [None]:
## Explicit Ratings Dataset: rows whose rating is non-zero
is_explicit = dataset['Book-Rating'] != 0
dataset1 = dataset.loc[is_explicit].reset_index(drop=True)
dataset1.shape

In [None]:
## Implicit Ratings Dataset: rows whose rating is zero
is_implicit = dataset['Book-Rating'] == 0
dataset2 = dataset.loc[is_implicit].reset_index(drop=True)
dataset2.shape

In [None]:
dataset1.head()

In [None]:
dataset1

### Data Visualization

In [None]:
# Number of books per publication year, keyed by the year as a string and
# ordered by that string key.
year_counts = new_books['Year-Of-Publication'].astype(str).value_counts().sort_index()
publications = year_counts.to_dict()

fig = plt.figure(figsize=(55, 15))
plt.bar(list(publications), list(publications.values()), color='purple')
plt.title("Number of books published yearly")
plt.xlabel("Year of Publication")
plt.ylabel("Number of books published")
plt.margins(x=0)
plt.show()

In [None]:
!pip3 install seaborn

In [None]:
sns.color_palette()

In [None]:
## Sorting the order in Decending order

plt.figure(figsize=(15,6))
sns.countplot(y="Book-Author", data=new_books,order=new_books['Book-Author'].value_counts().index[0:15],palette= 'CMRmap')
plt.title("No of books by each author (Top 15)")

In [None]:
sns.pairplot(ratings)

In [None]:
sns.pairplot(new_books)

In [None]:
sns.pairplot(users)

In [None]:
plt.figure(figsize=(15,6))
sns.countplot(x="Publisher", data=new_books,order=new_books['Publisher'].value_counts().index[0:15])
plt.title("No of books published each publisher (Top 15)")

In [None]:
plt.figure(figsize=(8,6))
sns.countplot(y="Book-Rating", data=ratings, palette= 'dark')

In [None]:
## Explicit Ratings
plt.figure(figsize=(8,6))
data = ratings[ratings['Book-Rating'] != 0]
sns.countplot(x="Book-Rating", data=data, palette= 'winter_r')
plt.title("Explicit Ratings")

In [None]:
plt.figure(figsize=(8,6))
users.Age.hist(bins=[10*i for i in range(1, 10)])     
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()



In [None]:
plt.figure(figsize=(20,6))
sns.countplot(x="City", data=users,order=users['City'].value_counts().index[0:15],palette= 'vlag')
plt.title("No of readers from each city (Top 15)")

In [None]:
plt.figure(figsize=(20,6))
sns.countplot(y="State", data=users,order=users['State'].value_counts().index[0:15],palette= 'terrain_r')
plt.title("No of readers from each state (Top 15)")

In [None]:
plt.figure(figsize=(15,8))
sns.countplot(x="Country", data=users, order=users['Country'].value_counts().index[0:10],palette= 'dark')
plt.title("No of readers from each country (Top 10)")

In [None]:
data=users[users['Country']=='usa']
plt.figure(figsize=(20,6))
sns.countplot(x="State", data=data,order=data['State'].value_counts().index[0:15],palette= 'tab20c')
plt.title("No of readers from states of USA (Top 15)")

In [None]:
plt.figure(figsize=(15,8))
sns.countplot(y="Book-Title", data=dataset, order=dataset['Book-Title'].value_counts().index[0:15],palette= 'spring_r')
plt.title("Number of Ratings for a book (Top 15)")

### Book Recommendation System

In [None]:
bookName = input("Enter a book name: ")
number = int(input("Enter number of books to recommend: "))



### Popular Books in the Collection

In [None]:
def popularity_based(dataframe, n):
    """Return the `n` most-rated books of `dataframe` with their book details.

    Ratings are counted per ISBN, the `n` ISBNs with the highest counts are kept
    (highest first) and joined with the notebook-level `new_books` frame on ISBN.

    Parameters:
        dataframe: frame with at least the columns 'ISBN' and 'Book-Rating'.
        n: number of books wanted; must satisfy 1 <= n <= len(dataframe).

    Returns:
        The merged DataFrame, or the string "Invalid number of books entered!!"
        when `n` is outside the accepted range.
    """
    if n < 1 or n > len(dataframe):
        return "Invalid number of books entered!!"

    rating_counts = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count())
    top_n = rating_counts.sort_values('Book-Rating', ascending=False).head(n)
    return pd.merge(top_n, new_books, on='ISBN', left_index = False)

In [None]:
print("Top", number, "Popular books are: ")
popularity_based(dataset1, number)

### Popular Books in the given place

In [None]:
def search_unique_places(dataframe, place):
    """Return the rows of `dataframe` located in `place`.

    `place` is lower-cased and looked up first among the cities, then the states,
    then the countries; the rows of the first column containing it are returned.

    Parameters:
        dataframe: frame with the columns 'City', 'State' and 'Country'.
        place: name of a city, state or country (case-insensitive).

    Returns:
        The matching rows as a DataFrame, or the string "Invalid Entry" when the
        place occurs in none of the three columns.
    """
    wanted = place.lower()

    for column in ('City', 'State', 'Country'):
        if wanted in list(dataframe[column].unique()):
            return dataframe[dataframe[column] == wanted]

    return "Invalid Entry"

In [None]:
place = input("Enter the name of place: ")
data = search_unique_places(dataset1, place)

if isinstance(data, pd.DataFrame):
    data = popularity_based(data, number)

data

### Books by same author, publisher of given book name

In [None]:
def printBook(k, n):
    """Print up to `n` distinct book titles of frame `k`, one per line.

    Titles are printed in order of first appearance in k['Book-Title'].
    When `k` has any title, at least one is printed even if `n` is below 1.
    """
    titles = k['Book-Title'].unique()
    for title in titles[:max(n, 1)]:
        print(title)

In [None]:
def get_books(dataframe, name, n):
    """Print up to `n` other books by the same author and by the same publisher.

    The author and publisher are taken from the first distinct value of the
    'Book-Author' / 'Publisher' columns of `dataframe`. Candidate books come from
    the notebook-level `dataset2` frame, excluding rows titled `name`, and are
    sorted by 'Book-Rating' before printing.

    Parameters:
        dataframe: rows describing the reference book.
        name: title of the reference book (excluded from the suggestions).
        n: maximum number of titles printed per section.

    A section prints only its heading when `dataframe` is empty or when no other
    book shares the author/publisher. Previously that case raised an error for the
    author section and re-printed the author's books under the publisher heading.
    """
    candidates = dataset2[dataset2['Book-Title'] != name]

    sections = (
        ("\nBooks by same Author:\n", 'Book-Author'),
        ("\n\nBooks by same Publisher:\n", 'Publisher'),
    )
    for heading, column in sections:
        print(heading)
        reference_values = dataframe[column].unique()
        if len(reference_values) == 0:
            # No reference row: nothing to compare against
            continue
        same_value = candidates[candidates[column] == reference_values[0]]
        printBook(same_value.sort_values(by=['Book-Rating']), n)

In [None]:
# Print other books by the same author / publisher as the requested title.
if bookName in list(dataset1['Book-Title'].unique()):
    d = dataset2[dataset2['Book-Title'] == bookName]
    if d.empty:
        # The title is validated against dataset1 but its rows were read from
        # dataset2, where it may not occur at all; fall back to the dataset1 rows
        # so that an author and a publisher are always available.
        d = dataset1[dataset1['Book-Title'] == bookName]
    get_books(d, bookName, number)
else:
    print("Invalid Book Name!!")

### Books Popular by Year

In [None]:
# Rating count per ISBN, most-rated first, joined with the book details
rating_counts = pd.DataFrame(dataset1.groupby('ISBN')['Book-Rating'].count())
data = rating_counts.sort_values('Book-Rating', ascending=False)
data = pd.merge(data, new_books, on='ISBN', left_index = False)

# Rows are ordered by popularity, so the first row seen for each year is that
# year's most-rated book; every later row of the same year is discarded.
data = (
    data
    .drop_duplicates(subset='Year-Of-Publication', keep='first')
    .drop('Book-Rating', axis = 1)
    .sort_values('Year-Of-Publication', ascending=False)
)

# Note: this changes the display options for the rest of the session
pd.set_option("display.max_rows", None, "display.max_columns", None)
data

###  Collaborative Filtering 


### Selecting books with total ratings equals to or more than 50

In [None]:
# Number of explicit ratings per title, as a two-column frame
# ['Book-Title', 'Total-Ratings']. Naming the index and the value column explicitly
# avoids depending on how value_counts() labels its result, which changed in
# pandas 2.0 (the value column became 'count' and the index took the series name),
# breaking the former df['Book-Title'] / drop('index') steps.
df = (
    dataset1['Book-Title']
    .value_counts()
    .rename_axis('Book-Title')
    .reset_index(name='Total-Ratings')
)

# Attach the per-title total to every rating row and drop columns not needed below
df = dataset1.merge(df, left_on = 'Book-Title', right_on = 'Book-Title', how = 'left')
df = df.drop(['Year-Of-Publication','Publisher','Age','City','State','Country'], axis=1)

# Keep only the ratings of titles with at least `popularity_threshold` ratings
popularity_threshold = 50
popular_book = df[df['Total-Ratings'] >= popularity_threshold]
popular_book = popular_book.reset_index(drop = True)

### User - Item Collaborative Filtering

In [None]:
# Group the (User-ID, Book-Rating) pairs of the popular books by ISBN.
# Column order matters: the next cell reads User-ID at position 0 and
# Book-Rating at position 1.
testdf = popular_book[['User-ID', 'Book-Rating']].groupby(popular_book['ISBN'])

In [None]:
# One {User-ID: rating} dictionary per ISBN, plus maps between the ISBN and the
# row number it receives in the vectorized matrix.
listOfDictonaries = []
indexMap = {}          # row number -> ISBN
reverseIndexMap = {}   # ISBN -> row number

for ptr, groupKey in enumerate(testdf.groups.keys()):
    groupDF = testdf.get_group(groupKey)
    tempDict = {groupDF.iloc[i, 0]: groupDF.iloc[i, 1] for i in range(len(groupDF))}
    indexMap[ptr] = groupKey
    reverseIndexMap[groupKey] = ptr
    listOfDictonaries.append(tempDict)

# Sparse ISBN x user rating matrix, then cosine similarity between every pair of ISBNs
dictVectorizer = DictVectorizer(sparse=True)
vector = dictVectorizer.fit_transform(listOfDictonaries)
pairwiseSimilarity = cosine_similarity(vector)

In [None]:
def printBookDetails(bookID):
    """Print the title of the first `dataset1` row whose ISBN equals `bookID`.

    Reads the notebook-level `dataset1` frame. Raises IndexError when no row
    has that ISBN.
    """
    matching_titles = dataset1.loc[dataset1['ISBN'] == bookID, 'Book-Title']
    print(matching_titles.values[0])

def getTopRecommandations(bookID):
    """Print and return the titles most similar to the book with ISBN `bookID`.

    Similarity is read from the notebook-level `pairwiseSimilarity` matrix using
    `reverseIndexMap` / `indexMap` to translate between ISBNs and matrix rows.
    At most `number` (notebook-level) distinct titles are returned, most similar first.

    Parameters:
        bookID: ISBN of the reference book; must be a key of `reverseIndexMap`
            (KeyError otherwise).

    Returns:
        List of recommended titles, in the order they were printed.
    """
    def title_of(isbn):
        # Title of the first dataset1 row carrying this ISBN
        return dataset1[dataset1['ISBN'] == isbn]['Book-Title'].values[0]

    row = reverseIndexMap[bookID]
    input_title = title_of(bookID)

    print("Input Book:")
    printBookDetails(bookID)

    print("\nRECOMMENDATIONS:\n")

    collaborative = []
    # Walk all candidates from most to least similar. The reference book is skipped
    # by its row number; the former `[:-2]` slice assumed it was sorted last and also
    # discarded the single most similar other book.
    for i in np.argsort(pairwiseSimilarity[row])[::-1]:
        if len(collaborative) >= number:
            break
        if i == row:
            continue
        title = title_of(indexMap[i])
        # Skip other ISBNs of the reference title (same rule as the content-based
        # section, which never recommends the input title) and repeated titles.
        if title == input_title or title in collaborative:
            continue
        collaborative.append(title)
        printBookDetails(indexMap[i])
    return collaborative

In [None]:
# Find the ISBN of the first dataset1 row titled `bookName` and recommend from it.
# Raises ValueError when the title does not occur in dataset1.
k = dataset1['Book-Title'].tolist()
m = dataset1['ISBN'].tolist()

first_match = k.index(bookName)
collaborative = getTopRecommandations(m[first_match])

In [None]:
popularity_threshold = 50

# Keep the ratings made by users with at least `popularity_threshold` ratings
user_count = dataset1['User-ID'].value_counts()
active_users = user_count[user_count >= popularity_threshold].index
data = dataset1[dataset1['User-ID'].isin(active_users)]

# Keep the rows whose rating *value* occurs at least `popularity_threshold` times.
# NOTE(review): this filters on how often a score (e.g. 7) appears, not on how
# often a book was rated — confirm that is the intended filter.
rat_count = data['Book-Rating'].value_counts()
frequent_ratings = rat_count[rat_count >= popularity_threshold].index
data = data[data['Book-Rating'].isin(frequent_ratings)]

# User x ISBN rating matrix; unrated cells become 0
matrix = data.pivot_table(index='User-ID', columns='ISBN', values = 'Book-Rating').fillna(0)

In [None]:
# Mean explicit rating per ISBN (from dataset1), plus the number of rating rows
# per ISBN counted over the whole `ratings` table.
average_rating = dataset1.groupby('ISBN')['Book-Rating'].mean().to_frame()
average_rating['ratingCount'] = ratings.groupby('ISBN')['Book-Rating'].count()
average_rating.sort_values('ratingCount', ascending=False).head()

In [None]:
# ISBN of the first book in new_books titled `bookName`
matching_books = new_books.loc[new_books['Book-Title'] == bookName].reset_index(drop = True)
isbn = matching_books.iloc[0]['ISBN']

# Pearson correlation of every ISBN column with the reference ISBN's column
row = matrix[isbn]
correlation = pd.DataFrame(matrix.corrwith(row), columns = ['Pearson Corr'])
corr = correlation.join(average_rating['ratingCount'])

# Take the number+1 best correlations and drop the first one
# (presumably the reference book itself — confirm).
ranked = corr.sort_values('Pearson Corr', ascending=False)
res = ranked.head(number+1)[1:].index
corr_books = pd.merge(pd.DataFrame(res, columns = ['ISBN']), new_books, on='ISBN')
print("\n Recommended Books: \n")
corr_books

##  Nearest Neighbours Based

In [None]:
# Number of explicit ratings per title
rating_totals = dataset1.groupby(by = ['Book-Title'])['Book-Rating'].count().reset_index()
data = rating_totals.rename(columns = {'Book-Rating': 'Total-Rating'})[['Book-Title', 'Total-Rating']]

# Attach the totals to the rating rows and keep titles that reach the threshold
# (`popularity_threshold` is whatever an earlier cell last assigned).
result = pd.merge(data, dataset1, on='Book-Title', left_index = False)
result = result[result['Total-Rating'] >= popularity_threshold].reset_index(drop = True)

# Title x user rating matrix (0 where unrated) and its sparse form
matrix = result.pivot_table(index = 'Book-Title', columns = 'User-ID', values = 'Book-Rating').fillna(0)
up_matrix = csr_matrix(matrix)

In [None]:
# Brute-force cosine nearest neighbours over the title x user matrix
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model.fit(up_matrix)

# Ask for one neighbour more than needed; the first result is skipped below
# (presumably it is the query title itself — confirm).
query_vector = matrix.loc[bookName].values.reshape(1, -1)
distances, indices = model.kneighbors(query_vector, n_neighbors = number+1)
print("\nRecommended books:\n")
for neighbour in indices.flatten()[1:]:
    print(matrix.index[neighbour])

## Content Based

In [None]:
popularity_threshold = 80
popular_book = df[df['Total-Ratings'] >= popularity_threshold]
popular_book = popular_book.reset_index(drop = True)
popular_book.shape

In [None]:
tf = TfidfVectorizer(ngram_range=(1, 2), min_df = 1, stop_words='english')
tfidf_matrix = tf.fit_transform(popular_book['Book-Title'])
tfidf_matrix.shape

In [None]:
normalized_df = tfidf_matrix.astype(np.float32)
cosine_similarities = cosine_similarity(normalized_df, normalized_df)
cosine_similarities.shape

In [None]:
print("Recommended Books:\n")
# ISBN of the first book in new_books titled `bookName`
isbn = new_books.loc[new_books['Book-Title'] == bookName].reset_index(drop = True).iloc[0]['ISBN']

# Row of that ISBN in popular_book (IndexError if the ISBN is not among the popular books)
idx = popular_book.index[popular_book['ISBN'] == isbn].tolist()[0]
# Candidate rows ordered from most to least similar title text
similar_indices = cosine_similarities[idx].argsort()[::-1]

titles = popular_book['Book-Title']
similar_items = []
for i in similar_indices:
    # Stop as soon as enough titles are collected; the loop previously kept
    # scanning (and indexing the frame for) every remaining row.
    if len(similar_items) >= number:
        break
    candidate = titles[i]
    if candidate != bookName and candidate not in similar_items:
        similar_items.append(candidate)

# `content` is the list consumed by the hybrid section; same titles, same order
content = list(similar_items)

for book in similar_items:
    print(book)

## Hybrid Approach (Content+Collaborative) 

In [None]:
# Rank weights: position 0 weighs 1.0 and each later position 1/number less
k = float(1/number)
z = [1 - k*x for x in range(number)]

# Score every title by the weight of its rank in the collaborative list ...
dictISBN = {title: z[collaborative.index(title)] for title in collaborative}

# ... and add the weight of its rank in the content-based list
for title in content:
    dictISBN[title] = dictISBN.get(title, 0) + z[content.index(title)]

# Titles ordered by combined score, highest first (the keys are titles, not ISBNs)
ISBN = dict(sorted(dictISBN.items(), key=lambda item: item[1], reverse=True))

print("Input Book:\n")
print(bookName)
print("\nRecommended Books:\n")
for title in list(ISBN)[:max(number, 0)]:
    print(title)