In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subfolders, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import neighbors
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from tqdm import tqdm
from progressbar import ProgressBar
import re
from scipy.cluster.vq import kmeans, vq
from pylab import plot, show
from matplotlib.lines import Line2D
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans

In [None]:
# Load the Goodreads books CSV. Rows that cannot be parsed are skipped rather
# than aborting the load; `on_bad_lines='skip'` is the replacement for the
# removed `error_bad_lines=False` flag (requires pandas >= 1.3).
df = pd.read_csv('/kaggle/input/goodreadsbooks/books.csv', on_bad_lines='skip')
# Independent copy taken before any cleaning or re-indexing of `df`; it keeps
# the default 0..n-1 row index that read_csv produces.
dataset_for_recommendation = df.copy()
# Preview the first 50 rows.
df[:50]

In [None]:
# Index the exploration frame by bookID while keeping bookID as a regular column.
df = df.set_index('bookID', drop=False)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Any cell (in any column) that exactly equals the combined credit below is
# rewritten to the single author name.
combined_credit = 'J.K. Rowling-Mary GrandPré'
df = df.replace(to_replace=combined_credit, value='J.K. Rowling')

In [None]:
# Show the first rows of the frame after the replacement above.
df.head()

In [None]:
# Horizontal bar chart of the 20 titles that occur most often in the dataset.
sns.set_context('poster')
plt.figure(figsize=(20, 15))
# value_counts() sorts by descending frequency, so head(20) is the top 20.
title_counts = df['title'].value_counts().head(20)
sns.barplot(x=title_counts, y=title_counts.index, palette='deep')
plt.title("Most Occurring Books")
plt.xlabel("Number of occurrences")
plt.ylabel("Books")
plt.show()

In [None]:
# Horizontal bar chart of the five most frequent language codes.
sns.set_context('paper')
plt.figure(figsize=(5, 10))
books = df['language_code'].value_counts()[:5]
# Counts run along x and the language codes along y, so the axis labels
# must follow that orientation.
sns.barplot(x=books, y=books.index, palette='deep')
plt.title("Most Common Language Code")
plt.xlabel("Number of Books")
plt.ylabel("Language Code")
plt.show()

In [None]:
# Ten books with the highest ratings_count, indexed by title for plotting.
most_rated = df.sort_values('ratings_count', ascending=False).head(10).set_index('title')
plt.figure(figsize=(15, 10))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the other plots in this notebook already use keywords.
sns.barplot(x=most_rated['ratings_count'], y=most_rated.index, palette='deep')
plt.title("Top 10 Most Rated Books")
plt.show()

A copy of the original dataset (`dataset_for_recommendation`) was created when the data was loaded. Further below, a new column called 'rating_range' is added to that copy to bucket the average ratings. For example, if the average rating is 3.6, the value of that row will be '3-4'.

In [None]:
# Ten publishers with the most rows in the dataset, drawn as horizontal bars.
publisher = df.value_counts('publisher').iloc[:10]
plt.figure(figsize=(15, 10))
sns.barplot(x=publisher, y=publisher.index, palette='deep')

In [None]:
# Histogram plus KDE of page counts. `histplot(kde=True, stat='density')` is
# the replacement for the deprecated `distplot`.
# The column name is used exactly as it appears here, with two leading spaces.
sns.histplot(df['  num_pages'], kde=True, stat='density')
plt.title('Distribution of Number of Pages in a book')

In [None]:
sns.set_context('talk')
# Number of titles per author string, keeping the ten largest counts.
most_books = (
    df.groupby('authors')['title']
    .count()
    .reset_index()
    .sort_values('title', ascending=False)
    .head(10)
    .set_index('authors')
)
plt.figure(figsize=(15, 10))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
ax = sns.barplot(x=most_books['title'], y=most_books.index, palette='deep')
ax.set_title("Top 10 authors with most books")
ax.set_xlabel("Total number of books")
# Write each bar's value just past its right-hand end.
for bar in ax.patches:
    ax.text(bar.get_width() + .3, bar.get_y() + 0.5, str(round(bar.get_width())), fontsize=10, color='k')

In [None]:
# Bucket average_rating into unit-wide bands stored in a 'rating_range' column.
# Band k covers [k, k+1); only the last band also includes its upper bound so
# that a rating of exactly 5 lands in "4-5".
band_labels = ["0-1", "1-2", "2-3", "3-4", "4-5"]
average = dataset_for_recommendation['average_rating']
for lower, band_label in enumerate(band_labels):
    upper = lower + 1
    if upper == 5:
        under_upper = average <= upper
    else:
        under_upper = average < upper
    dataset_for_recommendation.loc[(average >= lower) & under_upper, 'rating_range'] = band_label

Label-encode the rating_range and language_code columns, since both are categorical variables.

In [None]:
# One encoder instance is refitted for each column in turn, so after this cell
# it holds the classes of 'language_code' (the column fitted last).
label_encoder = LabelEncoder()
encoded_rating, encoded_language = (
    label_encoder.fit_transform(dataset_for_recommendation[column_name])
    for column_name in ('rating_range', 'language_code')
)

Combining these two encoded columns with the average_rating and ratings_count columns into a single feature table, then applying MinMaxScaler so that every feature is normalized to the range 0 to 1 for nearest neighbors

In [None]:
# Assemble the feature table row-by-row (positionally) instead of relying on
# index alignment between freshly built Series and the source frame, and give
# every column a string name: a frame whose column labels mix ints and strings
# is rejected by recent scikit-learn versions during input validation.
concatenated_dataset = pd.DataFrame(
    {
        'rating_range_encoded': encoded_rating,
        'language_code_encoded': encoded_language,
        'average_rating': dataset_for_recommendation['average_rating'].to_numpy(),
        'ratings_count': dataset_for_recommendation['ratings_count'].to_numpy(),
    },
    index=dataset_for_recommendation.index,
)
# Rescale each column to [0, 1] so no single feature dominates the distances.
min_max_scaler = MinMaxScaler()
features = min_max_scaler.fit_transform(concatenated_dataset)

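Getting the 10 nearest neighbors (in this case, the 10 most similar books) using kd_tree as the algorithm for Nearest Neighbors. n_neighbors is set to 11 because every book is queried against the same data the model was fitted on, so the book itself is normally returned as its own closest neighbor. Then, getting the distance matrix and the neighbor indices with the kneighbors function. No labels are used to build the recommendations, so this is unsupervised learning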

In [None]:
# Fit a kd-tree nearest-neighbour index on the scaled features, then query it
# with those same rows: row i of each result describes the 11 nearest rows to
# book i (distances in the first matrix, row positions in the second).
nn_model = neighbors.NearestNeighbors(algorithm='kd_tree', n_neighbors=11).fit(features)
distance_matrix, ids_to_list_matrix = nn_model.kneighbors(features)

Creating three functions to get book recommendations by name, by rating and by author. For name and author, we search for an exact match and, from the fitted model, fetch the books with the most similar features. For ratings, we take the first book whose average rating is greater than or equal to the given rating and return its nearest neighbors

In [None]:
def _first_matching_position(mask, description):
    """Return the 0-based row position of the first truthy entry in ``mask``.

    Raises:
        IndexError: if no entry is truthy, i.e. no book matches the query.
    """
    for position, is_match in enumerate(mask):
        if is_match:
            return position
    raise IndexError("no book found for " + description)


def _titles_around(position):
    """Return the anchor book's title followed by the titles of its neighbours.

    ``ids_to_list_matrix`` holds row positions, so titles are looked up with
    ``iloc`` rather than by index label. The anchor is always placed first:
    when several books share identical features the model is not guaranteed
    to rank a book ahead of its duplicates, and callers slice ``[1:]`` to drop
    the anchor. The returned list has as many entries as the neighbour row.
    """
    neighbour_row = ids_to_list_matrix[position]
    others = [int(candidate) for candidate in neighbour_row if candidate != position]
    ordered = [position] + others[:len(neighbour_row) - 1]
    return dataset_for_recommendation['title'].iloc[ordered].tolist()


def book_recommendation_by_name(name):
    """Recommend books similar to the first book whose title equals ``name``.

    Returns:
        list of titles; the matched book itself is the first entry.

    Raises:
        IndexError: if no book has exactly this title.
    """
    matches = dataset_for_recommendation['title'] == name
    return _titles_around(_first_matching_position(matches, f"title {name!r}"))


def book_recommendation_by_rating(rating):
    """Recommend books similar to the first book rated at least ``rating``.

    The anchor is the first row (in dataset order) whose ``average_rating`` is
    greater than or equal to ``rating``; it is the first entry of the result.

    Raises:
        IndexError: if no book reaches this rating.
    """
    matches = dataset_for_recommendation['average_rating'] >= rating
    return _titles_around(_first_matching_position(matches, f"average_rating >= {rating!r}"))


def book_recommendation_by_author(author):
    """Recommend books similar to the first book whose authors field equals ``author``.

    Returns:
        list of titles; the matched book itself is the first entry.

    Raises:
        IndexError: if no book has exactly this authors value.
    """
    matches = dataset_for_recommendation['authors'] == author
    return _titles_around(_first_matching_position(matches, f"authors {author!r}"))

The `[1:]` slice drops the first entry of the returned list, i.e. the input book itself, so that only the recommendations are shown

In [None]:
# Recommendations for an exact title match; [1:] skips the first entry, which
# is expected to be the queried book itself.
book_recommendation_by_name("Harry Potter and the Order of the Phoenix (Harry Potter  #5)")[1:]

In [None]:
# Same lookup for a second title, again skipping the first entry of the result.
book_recommendation_by_name("The Control of Nature")[1:]

In [None]:
# Neighbours of the first book whose average_rating is >= 4.75. No [1:] slice
# is applied here, so the full returned list is shown.
book_recommendation_by_rating(4.75)

In [None]:
# Neighbours of the first book whose authors field is exactly 'J.R.R. Tolkien'.
# No [1:] slice is applied here, so the full returned list is shown.
book_recommendation_by_author('J.R.R. Tolkien')