## Introduction
Greetings from the Kaggle bot! This is an automatically-generated kernel with starter code demonstrating how to read in the data and begin exploring. Click the blue "Edit Notebook" or "Fork Notebook" button at the top of this kernel to begin editing.

## Exploratory Analysis
To begin this exploratory analysis, first use `matplotlib` to import libraries and define functions for plotting the data. Depending on the data, not all plots will be made. (Hey, I'm just a kerneling bot, not a Kaggle Competitions Grandmaster!)

In [None]:
import numpy as np 
import pandas as pd
import os
import seaborn as sns
#import isbnlib
#from newspaper import Article
import matplotlib.pyplot as plt
plt.style.use('ggplot')
#from tqdm import tqdm
import re
from scipy.cluster.vq import kmeans, vq
from pylab import plot, show
from matplotlib.lines import Line2D
import matplotlib.colors as mcolors
#import goodreads_api_client as gr
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the books table, skipping malformed rows. `error_bad_lines` was
# deprecated in pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`, so
# try the current keyword first and fall back for older installations.
try:
    df1 = pd.read_csv('../input/books.csv', on_bad_lines='skip')
except TypeError:  # older pandas does not accept `on_bad_lines`
    df1 = pd.read_csv('../input/books.csv', error_bad_lines=False)

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a histogram or bar chart for each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are
    considered. Numeric columns are drawn as histograms, all others as bar
    charts of their value counts.

    Args:
        df: DataFrame to visualise.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of subplots per grid row.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]]
    nRow, nCol = df.shape
    if nCol == 0:
        # Nothing to draw; a zero-row grid would produce an invalid figure size.
        print('No distribution plots shown: no column has between 2 and 49 unique values')
        return
    columnNames = list(df)
    # Ceiling division. It must stay an int because plt.subplot rejects floats.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The first value's type decides between a bar chart and a histogram.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Show the correlation matrix of the numeric, NaN-free columns of ``df``.

    Args:
        df: DataFrame to analyse. If it carries a ``dataframeName`` attribute,
            that value is used in the plot title.
        graphWidth: Width and height of the square figure, in inches.

    Returns:
        None. Prints a message and returns early when fewer than two usable
        columns remain; otherwise the figure is shown with ``plt.show()``.
    """
    # The attribute is set ad hoc by the caller, so do not fail when it is absent.
    filename = getattr(df, 'dataframeName', 'dataframe')
    # Correlation is only defined for numeric data; recent pandas raises on
    # text columns instead of silently skipping them.
    df = df.select_dtypes(include=[np.number])
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Show a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    At most the first 10 usable columns are plotted, and every panel above the
    diagonal is annotated with the pair's correlation coefficient.

    Args:
        df: DataFrame to visualise.
        plotSize: Width and height of the figure, in inches.
        textSize: Font size of the correlation annotations.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly rather than the private `plt.np` alias.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


Now you're ready to read in the data and use the plotting functions to visualize the data.

### Let's check 1st file: ../input/books.csv

In [None]:
# Load the books table, skipping malformed rows. `error_bad_lines` was
# deprecated in pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`, so
# try the current keyword first and fall back for older installations.
try:
    df1 = pd.read_csv('../input/books.csv', on_bad_lines='skip')
except TypeError:  # older pandas does not accept `on_bad_lines`
    df1 = pd.read_csv('../input/books.csv', error_bad_lines=False)
# Ad-hoc attribute read by plotCorrelationMatrix for its plot title.
df1.dataframeName = 'books.csv'
# Index rows by bookID; the bookID column itself is kept as well.
df1.index = df1['bookID']
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
df1.head(5)

In [None]:
# Replace every cell equal to the co-credited author string with the single
# author name, in place, so those books group under one author.
df1.replace(to_replace='J.K. Rowling/Mary GrandPré', value = 'J.K. Rowling', inplace=True)

In [None]:
df1.head(5)

#### Columns Description: 

- **bookID** Contains the unique ID for each book/series
- **title** contains the titles of the books
- **authors** contains the author of the particular book
- **average_rating** the average rating of the books, as decided by the users
- **ISBN** ISBN(10) number, tells the information about a book - such as edition and publisher
- **ISBN 13** The new format for ISBN, implemented in 2007. 13 digits
- **language_code** Tells the language for the books
- **Num_pages** Contains the number of pages for the book
- **Ratings_count** Contains the number of ratings given for the book
- **text_reviews_count** Has the count of reviews left by users

# Exploratory Data Analysis<a id="3"></a> <br>

In [None]:
# Bar chart of the 30 titles that appear most often in the dataset.
sns.set_context('poster')
plt.figure(figsize=(50,15))
books = df1['title'].value_counts()[:30]
sns.barplot(x = books, y = books.index, palette='rocket')
plt.title("Most Occurring Books")
plt.xlabel("Number of occurances")
plt.ylabel("Books")
plt.show()

Looks like older books seem to show up more often. 

Correlation matrix:

In [None]:
plotCorrelationMatrix(df1, 8)

Scatter and density plots:

In [None]:
plotScatterMatrix(df1, 10, 20)

In [None]:
# The 45 books with the highest ratings_count, indexed by title for the y axis.
most_rated = df1.sort_values('ratings_count', ascending = False).head(45).set_index('title')
plt.figure(figsize=(20,20))
# x/y are passed by keyword because recent seaborn releases reject positional data vectors.
sns.barplot(x=most_rated['ratings_count'], y=most_rated.index, palette='rocket')


* This data is interesting because it shows that the top rated books are typically part of a series.
* What's even more interesting is that, while series dominate the top ratings counts, there are also a few instances where only the first book in the series was enjoyed and the rest of the series tanks after that, so it is basically a one-hit wonder.


In [None]:
# Count titles per author and keep the ten authors with the most titles.
most_books = (
    df1.groupby('authors')['title']
    .count()
    .reset_index()
    .sort_values('title', ascending=False)
    .head(10)
    .set_index('authors')
)
most_books.head(10)

In [None]:
# Find the ten authors with the most books and plot them.
sns.set_context('talk')
most_books = df1.groupby('authors')['title'].count().reset_index().sort_values('title', ascending=False).head(10).set_index('authors')
plt.figure(figsize=(15,10))
# x/y are passed by keyword because recent seaborn releases reject positional data vectors.
ax = sns.barplot(x=most_books['title'], y=most_books.index, palette='icefire_r')
ax.set_title("Top 10 authors with most books")
ax.set_xlabel("Total number of books")
# Write each bar's value just past its right-hand end.
for bar in ax.patches:
    ax.text(bar.get_width()+.3, bar.get_y()+0.5, str(round(bar.get_width())), fontsize = 10, color = 'k')

In [None]:
# Authors with the most books rated 4.3 or higher (top ten).
high_rated_author = df1[df1['average_rating']>=4.3]
high_rated_author = high_rated_author.groupby('authors')['title'].count().reset_index().sort_values('title', ascending = False).head(10).set_index('authors')
plt.figure(figsize=(15,10))
# x/y are passed by keyword because recent seaborn releases reject positional data vectors.
ax = sns.barplot(x=high_rated_author['title'], y=high_rated_author.index, palette='Set2')
ax.set_xlabel("Number of Books")
ax.set_ylabel("Authors")
# Write each bar's value just past its right-hand end.
for bar in ax.patches:
    ax.text(bar.get_width()+.3, bar.get_y()+0.5, str(round(bar.get_width())), fontsize = 10, color = 'k')

In [None]:
def segregation(data):
    """Map each value of ``data.average_rating`` to a one-point-wide bin label.

    Bins are closed on the right: (0..1], (1..2], ... (4..5], with 0 itself
    falling in the first bin. Anything outside 0..5 (including NaN) is
    labelled "NaN". The number of labels is printed before returning.
    """
    bins = (
        (1, "Between 0 and 1"),
        (2, "Between 1 and 2"),
        (3, "Between 2 and 3"),
        (4, "Between 3 and 4"),
        (5, "Between 4 and 5"),
    )
    labels = []
    for rating in data.average_rating:
        label = "NaN"
        if rating >= 0:
            # The first upper bound that is not exceeded decides the bin.
            for upper, text in bins:
                if rating <= upper:
                    label = text
                    break
        labels.append(label)
    print(len(labels))
    return labels

In [None]:
df1.average_rating.isnull().value_counts()

In [None]:
# Remove every row that contains a null value. The axis is passed by keyword
# because recent pandas no longer accepts it positionally.
df1.dropna(axis=0, inplace=True)

In [None]:
# Distribution of average ratings, drawn with 20 histogram bins.
plt.figure(figsize=(10,10))
# Cast to float so the plot receives numeric values.
rating= df1.average_rating.astype(float)
# NOTE(review): sns.distplot appears to be deprecated in recent seaborn
# releases. Confirm against the installed version before upgrading.
sns.distplot(rating, bins=20)


From the given plot, we can infer that: 

- Majority of the ratings lie near 3.7-4.3, approximately.
- Books having scores near 5 are extremely rare

In [None]:
# Check for any relation between average rating and text review count.
# No plt.figure() call here: jointplot creates its own figure, so a
# pre-created one would only be left behind empty.
df1.dropna(axis=0, inplace=True)
sns.set_context('paper')
ax =sns.jointplot(x="average_rating",y='text_reviews_count', kind='scatter',  data= df1[['text_reviews_count', 'average_rating']])
ax.set_axis_labels("Average Rating", "Text Review Count")
plt.show()

Analysis: We can infer from the plot that most of the ratings for the books seems to lie near 3-4, with a heavy amount of reviews lying barely near 5000, approximately. Let's plot that.

In [None]:
trial = df1[~(df1['text_reviews_count']>5000)]
trial.head(5)

In [None]:
# Same relation as before, restricted to the filtered `trial` frame.
# No plt.figure() call here: jointplot creates its own figure, so a
# pre-created one would only be left behind empty.
df1.dropna(axis=0, inplace=True)
sns.set_context('paper')
ax =sns.jointplot(x="average_rating",y='text_reviews_count', kind='scatter',  data= trial, color = 'green')
ax.set_axis_labels("Average Rating", "Text Review Count")
plt.show()

Shows majority of ratings are still where text-review count is under 1k

In [None]:
# Check the relationship between number of pages and average rating.
# No plt.figure() call here: jointplot creates its own figure, so a
# pre-created one would only be left behind empty.
sns.set_context('paper')
# The column name really does start with two spaces, as used throughout this notebook.
ax = sns.jointplot(x="average_rating", y="  num_pages", data = df1, color = 'crimson')
ax.set_axis_labels("Average Rating", "Number of Pages")

There is not much information here; we need to take a closer look and only consider books where num_pages is less than 1k.

In [None]:
trial = df1[~(df1['  num_pages']>1000)]
trial.head(5)

In [None]:
ax = sns.jointplot(x="average_rating", y="  num_pages", data = trial, color = 'darkcyan')
ax.set_axis_labels("Average Rating", "Number of Pages")

Seems the best and worst books are found in the 150-400 range of page numbers.

In [None]:
#check relationship between ratings and ratings count
sns.set_context('paper')
ax = sns.jointplot(x="average_rating", y="ratings_count", data = df1, color = 'orange')
ax.set_axis_labels("Average Rating", "Ratings Count")

In [None]:
# Get rid of outliers and look closer at books whose ratings count is at
# most 2,000,000 (rows above that threshold are excluded).
trial = df1[~(df1.ratings_count>2000000)]
trial.head(5)

In [None]:
sns.set_context('paper')
ax = sns.jointplot(x="average_rating", y="ratings_count", data = trial, color = 'brown')
ax.set_axis_labels("Average Rating", "Ratings Count")

From the graph, we can see that there can be a potential relationship between the average rating and ratings count. As the number of ratings increase, the rating for the book seems to taper towards 4. The average rating seems to become sparse while the number keeps on decreasing. We can also notice that a 5 star rating seems to only happen with a small ratings count

# K Means Clustering

* KMeans clustering is a type of unsupervised learning which groups unlabelled data. The goal is to find groups in data.

* Here I want to find natural clusters between the rating count and average rating value.








In [None]:
# Work on an independent copy: `trial` is modified later on, and mutating a
# slice of df1 triggers pandas' SettingWithCopy warning.
trial = df1[['average_rating', 'ratings_count','  num_pages','text_reviews_count']].copy()
# Two-column array for clustering: (average_rating, ratings_count) per row.
data = np.asarray([np.asarray(trial['average_rating']), np.asarray(trial['ratings_count'])]).T
trial.head(5)


# Determine how many clusters K 
* Find K using scree plot/elbow curve method

In [None]:
# Fit KMeans for k = 2..19 and record the inertia of each fit. A fixed
# random_state keeps the curve reproducible between runs.
X = data
cluster_counts = range(2, 20)
distortions = []
for k in cluster_counts:
    k_means = KMeans(n_clusters = k, random_state = 0)
    k_means.fit(X)
    distortions.append(k_means.inertia_)

plt.figure(figsize=(15,10))
plt.plot(cluster_counts, distortions, 'bx-')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Curve")

This shows that elbow is around 5-7 so we can start with 5 clusters

In [None]:
# Run scipy's k-means asking for 5 clusters; only the centroids are kept
# (the second return value is discarded).
centroids = kmeans(data, 5)[0]

# Vector quantisation: assign every sample the index of its nearest centroid.
idx = vq(data, centroids)[0]
idx

In [None]:
# Plot each cluster in its own colour using numpy's logical indexing.
sns.set_context('paper')
plt.figure(figsize=(15,10))
cluster_colors = ('red', 'blue', 'yellow', 'magenta', 'black')
for cluster, color in enumerate(cluster_colors):
    members = data[idx == cluster]
    # Labelling the plotted line itself keeps the legend colour identical to
    # the marker colour, instead of relying on separately built proxy handles.
    plt.plot(members[:, 0], members[:, 1], 'o', color=color, label=f'Cluster {cluster + 1}')
# Centroids as green squares; unlabelled, so they stay out of the legend.
plt.plot(centroids[:,0],centroids[:,1],'sg',markersize=8)

plt.legend(numpoints = 1, loc = 0)

plt.show()

* Remove some outliers to get a more accurate model

In [None]:
#find the outliers
trial.idxmax()


In [None]:
# Drop the two outlier rows identified above. Rebinding instead of dropping in
# place avoids mutating a frame that may still be a slice of df1.
trial = trial.drop([2034, 41865])

In [None]:
# Rebuild the two-column (average_rating, ratings_count) array from `trial`.
data = np.column_stack((trial['average_rating'], trial['ratings_count']))


In [None]:
# Run scipy's k-means asking for 5 clusters; only the centroids are kept
# (the second return value is discarded).
centroids = kmeans(data, 5)[0]

# Vector quantisation: assign every sample the index of its nearest centroid.
idx = vq(data, centroids)[0]
idx

In [None]:
# Plot each cluster in its own colour using numpy's logical indexing.
sns.set_context('paper')
plt.figure(figsize=(15,10))
cluster_colors = ('red', 'blue', 'yellow', 'magenta', 'black')
for cluster, color in enumerate(cluster_colors):
    members = data[idx == cluster]
    # Labelling the plotted line itself keeps the legend colour identical to
    # the marker colour, instead of relying on separately built proxy handles.
    plt.plot(members[:, 0], members[:, 1], 'o', color=color, label=f'Cluster {cluster + 1}')
# Centroids as green squares; unlabelled, so they stay out of the legend.
plt.plot(centroids[:,0],centroids[:,1],'sg',markersize=8)

plt.legend(numpoints = 1, loc = 0)

plt.show()

From the above plot, we can now see that the whole dataset _can_ be classified into clusters. As the ratings count increases, the rating ends up near the clusters shown above. The green squares are the centroids of the given clusters.

As the rating count seems to decrease, the average rating seems to become sparser, with higher volatility and less accuracy. 

# Making Recommendations

Having seen the clustering, we can infer that there can be some recommendations which can happen with the relation between Average Rating and Ratings Count. 

We can create a recommendation algorithm using K Nearest Neighbors on the ratings distribution.

Based on a book entered by the user, the nearest neighbours to it would be classified as the books which the user might like. 


One of the first things that needs to happen is the creation of a books feature table. This table is simply a binning of the books' ratings, such as:
- Between 0 and 1
- Between 1 and 2
- Between 2 and 3
- Between 3 and 4
- Between 4 and 5

The recommendations then consider the average ratings and ratings count for the query entered.

In [None]:
df1['Ratings_Dist'] = segregation(df1)
df1.head()

In [None]:
# Convert the categorical ratings distribution to indicator (0/1) columns and
# place them side by side with the average_rating and ratings_count columns.
books_features = pd.concat([df1['Ratings_Dist'].str.get_dummies(sep=","), df1['average_rating'], df1['ratings_count']], axis=1)


In [None]:
books_features.head()

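The min-max scaler is used to reduce the bias that would otherwise come from features on very different scales (for example, ratings counts in the millions next to average ratings between 0 and 5). It rescales every feature to the range [0, 1] using that feature's minimum and maximum, so no single feature dominates the distance computation.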

In [None]:
min_max_scaler = MinMaxScaler()
books_features = min_max_scaler.fit_transform(books_features)


In [None]:
np.round(books_features,2)

In [None]:
# Fit a nearest-neighbour index on the scaled feature matrix, then query it
# with the same matrix: for every row, `indices` holds the row positions of its
# 7 nearest neighbours and `distance` the matching distances.
model = neighbors.NearestNeighbors(n_neighbors=7, algorithm='auto')
model.fit(books_features)
distance, indices = model.kneighbors(books_features)


Creating specific functions to help in finding the book names: 
- Get index from Title
- Get ID from partial name
- Print the similar books from the feature dataset. 

In [None]:
def get_index_from_name(name, books=None):
    """Return the 0-based row position of the first book titled ``name``.

    The result is a row position, not an index label, because callers use it
    to index the positional ``indices`` / ``distance`` arrays, while the
    DataFrame index holds bookID labels.

    Args:
        name: Exact title to look up.
        books: DataFrame with a ``title`` column. Defaults to the module-level
            ``df1``.

    Returns:
        Row position (int) of the first matching title.

    Raises:
        IndexError: If no row has that exact title.
    """
    if books is None:
        books = df1
    matches = np.flatnonzero((books["title"] == name).values)
    if matches.size == 0:
        raise IndexError(f"no book titled {name!r}")
    return int(matches[0])

all_books_names = list(df1.title.values)

def get_id_from_partial_name(partial):
    """Print every title containing ``partial`` with its position in ``all_books_names``.

    Positions come from ``enumerate`` so that books sharing a title each report
    their own position (``list.index`` would always report the first one), and
    the scan stays a single pass over the list.

    Args:
        partial: Case-sensitive substring to search for.

    Returns:
        None. Matches are printed as ``title position``.
    """
    for position, name in enumerate(all_books_names):
        if partial in name:
            print(name, position)
            
def print_similar_books(query=None,id=None):
    """Print the nearest neighbours of a book given by row position and/or title.

    Args:
        query: Exact title to look up via ``get_index_from_name``.
        id: Row position into ``indices`` / ``distance``. The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        None. One line is printed per neighbour with its title, bookID and
        distance to the queried book.
    """
    query_rows = []
    # `is not None` so that row 0 is a valid request.
    if id is not None:
        query_rows.append(id)
    if query:
        query_rows.append(get_index_from_name(query))
    for row in query_rows:
        # Skip the first neighbour (presumably the queried row itself) and read
        # the distances from the same row and columns as the neighbour indices.
        for neighbor, dist in zip(indices[row][1:], distance[row][1:]):
            book = df1.iloc[neighbor]
            print(book["title"] ," [ID: {bid}] - [Distance: {dis}]".format(bid=book["bookID"],dis=dist))
            
# Returns a user score derived from the rating the user gave a book and the
# distance between that book and each of its neighbours; the smaller the user
# score, the more likely the user will enjoy the book.
def get_similar_userRated_books(query=None,id=None,user_rating=1):
    """Score the neighbours of a book as ``distance / user_rating``.

    Args:
        query: Exact title to look up via ``get_index_from_name``.
        id: Row position into ``indices`` / ``distance``. The name shadows the
            builtin but is kept because callers pass it by keyword.
        user_rating: Rating the user gave the queried book; used as divisor.

    Returns:
        List of ``(bookID, userScore)`` tuples, one per neighbour. The list is
        also printed, along with a line per neighbour.
    """
    userData = []
    query_rows = []
    # `is not None` so that row 0 is a valid request.
    if id is not None:
        query_rows.append(id)
    if query:
        query_rows.append(get_index_from_name(query))
    for row in query_rows:
        # Skip the first neighbour (presumably the queried row itself) and read
        # the distances from the same row and columns as the neighbour indices.
        for neighbor, dist in zip(indices[row][1:], distance[row][1:]):
            book = df1.iloc[neighbor]
            bookID = book["bookID"]
            userScore = dist / user_rating
            print(book["title"] ," [ID: {bid}] - [Distance: {dis}]".format(bid=bookID,dis=dist))
            print("id: {}  -  score: {}".format(bookID,userScore))
            userData.append((bookID,userScore))
    print(userData)
    return userData
            
def get_similar_userRated_books_from_list(userRateData):
    """Build a ranked recommendation list from several user-rated books.

    Args:
        userRateData: Iterable of ``(row, score)`` tuples, where ``row`` indexes
            ``indices`` / ``distance`` and ``score`` is the user's rating, used
            as divisor.

    Returns:
        The result of ``get_books_from_userScore`` for all neighbours of all
        rated books, ordered by ascending ``distance / score``.
    """
    userData = []
    for row, score in userRateData:
        # Skip the first neighbour (presumably the rated row itself) and read
        # the distances from the same row and columns as the neighbour indices.
        for neighbor, dist in zip(indices[row][1:], distance[row][1:]):
            userData.append((df1.iloc[neighbor]["bookID"], dist / score))
    # Sort once, after collecting everything; this also handles empty input.
    sortedData = sorted(userData,key=lambda tup:tup[1])
    return get_books_from_userScore(sortedData)

def get_books_from_userScore(userData):
    """Print and collect ``(bookID, title, score)`` for each ``(bookID, score)`` pair.

    Titles are looked up in ``df1`` by index label. The input order is kept, so
    an already sorted input yields an already sorted result.
    """
    def describe(entry):
        book_id, user_score = entry
        title = df1.loc[book_id].title
        print(book_id,":",title," - ",user_score)
        return (book_id, title, user_score)

    return [describe(entry) for entry in userData]
        

In [None]:
print_similar_books("Harry Potter and the Half-Blood Prince (Harry Potter  #6)")


In [None]:
get_similar_userRated_books(id=1,user_rating=3.5)

In [None]:
print_similar_books(id=5107)

In [None]:
get_id_from_partial_name("Percy J")

# Sample Method Calls for API

In [None]:
print_similar_books("Harry Potter and the Half-Blood Prince (Harry Potter  #6)")

In [None]:
# Mock (book id, score) tuples for one user; this is all the input needed to
# build the overall recommendation list for that user.
# NOTE(review): the first element of each tuple is used to index `indices`
# directly, i.e. as a row position. Confirm that this is what "book id" means here.
data = [(3,5),(2,3.5),(753,2),(87,4.2)]
userData =get_similar_userRated_books_from_list(data)