In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)
        
    

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
# Load the ratings table (semicolon-separated, latin-1 encoded).
# `error_bad_lines` was removed from pandas.read_csv in pandas 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows and
# emit a warning for each one) on current pandas versions.
book_ratings = pd.read_csv(
    "/kaggle/input/books-recommendation-system/BX-Book-Ratings.csv",
    sep=';',
    on_bad_lines='warn',
    encoding="latin-1",
)
# Replace the header names from the file with the names used by the rest of the notebook.
book_ratings.columns = ['userID', 'ISBN', 'bookRating']
book_ratings.head()

In [10]:
# Load the book metadata table (semicolon-separated, latin-1 encoded).
# `error_bad_lines` was removed from pandas.read_csv in pandas 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows and
# emit a warning for each one) on current pandas versions.
books = pd.read_csv(
    "/kaggle/input/books-recommendation-system/BX-Books.csv",
    sep=';',
    on_bad_lines='warn',
    encoding="latin-1",
    low_memory=False,
)
# Replace the header names from the file with the names used by the rest of the notebook.
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
books.head()

In [11]:
# Load the users table (semicolon-separated, latin-1 encoded).
# `error_bad_lines` was removed from pandas.read_csv in pandas 2.0;
# `on_bad_lines='warn'` keeps the same behaviour (skip malformed rows and
# emit a warning for each one) on current pandas versions.
users = pd.read_csv(
    "/kaggle/input/books-recommendation-system/BX-Users.csv",
    sep=';',
    on_bad_lines='warn',
    encoding="latin-1",
)
# Replace the header names from the file with the names used by the rest of the notebook.
users.columns = ['userID', 'Location', 'Age']
users.head()

In [12]:
# Bar chart: how many times each rating value occurs in book_ratings.
plt.rc("font", size=15)
# value_counts(sort=False) only disables sorting by frequency; it does not
# order the result by rating value. sort_index() puts the bars in ascending
# rating order so the x-axis reads as a proper scale.
rating_counts = book_ratings.bookRating.value_counts(sort=False).sort_index()
rating_counts.plot(kind='bar')
plt.title('rating distribution\n')
plt.xlabel('rating')
plt.ylabel('count')
plt.show()

In [13]:
# (rows, columns) of the books table, then its column names on the next line.
print(books.shape, list(books.columns), sep='\n')

In [14]:
# (rows, columns) of the users table, then its column names on the next line.
print(users.shape, list(users.columns), sep='\n')

In [15]:
# Histogram of user ages with hand-picked bin edges: decades up to 50, then
# a single wide 50-100 bin. Ages outside 0-100 fall outside every bin.
age_bin_edges = [0, 10, 20, 30, 40, 50, 100]
age_axes = users['Age'].hist(bins=age_bin_edges)
age_axes.set_title('Age Distribution\n')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Count')
plt.show()

In [17]:
# Step 1: keep only ratings made by users who have at least 200 ratings.
counts1 = book_ratings['userID'].value_counts()
frequent_user_ids = counts1.loc[counts1 >= 200].index
keep_user_rows = book_ratings['userID'].isin(frequent_user_ids)
book_ratings = book_ratings.loc[keep_user_rows]

# Step 2: on the already-reduced table, keep only rating values that occur
# at least 100 times.
counts = book_ratings['bookRating'].value_counts()
common_rating_values = counts.loc[counts >= 100].index
keep_rating_rows = book_ratings['bookRating'].isin(common_rating_values)
book_ratings = book_ratings.loc[keep_rating_rows]

In [18]:
# Join every rating with its book's metadata on ISBN (default inner join, so
# ratings whose ISBN is absent from `books` are dropped), then discard the
# publication, author and image-URL columns.
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = book_ratings.merge(books, on='ISBN').drop(columns=columns)
print(combine_book_rating.head())

In [19]:
# Remove rows whose title is missing (NaN) so every remaining rating can be
# grouped by title.
combine_book_rating = combine_book_rating.dropna(subset=['bookTitle'])

# Number of (non-null) ratings each title received; this count feeds the
# IMDB-style weighted score computed later in the notebook.
book_ratingCount = (
    combine_book_rating
    .groupby('bookTitle')['bookRating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
print(book_ratingCount.head())

In [20]:
# Attach each title's total rating count to every one of its rating rows.
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, on='bookTitle', how='left')
print(rating_with_totalRatingCount.head())

# From this point on, pandas displays floats with three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)
print(book_ratingCount['totalRatingCount'].describe())

In [21]:
# C: mean of all individual ratings in the merged table; it is the default
# value of the `C` parameter of weighted_rating.
C = rating_with_totalRatingCount.bookRating.mean()
C

In [22]:
# m: 90th percentile of totalRatingCount, computed over rating rows (a title
# with k ratings contributes k rows); it is the default value of the `m`
# parameter of weighted_rating and the cut-off used to pick qualifying rows.
m = rating_with_totalRatingCount.totalRatingCount.quantile(q=0.9)
m

In [23]:
# Keep only the rating rows whose title has at least `m` ratings.
# The copy is taken AFTER filtering: copying first would duplicate the whole
# table only to discard it, and the filtered result would still be flagged as
# a slice of another frame, so adding a column to it later would raise
# SettingWithCopyWarning. Copying the filtered rows yields an independent frame.
qualify_books = rating_with_totalRatingCount.loc[
    rating_with_totalRatingCount['totalRatingCount'] >= m
].copy()
qualify_books.shape

In [24]:
def weighted_rating(x, m=m, C=C):
    """Blend a rating with the global mean, weighted by how many ratings exist.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is
    ``x['totalRatingCount']`` and ``R`` is ``x['bookRating']``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'totalRatingCount'`` and ``'bookRating'``.
    m : number, optional
        Weight given to the prior ``C``; defaults to the module-level ``m``
        as it was when this function was defined.
    C : number, optional
        Prior rating; defaults to the module-level ``C`` as it was when this
        function was defined.

    Returns
    -------
    The weighted score, of whatever numeric type the arithmetic produces.

    NOTE(review): ``R`` is whatever ``x['bookRating']`` holds; when called per
    rating row that is one user's rating rather than a per-title average.
    Confirm that this is intended.
    """
    vote_count = x['totalRatingCount']
    rating = x['bookRating']
    total_weight = vote_count + m
    rating_part = vote_count / total_weight * rating
    prior_part = m / total_weight * C
    return rating_part + prior_part

In [25]:
# weighted_rating only performs element-wise arithmetic on
# x['totalRatingCount'] and x['bookRating'], so it can be given the whole
# frame at once. That produces the same values as a row-by-row
# apply(axis=1) without a Python-level call per row.
# .assign returns a new frame, so no column is written into a possibly
# sliced frame (avoids SettingWithCopyWarning).
qualify_books = qualify_books.assign(score=weighted_rating(qualify_books))
print(qualify_books.head())

In [26]:
# Order the qualifying rows from highest to lowest weighted score.
qualify_books = qualify_books.sort_values(by='score', ascending=False)

In [27]:
# Show the five top-scoring rows.
print(qualify_books.head(n=5))

In [28]:
# Minimum number of ratings a title needs to count as "popular".
popularity_limit = 50
is_popular = qualify_books['totalRatingCount'] >= popularity_limit
rating_popular_book = qualify_books[is_popular]
print(rating_popular_book.head())

In [29]:
# Attach user details (Location, Age) to every popular-book rating. This is a
# left join, so ratings from users missing in `users` are kept with NaN details.
combined = rating_popular_book.merge(users, on='userID', how='left')

# Keep ratings whose Location text matches one of the selected countries
# (regex alternation, case-sensitive substring match).
# na=False treats a missing Location as "no match"; without it the mask
# contains NaN and boolean indexing raises.
country_pattern = "mexico|germany|usa|russia|australia|portugal|italy|france|netherlands"
in_top_countries = combined['Location'].str.contains(country_pattern, na=False)

# DataFrame.drop returns a new frame, so its result must be assigned for the
# Age column to actually be removed.
top_countries_user_rating = combined.loc[in_top_countries].drop(columns='Age')
print(top_countries_user_rating.head())

In [30]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# A pivot needs one value per (bookTitle, userID) pair, so keep only the
# first rating for each user/title combination.
top_countries_user_rating = top_countries_user_rating.drop_duplicates(subset=['userID', 'bookTitle'])

# Title x user rating table; a pair with no rating is filled with 0.
top_countries_user_rating_pivot = (
    top_countries_user_rating
    .pivot(index='bookTitle', columns='userID', values='bookRating')
    .fillna(0)
)
top_countries_user_rating_matrix = csr_matrix(top_countries_user_rating_pivot.to_numpy())

# Brute-force nearest-neighbour search over title rows using cosine distance.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(top_countries_user_rating_matrix)
print(model_knn)

In [31]:
# Pick a random title (row position in the pivot table) to get recommendations for.
query_index = np.random.choice(top_countries_user_rating_pivot.shape[0])
print(query_index)

# The chosen title's ratings as a single-row 2-D array, the shape kneighbors expects.
query_vector = top_countries_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
print(query_vector)

# Six nearest rows to the query vector.
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)
top_countries_user_rating_pivot.index[query_index]

In [32]:
# Print the query title followed by its nearest neighbours.
# The query row is not guaranteed to be the first neighbour returned: when
# several rows are tied for the smallest distance (e.g. identical or all-zero
# rating rows) it can appear at any position. Blindly skipping position 0
# could therefore hide a real recommendation and list the query book itself,
# so the query is filtered out by its index instead.
neighbor_indices = indices.flatten()
neighbor_distances = distances.flatten()

# At most (number of neighbours - 1) recommendations, the same count the
# position-based skip would print.
max_recommendations = max(len(neighbor_indices) - 1, 0)
recommendations = [
    (neighbor_index, neighbor_distance)
    for neighbor_index, neighbor_distance in zip(neighbor_indices, neighbor_distances)
    if neighbor_index != query_index
][:max_recommendations]

if len(neighbor_indices) > 0:
    print('Recommendations for {0}:\n'.format(top_countries_user_rating_pivot.index[query_index]))
for rank, (neighbor_index, neighbor_distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, top_countries_user_rating_pivot.index[neighbor_index], neighbor_distance))