In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import find, csr_matrix
from pandas.api.types import CategoricalDtype

# Data Loading & Clearing

In [10]:
#Ideas

#How could user enter book: Enter part of title, then search in books dataframe with .title.str.contains(<input>), 
#then sort results by number of ratings to get the most rated book and hence the highest chance of good recommendations.

#Also, filter the outputs not to contain the same string which was given as input

In [11]:
ratings = pd.read_csv('BX-Book-Ratings.csv', sep = ';' , encoding='utf-8', on_bad_lines='warn', encoding_errors = 'ignore') #There are some suspicious ISBNs, we might loos them as they will not match with books on merging
ratings.rename(columns={'User-ID':'user'
                        , 'Book-Rating': 'rating'}, inplace=True)
ratings.shape
ratings.nunique()

(1149780, 3)

user      105283
ISBN      340553
rating        11
dtype: int64

In [12]:
# dtype_mapping = {
#     'Year-Of-Publication': float
# }
books = pd.read_csv('BX-Books.csv', sep = ';', encoding='utf-8', on_bad_lines='warn', encoding_errors = 'ignore')

#books.drop(books[books['Unnamed: 8'].notna()].index, axis = 0, inplace=True) #Dropping the shifted rows. There is 19 of them, not worth the manual correction I think.
#books.drop(['Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10'], axis = 1, inplace=True)
books.rename(columns={'Book-Title':'title'
                        , 'Book-Author': 'author'
                        , 'Year-Of-Publication': 'year'
                        , 'Publisher': 'publisher'}, inplace=True)
#books
books.nunique()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  exec(code_obj, self.user_global_ns, self.user_ns)


ISBN           271360
title          242135
author         102023
year              202
publisher       16807
Image-URL-S    271044
Image-URL-M    271044
Image-URL-L    271041
dtype: int64

In [13]:
users = pd.read_csv('BX-Users.csv', sep = ';' , encoding='utf-8', on_bad_lines='warn', encoding_errors = 'ignore')
users.rename(columns={'User-ID':'user'
                        , 'Location': 'location'
                        , 'Age': 'age'}, inplace=True)
print(users.shape)
users.nunique()

(278858, 3)


user        278858
location     57293
age            165
dtype: int64

In [14]:
ratings

Unnamed: 0,user,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [15]:
#how many ratings each book have?
explicit_rating_counts = ratings[ratings['rating'] > 0][['ISBN', 'rating']].groupby('ISBN', as_index=False).count().rename(columns={'rating':'explicit_ratings'})
implicit_rating_counts = ratings[ratings['rating'] == 0][['ISBN', 'rating']].groupby('ISBN', as_index=False).count().rename(columns={'rating':'implicit_ratings'})
#implicit_rating_counts

if 'explicit_ratings' not in books.columns:
    books = books.merge(explicit_rating_counts, on = 'ISBN', how='left')
    books = books.merge(implicit_rating_counts, on = 'ISBN', how='left')

In [18]:
#Clearing the years of publication
books.drop(books[books['year'] == 'Gallimard'].index, axis = 0, inplace=True)
books.drop(books[books['year'] == 'DK Publishing Inc'].index, axis = 0, inplace=True)
books['year'] = books['year'].astype(int, errors='ignore')
books.loc[books.year > 2023, 'year'] = 0 #setting impossible years to 0

#books.loc[books.year == 0, 'year'] = np.NAN ## <<------- this forces the column to be floats. So I will let the zeros stay for now, see how it will affect the rest.

In [19]:
books.year.unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984,    0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 1911,
       1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 1934, 1910, 1933,
       1902, 1924, 1921, 1900, 1944, 1917, 1901, 2010, 1908, 1906, 1935,
       1806, 2021, 2012, 2006, 1909, 2008, 1378, 1919, 1922, 1897, 1376])

In [20]:
#books['year'] = books['year'].astype(int, errors='ignore')
books['year'].median()

1995.0

In [21]:
#Adding a time period when the book was released, instead of a year
#The idea here is to add firslty the tens of the book release, and secondly the most adjecent tens as a secondary variable
#Hence e.g. for 1991 -> 1990 and 1980, and for 1998 -> 1990 and 2000
books['period1'] = (books.year // 10) * 10
books['period2'] = books['period1'] + ((((books.year - books.period1) >= 5)*2-1)*10)


In [51]:
##Inputs/argumets
# input_name = 'The Lord of the Rings' #Unclear input, when name of the series is given, instead of a name of a book
# input_name = 'The Fellowship of the Ring'
# input_name = 'The Eye of the World'
input_name = 'Great Expectations' 
# input_name = 'The Color of Magic'
# input_name = 'The Egyptian' #Here is a problem that title contains 'A Novel'. Giving more weight to author could help?
# input_name = 'Hotel New Hampshire'
# input_name = 'The Shining'
# input_name = 'For Whom the Bell Tolls'

input_book = (books[(books['title'].str.contains(input_name))
                .replace(np.NaN, False)]
                .sort_values(['explicit_ratings', 'implicit_ratings'], ascending = False).head(1)[['ISBN', 'title', 'author', 'year', 'publisher', 
                                                                                                   'explicit_ratings', 'implicit_ratings',
                                                                                                    'period1', 'period2']]
                )

# display(input_book)

input_index = input_book.index[0]
input_ISBN = input_book.ISBN.values[0]
input_title = input_book.title.values[0]
input_author = input_book.author.values[0]
              
choose_n = 5

# Bibliographic Similarity Selection

In [52]:
books['info'] = ( books.title + ' ' +
                 (books.author.fillna('') + ' ')*1 
                 + books.publisher.fillna('') + ' ' 
                 + books.year.astype(str) + ' ' 
                 + books.period1.astype(str) + ' ' 
                 + books.period2.astype(str))
#Here possibly add weight by multiplying the number of times the info appears?
#But if I will use cosine similarity, this way of adding weights is probably useless (?): https://www.pinecone.io/learn/vector-similarity/

In [53]:
books[books['info'].isna()] #empty df
for i in books['info'][0:3]:
    print(i)

Unnamed: 0,ISBN,title,author,year,publisher,Image-URL-S,Image-URL-M,Image-URL-L,explicit_ratings,implicit_ratings,period1,period2,info


Classical Mythology Mark P. O. Morford Oxford University Press 2002 2000 1990
Clara Callan Richard Bruce Wright HarperFlamingo Canada 2001 2000 1990
Decision in Normandy Carlo D'Este HarperPerennial 1991 1990 1980


In [54]:
vectorizer = CountVectorizer()
term_matrix = vectorizer.fit_transform(books['info'])

In [55]:
len(books)
term_matrix.shape
#print(term_matrix[0:5])

271357

(271357, 117284)

In [56]:
#similarities = cosine_similarity(term_matrix, dense_output=False)

In [57]:
#Computing similarities of all books takes too long. Perhaps I could select one book and compute similarity of it with the remaining.

In [58]:
row_to_compare = term_matrix[input_index, :]
similarities = cosine_similarity(row_to_compare, term_matrix)

In [59]:
similarities
similarities.shape

#print(np.sort(similarities)[:,-10:-1])
#print(similarities.sort())

indices_of_largest = np.argsort(similarities)[:, -(100+1):-1]
indices_of_largest = np.flip(indices_of_largest.flatten())
#indices_of_largest

array([[0.        , 0.        , 0.09622504, ..., 0.        , 0.        ,
        0.        ]])

(1, 271357)

In [75]:
similarity_selection = books.iloc[indices_of_largest.flatten()].copy()
similar_books = similarity_selection[~(similarity_selection.title.str.contains(input_name))].iloc[0:choose_n][['title', 'author', 'year', 'publisher', 'explicit_ratings', 'implicit_ratings']]

# Explicit Ratings Selection

In [61]:
explicit_ratings = ratings[ratings.rating != 0].copy()
implicit_ratings = ratings[ratings.rating == 0].copy()
implicit_ratings.rating =+ 1

In [62]:
explicit_ratings.rating.value_counts().sort_index() #users mostly rate 5 and higher
#Potentially it might be useful to roup ratings into e.g. 4 clases: 1 to 4, 5+6, 7+8, 9+10

1       1770
2       2759
3       5996
4       8904
5      50974
6      36924
7      76457
8     103736
9      67541
10     78610
Name: rating, dtype: int64

In [63]:
#Converting explicit ratings into pivotted form in a sparse matrix (pd.pivot() cannot produce sparse it seems)
#kudos to this post: https://stackoverflow.com/questions/31661604/efficiently-create-sparse-pivot-tables-in-pandas
ISBN_c = CategoricalDtype(explicit_ratings.ISBN.unique(), ordered=False)
user_c = CategoricalDtype(explicit_ratings.user.unique(), ordered=False)

row = explicit_ratings.ISBN.astype(ISBN_c).cat.codes
col = explicit_ratings.user.astype(user_c).cat.codes
explicit_sparse_matrix = csr_matrix((explicit_ratings["rating"], (row, col)), 
                           shape=(ISBN_c.categories.size, user_c.categories.size),
                           dtype=int)

explicit_sparse_matrix
expl_index = ISBN_c.categories.get_loc(input_ISBN)

<185972x77805 sparse matrix of type '<class 'numpy.intc'>'
	with 433671 stored elements in Compressed Sparse Row format>

## Explicit Ratings - Cosine similarity

In [64]:
#Nearest Neighbors search
model = NearestNeighbors(n_neighbors = choose_n+1, algorithm = 'auto', metric='cosine')
model.fit(explicit_sparse_matrix)
distances, suggestions = model.kneighbors(explicit_sparse_matrix[expl_index,:], return_distance = True)
#Euclidean distance here does not work, because it takes 0s into consideration, so it outputs random books
#Cosine distace works, but does not take into consideration the size of vector elements - the reviews themselves, so in the end it 
#is the same as implicit rating.
#I was also thinking about a custom distance metric based on euclidean, but dividing the distance by # of common non-zero obs., 
#but didnt finished the idea
#I will try alternative where I will keep only users (columns) that rated input_book

NearestNeighbors(metric='cosine', n_neighbors=6)

In [65]:
explicit_ISBN_suggestions = ISBN_c.categories[list(suggestions[0])]
#books[books.ISBN.isin(explicit_ISBN_suggestions)]

## Explicit Ratings - Euclidean distance on common

In [66]:
_, col_indices, _ = find(explicit_sparse_matrix[expl_index,:])

#Create new sparse matrix with the users that rated input book
subset_sparse_matrix = explicit_sparse_matrix[:, col_indices].copy()

In [67]:
subset_sparse_matrix[expl_index,:]

<1x14 sparse matrix of type '<class 'numpy.intc'>'
	with 14 stored elements in Compressed Sparse Row format>

In [68]:
model = NearestNeighbors(n_neighbors = choose_n+1, algorithm = 'auto', metric='cosine')
model.fit(subset_sparse_matrix)
distances, suggestions = model.kneighbors(subset_sparse_matrix[expl_index,:], return_distance = True)

NearestNeighbors(metric='cosine', n_neighbors=6)

In [69]:
explicit_ISBN_suggestions = ISBN_c.categories[list(suggestions[0])]
#books[books.ISBN.isin(explicit_ISBN_suggestions)]
explicit_recommendations = books[(books.ISBN.isin(explicit_ISBN_suggestions))
                              & (~(books.title.str.contains(input_name)))][['title', 'author', 'year', 'publisher', 'explicit_ratings', 'implicit_ratings']]

# Implicit Ratings Selection

In [70]:
ISBN_c = CategoricalDtype(implicit_ratings.ISBN.unique(), ordered=False)
user_c = CategoricalDtype(implicit_ratings.user.unique(), ordered=False)

row = implicit_ratings.ISBN.astype(ISBN_c).cat.codes
col = implicit_ratings.user.astype(user_c).cat.codes
implicit_sparse_matrix = csr_matrix((implicit_ratings["rating"], (row, col)), 
                           shape=(ISBN_c.categories.size, user_c.categories.size),
                           dtype=int)

implicit_sparse_matrix
impl_index = ISBN_c.categories.get_loc(input_ISBN)

<246724x59517 sparse matrix of type '<class 'numpy.intc'>'
	with 716109 stored elements in Compressed Sparse Row format>

In [71]:
model = NearestNeighbors(n_neighbors = choose_n+1, algorithm = 'auto', metric='cosine')
model.fit(implicit_sparse_matrix)
distances, suggestions = model.kneighbors(implicit_sparse_matrix[impl_index,:], return_distance = True)

NearestNeighbors(metric='cosine', n_neighbors=6)

In [72]:
implicit_ISBN_suggestions = ISBN_c.categories[list(suggestions[0])]
implicit_recommendations = books[(books.ISBN.isin(implicit_ISBN_suggestions))
                              & (~(books.title.str.contains(input_name)))][['title', 'author', 'year', 'publisher', 'explicit_ratings', 'implicit_ratings']]

# Recommendation

In [76]:
print(f'Selected book: {input_title} by {input_author}')

print('Similar books')
similar_books

print('Other readers also liked:')
explicit_recommendations

print('Others were also interested in:')
implicit_recommendations

Selected book: Great Expectations (Bantam Classic) by Charles Dickens
Similar books


Unnamed: 0,title,author,year,publisher,explicit_ratings,implicit_ratings
74069,Oliver Twist (Bantam Classics),Charles Dickens,1982,Bantam,3.0,11.0
140163,Nicholas Nickleby,Charles Dickens,1982,Bantam Books,,6.0
75858,Robinson Crusoe (Bantam Classic),Daniel Defoe,1982,Bantam,3.0,4.0
105926,A Christmas Carol (Bantam Classic),CHARLES DICKENS,1986,Bantam,7.0,6.0
29976,David Copperfield (Bantam Classics),Charles Dickens,1981,Bantam,2.0,7.0


Other readers also liked:


Unnamed: 0,title,author,year,publisher,explicit_ratings,implicit_ratings
1312,Anne Frank: The Diary of a Young Girl,ANNE FRANK,1993,Bantam,77.0,62.0
12700,Reservation Blues,Sherman Alexie,1996,Warner Books,17.0,11.0
19759,Rose,Martin Cruz Smith,1997,Ballantine Books,7.0,10.0
34768,Undaunted Courage: Meriwether Lewis Thomas Jef...,Stephen Ambrose,1997,Simon &amp; Schuster,16.0,16.0
55011,More Than You Know: A Novel,Beth Gutcheon,2001,Perennial,14.0,20.0


Others were also interested in:


Unnamed: 0,title,author,year,publisher,explicit_ratings,implicit_ratings
36518,Life Sixty Years: A 60th Anniversary Celebrati...,The Editors of Life,1996,Little Brown &amp; Co,2.0,3.0
85912,"Cathedral, Forge, and Waterwheel: Technology a...",Frances Gies,1994,Harpercollins,,3.0
89116,Crimson Stain,Jim Fisher,2000,Berkley Publishing Group,1.0,3.0
89138,The Black Violin : A Novel,Maxence Fermine,2003,Atria Books,2.0,2.0
89563,Hard Times: For These Times,Charles Dickens,1997,Signet Classics,1.0,2.0
