In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
books_info = pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX_Books.csv",sep= ';', encoding= 'latin-1')
book_ratings =pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX-Book-Ratings.csv",sep= ';', encoding= 'latin-1')
users = pd.read_csv("/kaggle/input/bookcrossing-dataset/Book reviews/Book reviews/BX-Users.csv",sep= ';', encoding= 'latin-1')

In [3]:
books_info.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


## Collabrative Filtering

<p>Collaborative Filtering recommends items based on similarity measures between users and/or items. 
    The basic assumption behind the algorithm is that users with similar interests have common preferences.
    This Method classify users into clusters of similar types and recommend each user according to the preference of its cluster<p>

Types of Collabrative Filtering:-<br>
    
1. Memory Based :- Finds similar users based on cosine Similiarity or Pearson Correlation and take weighted average of ratings. 
    1. User Based Filtering:-  recommends items for Users based on Similar Users
    2. Item Based Filtering:-  recommends items based on user's ratings
2. Model Based :- Use ML alogrithms to find the ratings of unrated items

In [6]:
## feature selection
''' merge books_info and book_ratings on ISBN'''
final_df = pd.merge(books_info,book_ratings,how='inner',left_on='ISBN',right_on='ISBN')[['Book-Title','Book-Rating','User-ID']]

In [7]:
final_df.shape

(1031175, 3)

In [8]:
print('No of Different Books:',len(final_df['Book-Title'].value_counts()))
print('No of Different Users:',len(final_df['User-ID'].value_counts()))

No of Different Books: 241090
No of Different Users: 92107


**Matrix size is going to be 92107 * 241090**

## Item-Based Filtering Using Nearest Neighbours


In [9]:
'''
Considering Books which got rated by atleast 80 diferent users
'''
books_rated = pd.DataFrame(final_df.groupby('Book-Title')['Book-Rating'].count()).rename(columns={'Book-Rating':'Total_Count'})
books_rated.reset_index(level = 0,inplace = True)
final = pd.merge(final_df,books_rated,how='inner',left_on='Book-Title',right_on='Book-Title')
final = final[final['Total_Count']>80]
final.shape

(214251, 4)

In [10]:
## Data preparation for Collabrative Filtering
'''
 Prepare a matrix
  rows:- Users
  columns:- Books
  values :- ratings
'''

matrix = pd.pivot_table(data=final,index='Book-Title',columns='User-ID',values='Book-Rating')
matrix.head()

User-ID,8,9,14,16,17,26,32,39,42,44,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,,,,,,,,,,...,,,,,,,,,,
24 Hours,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,,,,,,,,,,...,,,,,,,,,,


In [11]:
'''
fill Null values with 0
'''
matrix.fillna(0,inplace=True)
matrix.head()

User-ID,8,9,14,16,17,26,32,39,42,44,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from scipy.sparse import csr_matrix
csr_data = csr_matrix(matrix.values)
matrix.reset_index(inplace=True)

In [13]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
knn.fit(csr_data)

In [14]:
def recommend(book_name):
    book_list = matrix[matrix['Book-Title'] == book_name]
    if len(book_list) == 0:
        return 'No Matching books found'
        
    book_idx = book_list.index[0]
    distances , indices = knn.kneighbors(csr_data[book_idx],n_neighbors=10)
    
    book_names = []    
    for i in indices[0]:
        name = matrix.iloc[i]['Book-Title']
        book_names.append(name)
    #data = [book_names,distances[0]]
    k= pd.DataFrame( {"Movies":book_names,"Liking_to_view":distances[0]})
    return k

recommend('1984')

Unnamed: 0,Movies,Liking_to_view
0,1984,0.0
1,Animal Farm,0.849867
2,Brave New World,0.87737
3,American Psycho (Vintage Contemporaries),0.911973
4,Slaughterhouse Five or the Children's Crusade:...,0.916112
5,Lord of the Flies,0.916221
6,"The Vampire Lestat (Vampire Chronicles, Book II)",0.918295
7,The Catcher in the Rye,0.918817
8,"The Drawing of the Three (The Dark Tower, Book 2)",0.927892
9,The Hitchhiker's Guide to the Galaxy,0.928441


## User based Filtering using Cosine Similiarity

In [15]:
'''
Considering users who rated more than 1000 Books
'''
users_rated = pd.DataFrame(final_df.groupby('User-ID')['Book-Rating'].count()).rename(columns={'Book-Rating':'Total_Count'})
users_rated.reset_index(level = 0,inplace = True)
final = pd.merge(final_df,users_rated,how='inner',left_on='User-ID',right_on='User-ID')
final = final[final['Total_Count']>100]
final.shape

(591914, 4)

In [16]:
user_matrix = pd.pivot_table(data=final,values='Book-Rating',index='User-ID',columns='Book-Title')
user_matrix.fillna(0,inplace=True)
user_matrix.head()

Book-Title,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",Always Have Popsicles,Apple Magic (The Collector's series),Beyond IBM: Leadership Marketing and Finance for the 1990s,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),Dark Justice,Deceived,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",Final Fantasy Anthology: Official Strategy Guide (Brady Games),Flight of Fancy: American Heiresses (Zebra Ballad Romance),...,Ã?ngeles fugaces (Falling Angels),Ã?Â?. Kolumnen.,Ã?Â?ber das Fernsehen.,Ã?Â?ber die Freiheit.,Ã?Â?ber die Pflicht zum Ungehorsam gegen den Staat.,Ã?Â?berraschung am Valentinstag.,Ã?Â?lpiraten.,Ã?Â?rger mit Produkt X. Roman.,Ã?Â?stlich der Berge.,Ã?Â?thique en toc
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# User Similarity Matrix using Cosine similarity as a similarity measure between Users
user_similarity = cosine_similarity(user_matrix)
user_similarity[np.isnan(user_similarity)] = 0
print(user_similarity)
print(user_similarity.shape)

[[1.         0.06041905 0.         ... 0.         0.         0.        ]
 [0.06041905 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
(1648, 1648)


In [18]:
user_predicted_ratings = np.dot(user_similarity, user_matrix)
# user_predicted_ratings

In [19]:
final_similarity_matrix = np.multiply(user_predicted_ratings,user_matrix)

In [20]:
## Books for for user no 256
final_similarity_matrix.iloc[256].sort_values(ascending=False)[0:9]

Book-Title
Five Pennies: A Prairie Boy's Story                             100.000000
The Christmas Cross                                             100.000000
Basic Black: The Wit and Whimsy of Arthur Black                  81.794500
For the record, Canada's greatest women athletes                 81.000000
War and Peace                                                    67.529193
The Moneychangers                                                66.426815
Feel the Fear and Do It Anyway                                   65.783138
Complete Illustrated Lewis Carroll ((Wordsworth Collection))     65.105106
FINAL DIAGNOSIS, THE                                             65.022676
Name: 44089, dtype: float64