In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import accuracy
from sklearn.cluster import KMeans

In [150]:
books = pd.read_csv('Books.csv', dtype = 'object')
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [285]:
books.set_index('ISBN', inplace = True)
books.head()

Unnamed: 0_level_0,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [154]:
users = pd.read_csv('Users.csv', dtype={'User-ID': 'object', 'Location': 'object', 'Age': 'float64'})
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  object 
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.4+ MB


In [8]:
users = pd.read_csv('Users.csv')
users.head()

Unnamed: 0_level_0,Location,Age
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"nyc, new york, usa",
2,"stockton, california, usa",18.0
3,"moscow, yukon territory, russia",
4,"porto, v.n.gaia, portugal",17.0
5,"farnborough, hants, united kingdom",


In [151]:
ratings = pd.read_csv('Ratings.csv', dtype= {'User-ID': 'object', 'ISBN': 'object', 'Book-Rating': 'int64'})
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  object
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 26.3+ MB


In [152]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


#### Finding book by ISBN

In [14]:
def what_book(isbn):
    print(f"This book is '{books.loc[isbn, 'Book-Title']}' by {books.loc[isbn, 'Book-Author']}, published in {books.loc[isbn, 'Year-Of-Publication']} by {books.loc[isbn, 'Publisher']}.")
    return books.loc[isbn, 'Book-Title'], books.loc[isbn, 'Book-Author'], books.loc[isbn, 'Year-Of-Publication'], books.loc[isbn, 'Publisher']

In [13]:
what_book('0374157065')

This book is 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It' by Gina Bari Kolata, published in 1999 by Farrar Straus Giroux


('Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 'Gina Bari Kolata',
 1999,
 'Farrar Straus Giroux')

#### Finding n most favorites by user

In [34]:
ratings.groupby('User-ID')['ISBN'].count()

User-ID
2          1
7          1
8         18
9          3
10         2
          ..
278846     2
278849     4
278851    23
278852     1
278854     8
Name: ISBN, Length: 105283, dtype: int64

In [42]:
test = ratings[ratings['User-ID']==278851].sort_values('Book-Rating', ascending = False).head(3)
test = test.merge(books[['Book-Title', 'Book-Author']], right_index = True, left_on = 'ISBN')

In [43]:
test

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
9550,278851,1569661057,10,"Dallas Street Map Guide and Directory, 2000 Ed...",Mapsco
9548,278851,1558531025,8,Life's Little Instruction Book (Life's Little ...,H. Jackson Brown
9547,278851,914511211,8,1996-97 Texas Almanac and State Industrial Gui...,Dallas Morning News


In [56]:
def fav_books(user_id, n):
    try:
        n_most_fav = ratings[ratings['User-ID']==user_id].sort_values('Book-Rating', ascending = False).head(3)
        n_most_fav_books = n_most_fav.merge(books[['Book-Title', 'Book-Author']], right_index = True, left_on = 'ISBN')
    except:
        n_most_fav = None
        n_most_fav_books = None
    
    return n_most_fav_books

In [58]:
fav_books_test = fav_books(204622, 3)
fav_books_test

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author
844935,204622,671027360,10,Angels &amp; Demons,Dan Brown
844926,204622,385504209,10,The Da Vinci Code,Dan Brown
844955,204622,967560500,10,Natural Hormonal Enhancement,Rob Faigin


#### Converting ratings from a table to a Users x Books matrix

In [72]:
cnt_by_isbn = ratings.groupby('ISBN')['Book-Rating'].count().reset_index()
isbns = cnt_by_isbn[cnt_by_isbn['Book-Rating']> cnt_by_isbn['Book-Rating'].quantile(0.8)]['ISBN']

cnt_by_uid = ratings.groupby('User-ID')['Book-Rating'].count().reset_index()
users = cnt_by_uid[cnt_by_uid['Book-Rating']> cnt_by_uid['Book-Rating'].quantile(0.8)]['User-ID']

In [76]:
ratings[ratings['ISBN'].isin(isbns)]

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
2,276727,0446520802,0
6,276736,3257224281,8
8,276744,038550120X,7
10,276746,0425115801,0
...,...,...,...
1149771,276704,0743211383,7
1149772,276704,080410526X,0
1149776,276706,0679447156,0
1149777,276709,0515107662,10


In [82]:
ratings_pareto = ratings[(ratings['ISBN'].isin(isbns)) & (ratings['User-ID'].isin(users)) & (ratings['Book-Rating']>0)]

In [84]:
ratings_pareto.shape

(227333, 3)

In [85]:
ur_matrix = ratings_pareto.pivot_table(index = 'User-ID', columns = 'ISBN', values = 'Book-Rating')

In [86]:
ur_matrix.head()

ISBN,0 907 062 008,00000000,000000000,0000000000,00000000000,000000000000,0000000000000,0001055666,000200092,0002005018,...,9895550065,9895550138,9895550738,B00005W8DZ,B00009EF82,B0000AA9IZ,B0000E63CJ,B158991965,M79702002,O67174142X
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,5.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
53,,,,,,,,,,,...,,,,,,,,,,
69,,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,


#### Finding similar users

In [98]:
def distance_users(user1, user2):
    user1vec = ur_matrix[ur_matrix.index == user1]
    user2vec = ur_matrix[ur_matrix.index == user2]
    distance_den = ur_matrix.shape[1]
    distance_nom = ur_matrix.shape[1]
    for _ in range(ur_matrix.shape[1]):
        try:
            if user1vec[_] == user2vec[_]:
                distance_nom -= 1
            elif user1vec[_].isna() and user2vec[_].isna():
                distance_nom -= 1
            else:
                pass
        except:
            pass
    return distance_nom/distance_den

In [100]:
distance_users(255489, 204622)

1.0

In [122]:
ur_matrix_slim = ur_matrix.sample(frac = 0.1)

In [124]:
def find_n_similar(user, n):
    distances = {}
    for other_user in ur_matrix_slim.index: # switched from ur_matrix to ur_matrix.slim for testing because it took too long to run the function on the entire set of users
        if other_user == user:
            pass
        else:
            distances[other_user] = distance_users(user, other_user)
    distances_df = pd.DataFrame(data = distances.values(), index = distances.keys(), columns = ['dist']).sort_values(['dist'])    
            
    return list(distances_df.head(n).index)

In [155]:
find_n_similar(225433,10)

[211307, 208786, 262635, 26435, 46862, 215094, 131884, 203491, 89558, 4221]

## Finding books to recommend

### Collaborative Filtering Method: Nearest Neighbors

In [305]:
def find_m_recommendations(user, n, m):
    books_read = list(ur_matrix[ur_matrix.index == user].dropna(axis = 1).columns)
    #similar_users = find_n_similar(user, n)
    similar_users = [211307, 208786, 262635, 26435, 46862, 215094, 131884, 203491, 89558, 4221]
    avg_ratings = ur_matrix[ur_matrix.index.isin(similar_users)].aggregate(['mean']).drop(columns = books_read)
    avg_ratings_sorted = avg_ratings.T.sort_values('mean', ascending = False)
    top_m = avg_ratings_sorted.index[:m]
    m_books = books[books.index.isin(top_m)][['Book-Title', 'Book-Author']]
    return m_books

In [306]:
find_m_recommendations(225433, 3, 10)

Unnamed: 0_level_0,Book-Title,Book-Author
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
440241413,Confessions of a Shopaholic,SOPHIE KINSELLA
1558743669,"A Child Called \It\"": One Child's Courage to S...",Dave Pelzer
373250193,Loose Screws (Red Dress Ink (Paperback)),Karen Templeton
8495501465,Dios vuelve ne una Harley,Joan Brady
451146077,The Early Ayn Rand: A Selection from Her Unpub...,Ayn Rand
451158601,We the Living,Ayn Rand
452273331,The Fountainhead,Ayn Rand
425090825,Callahans Secret,Spider Robinson


### Collaborative Filtering Method: Latent Factors

In [173]:
from scipy.sparse import coo_matrix
from numpy.linalg import norm

In [159]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [160]:
ratings['User-ID'] = ratings['User-ID'].astype('category')
ratings['ISBN'] = ratings['ISBN'].astype('category')

In [166]:
ratings_mtrx = coo_matrix((ratings['Book-Rating'].astype(float), (ratings['User-ID'].cat.codes.copy(), ratings['ISBN'].cat.codes.copy())))

In [180]:
type(ratings_mtrx)

scipy.sparse.coo.coo_matrix

In [206]:
print(f"The sparse matrix has {ratings_mtrx.shape[0]} rows and {ratings_mtrx.shape[1]} columns, compared to the full matrix which would have had {ratings['User-ID'].nunique()} rows and {ratings['ISBN'].nunique()} columns.")

The sparse matrix has 105283 rows and 340556 columns, compared to the full matrix which would have had 105283 rows and 340556 columns.


In [171]:
num_usrs = ratings_mtrx.shape[0]
num_bks = ratings_mtrx.shape[1]
num_ftrs = 3

In [174]:
usrs_fctr_mtrx = np.random.rand(num_usrs, num_ftrs)
fctr_bks_mtrx = np.random.rand(num_ftrs, num_bks)

In [183]:
def est_error(ratings_matrix, users_factor_matrix, factor_books_mtrx, l = 0.05):
    ratings = ratings_matrix.data
    rrows = ratings_matrix.row
    rcols = ratings_matrix.col
    err = 0
    for uir in range(len(ratings)):
        rtg_ui = ratings[uir]
        usr = rrows[uir]
        bk = rcols[uir]
        if rtg_ui>0:
            err = err + (rtg_ui - np.dot(users_factor_matrix[usr,:], factor_books_mtrx[:,bk]))**2 + l * (norm(users_factor_matrix[usr,:])**2 + norm(factor_books_mtrx[:,bk])**2)
    
    return err

In [186]:
rmse = np.sqrt(est_error(ratings_mtrx, usrs_fctr_mtrx, fctr_bks_mtrx)/len(ratings_mtrx.data))

In [187]:
rmse

4.365920608994496

In [194]:
def min_error(ratings_mtrx, num_ftrs, tgt_err, l = 0.05, g = 0.001, steps = 2):
    num_usrs, num_bks = ratings_mtrx.shape
    ratings = ratings_mtrx.data
    rrows = ratings_mtrx.row
    rcols = ratings_mtrx.col
    usrs_fctr_mtrx = np.random.rand(num_usrs, num_ftrs)
    fctr_bks_mtrx = np.random.rand(num_ftrs, num_bks)
    rmse = np.sqrt(est_error(ratings_mtrx, usrs_fctr_mtrx, fctr_bks_mtrx)/len(ratings_mtrx.data))
    print(f"Initial RMSE = {round(rmse, 4)}")
    for step in range(steps):
        for uir in range(len(ratings)):
            rtg_ui = ratings[uir]
            usr = rrows[uir]
            bk = rcols[uir]
            if rtg_ui>0:
                err_ui = rtg_ui - np.dot(usrs_fctr_mtrx[usr,:], fctr_bks_mtrx[:,bk])
                usrs_fctr_mtrx[usr,:] = usrs_fctr_mtrx[usr,:]+ 2*g*(err_ui*fctr_bks_mtrx[:,bk] - l*usrs_fctr_mtrx[usr,:])
                fctr_bks_mtrx[:,bk] = fctr_bks_mtrx[:,bk] + 2*g*(err_ui*usrs_fctr_mtrx[usr,:] - l*fctr_bks_mtrx[:,bk])
        rmse = np.sqrt(est_error(ratings_mtrx, usrs_fctr_mtrx, fctr_bks_mtrx)/len(ratings_mtrx.data))
        if rmse <= tgt_err:
            break
    print(f"Final RMSE = {round(rmse,4)}")
    return usrs_fctr_mtrx, fctr_bks_mtrx

In [198]:
ufm, fbm = min_error(ratings_mtrx, 3, 0.7, steps = 100)

Initial RMSE = 4.367
Final RMSE = 0.9136


In [199]:
ufm

array([[1.43670295, 0.45039194, 0.5592094 ],
       [0.42469244, 0.15163147, 0.33649315],
       [0.81869506, 0.51661862, 0.23369282],
       ...,
       [1.53112819, 2.32346092, 2.07688938],
       [1.82917229, 1.78169822, 2.55172133],
       [0.99768944, 0.59352088, 0.83545333]])

In [200]:
fbm

array([[0.49312816, 0.1813821 , 0.98486762, ..., 0.11146441, 1.87779885,
        1.13909225],
       [1.14099152, 0.38985203, 0.76432055, ..., 0.3963618 , 0.76809651,
        1.13167044],
       [1.4709615 , 0.55657442, 0.72933628, ..., 0.31188919, 0.93482453,
        0.91595938]])

In [201]:
ufm.shape

(105283, 3)

In [203]:
fbm.shape

(3, 340556)

In [202]:
ratings_mtrx.shape

(105283, 340556)

In [303]:
def find_n_recommendations(user_id, n):
    row = list(ratings['User-ID'].unique()).index(user_id) # determining which row in the user-factor matrix corresponds to user_id
    read = list(ratings[(ratings['User-ID'] == user_id) & (ratings['Book-Rating']>0)]['ISBN']) # determining which books the user has read, so as to not recommend them again
    bk_rtngs = np.dot(ufm[row], fbm) # calculating user ratings for all books in the library
    bks = pd.DataFrame(index = ratings['ISBN'].unique(), columns = ['rating'], data = bk_rtngs.transpose()).sort_values(['rating'], ascending = False) # sorting books by rating
    unrd_bks = bks[~bks.index.isin(read)]
    unrd_bks = unrd_bks.merge(books[['Book-Title', 'Book-Author']], left_index = True, right_index = True).drop(['rating'], axis = 1) # pulling in the titles and authors to display
    return unrd_bks.head(n)

In [304]:
recs = find_n_recommendations('225433',10)
recs

Unnamed: 0,Book-Title,Book-Author
070641537X,Dictionary of Quotations and Proverbs Everyman E,D C Browning
0515063924,A Heart Is Broken,Barbara Cartland
082174206X,Stolen Fire (Heartfire Romance),Danette Chartier
0451204530,Code to Zero,Ken Follett
0843920572,Illegal Entry,Ralph Hayes
0440206405,What You Can Do About Diabetes (Dell Medical L...,Norra Tannenhaus
1576731057,Arabian Winds,Linda Chaikin
006091095X,"Stitches, patterns, and projects for crochetin...",Wanda Bonando
2253056014,Dedale,Collins
0743223438,"The Thurber Letters: The Wit, Wisdom and Surpr...",James Thurber


### Using Apriori Association Rules

***

In [138]:
rated = ur_matrix[ur_matrix.index.isin([211307, 208786, 262635])].aggregate(['mean']).drop(columns = ['0349113602', '057120175X'])

In [142]:
rated.T.sort_values('mean', ascending= False).index[:5]

Index(['0425090825', '0064400557', '014034991X', '0345377702', '0440428130'], dtype='object')

In [131]:
ur_matrix.

ISBN,0 907 062 008,00000000,000000000,0000000000,00000000000,000000000000,0000000000000,0001055666,000200092,0002005018,...,9895550065,9895550138,9895550738,B00005W8DZ,B00009EF82,B0000AA9IZ,B0000E63CJ,B158991965,M79702002,O67174142X
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,5.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
53,,,,,,,,,,,...,,,,,,,,,,
69,,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
