In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
user_df = pd.read_csv('BX-Users.csv',encoding='latin-1')

In [4]:
user_df

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278854,278854,"portland, oregon, usa",
278855,278855,"tacoma, washington, united kingdom",50.0
278856,278856,"brampton, ontario, canada",
278857,278857,"knoxville, tennessee, usa",


In [5]:
user_df.shape

(278859, 3)

In [6]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278859 non-null  object 
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.4+ MB


In [7]:
books_df =  pd.read_csv('BX-Books.csv',encoding='latin-1')

In [8]:
books_df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [9]:
books_df.shape

(271379, 5)

# Clean up NaN values

In [10]:
user_df.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

In [11]:
books_df.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

In [12]:
user_df= user_df.dropna(axis=0)

In [13]:
user_df.isnull().sum()

user_id     0
Location    0
Age         0
dtype: int64

# Read the data where ratings are given by users

In [14]:
books_rating_df = pd.read_csv('BX-Book-Ratings.csv',encoding='latin-1',nrows=10000)

In [15]:
books_rating_df

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6
...,...,...,...
9995,243,425164403,0
9996,243,440224764,0
9997,243,440225701,0
9998,243,440226430,0


In [16]:
books_rating_df.shape

(10000, 3)

In [17]:
books_rating_df.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

Merging two datasets

In [18]:
df = pd.merge(books_df,books_rating_df, on='isbn')

In [19]:
df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,8,0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,8,0
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,8,0


In [20]:
df.shape

(8701, 7)

In [21]:
df.tail()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
8696,767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001,Broadway Books,278851,5
8697,884159221,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985,Lone Star Books,278851,7
8698,912333022,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997,Kqed Books,278851,7
8699,1569661057,"Dallas Street Map Guide and Directory, 2000 Ed...",Mapsco,1999,American Map Corporation,278851,10
8700,345251547,Mister God This Is Anna,Fynn,1976,Ballantine Books,277187,0


# Take a quick look at the number of unique users and books

In [22]:
users = df['user_id'].unique()

In [23]:
len(users)

828

In [24]:
# number of unique users
print('Number of unique user {}'.format(len(users)))

Number of unique user 828


In [25]:
books = df['isbn'].unique()
len(books)

8051

In [26]:
# number of unique books
print(f'number of uniqe books: {len(books)}')

number of uniqe books: 8051


# Convert the user_id variable to numeric numbers in the correct order

In [27]:
# convert user_id into numeric order
list_userid = df.user_id.unique()
print('Number of unique user {}'.format(len(list_userid)))

Number of unique user 828


In [28]:
list_userid

array([     2,      8, 277478, 278144,      9,    243, 277711, 278418,
           10,     12,     14, 276925, 276939, 276954, 277042, 277157,
       277378, 277427, 277530, 277932, 278137, 278176, 278373, 278514,
          193,    242, 277965,     16,     17,     19,     20,     22,
       277195, 277641,     23, 276953, 277743,     26, 277722, 277744,
           32,     36,     39,     42, 278390,     44, 278188, 277168,
       277922, 278333,     51,     53,     56,     64,     67, 278535,
       277689, 278221, 278843,     68,     69,     70,     73,     75,
       278851,     77,     78,     79,     81,     82,     83,     85,
           86,     87,     88, 277698,     91,     92, 277928,     95,
          125,     97, 278202,     99, 276755, 277647,    102,    107,
          109,    110,    114, 278153, 278561,    165, 277439, 278254,
       278422,    129,    132,    133,    135, 277187,    139,    141,
       278342,    144,    151,    160,    162, 277315, 278692,    169,
      

In [29]:
#ind = np.where(df.index==8696)[0]
#type(ind)

In [35]:
def user_id_num(user_id):
    itemindex = np.where(list_userid==user_id)
    return itemindex[0][0]

In [36]:
list_isbn = df.isbn.unique()

In [37]:
print('length of isbn unique values: {}'.format(len(list_isbn)))

length of isbn unique values: 8051


In [38]:
def isbn_num(isbn):
    itemindex = np.where(list_isbn==isbn)
    return itemindex[0][0]

# Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

In [39]:
df['user_id_order'] = df['user_id'].apply(user_id_num)

In [40]:
df['isbn_order'] = df['isbn'].apply(isbn_num)

In [41]:
df

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating,user_id_order,isbn_order
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,1,1
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,8,0,1,2
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,8,0,1,3
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,8,0,1,4
...,...,...,...,...,...,...,...,...,...
8696,767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001,Broadway Books,278851,5,64,8046
8697,884159221,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985,Lone Star Books,278851,7,64,8047
8698,912333022,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997,Kqed Books,278851,7,64,8048
8699,1569661057,"Dallas Street Map Guide and Directory, 2000 Ed...",Mapsco,1999,American Map Corporation,278851,10,64,8049


# Re-index the columns to build a matrix

In [42]:
ordered_col = ['user_id_order', 'isbn_order', 'rating', 'book_title', 'book_author', 'year_of_publication','publisher',
               'user_id', 'isbn']
df = df.reindex(columns=ordered_col)

In [43]:
df.head()

Unnamed: 0,user_id_order,isbn_order,rating,book_title,book_author,year_of_publication,publisher,user_id,isbn
0,0,0,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,195153448
1,1,1,5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,2005018
2,1,2,0,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,8,60973129
3,1,3,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,8,374157065
4,1,4,0,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,8,393045218


# Split your data into two sets (training and testing)

In [44]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df,test_size=0.30,random_state=10)

In [45]:
train.shape

(6090, 9)

In [46]:
test.head()

Unnamed: 0,user_id_order,isbn_order,rating,book_title,book_author,year_of_publication,publisher,user_id,isbn
7794,7,7145,0,The Iceberg and It's Shadow,Jan Greenberg,1989,Farrar Straus &amp; Giroux,278418,374435502
5451,7,4822,0,Zen and the City of Angels,Elizabeth M. Cosin,1999,St. Martin's Press,278418,312206119
8288,7,7638,0,Microsoft Mastering - Ecommerce Development: B...,Microsoft Corporation,1999,Microsoft Pr,278418,735608911
2853,396,2303,9,A House for Mr. Biswas,V. S. Naipaul,2001,Vintage Books USA,277527,375707166
5348,7,4719,0,Batman Returns: The Novelization,Craig Shaw Gardner,1992,Warner Books,278418,446363030


In [47]:
test.shape

(2611, 9)

# Make predictions based on user and item variables

In [53]:
train_matrix = np.zeros((len(users), len(books)))
for line in train.itertuples():
    train_matrix[line[1]-1, line[2]-1]=line[3]

test_matrix = np.zeros((len(users), len(books)))
for line in test.itertuples():
    test_matrix[line[1]-1, line[2]-1]=line[3]

In [108]:
from sklearn.metrics.pairwise import pairwise_distances
user_correlation = pairwise_distances(train_matrix,metric='cosine')
item_correlation = pairwise_distances(train_matrix.T,metric='cosine')

In [110]:
item_correlation.shape

(8051, 8051)

In [111]:
user_correlation

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [112]:
train_matrix.shape

(828, 8051)

In [113]:
def predict(ratings,correlation,type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        rating_diff = (ratings - mean_user_rating[:,np.newaxis])
        pred = mean_user_rating[:,np.newaxis]+correlation.dot(rating_diff)/np.array([np.abs(correlation).sum(axis=1)]).T
    elif type=='item':
        pred = ratings.dot(correlation) / np.array([np.abs(correlation).sum(axis=1)])
    return pred

In [114]:
user_pred = predict(train_matrix,user_correlation,type='user')

In [115]:
user_pred.shape

(828, 8051)

In [116]:
item_pred = predict(train_matrix,item_correlation.T,type = 'item')

In [117]:
item_pred.T.shape

(8051, 828)

# Use RMSE to evaluate the predictions

In [118]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [119]:
print('user-based CF RMSE' + str(rmse(user_pred,test_matrix)))

user-based CF RMSE7.72185541212063


In [120]:
print('user-based CF RMSE' + str(rmse(item_pred,test_matrix)))

user-based CF RMSE7.721162031094573
