In [2]:
# Import necessary packages
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
# Read the cleaned ratings table directly from the project's GitHub repository
# and preview the first two rows.
reduced_books_users_ratings_locations = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/GoldbergData/Machine-Learning-Book-Ratings/master/data/clean/reduced_books_users_ratings_locations.csv"
)
reduced_books_users_ratings_locations.head(n=2)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,age,city,state,country
0,16877,038550120X,9,A Painted House,JOHN GRISHAM,2001,Doubleday,038550120X,37.0,houston,arkansas,usa
1,16877,034539657X,7,Dark Rivers of the Heart,Dean R. Koontz,1995,Ballantine Books,034539657X,37.0,houston,arkansas,usa


### Part I: kNN Recommender Algorithm - Books 

#### Combine user_id, unique_isbn, book_rating and book_title into one dataset

In [4]:
# Preview the loaded table (head() shows five rows by default).
# No columns are removed here; the unused ones are pruned after the merge further down.
reduced_books_users_ratings_locations.head()

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,age,city,state,country
0,16877,038550120X,9,A Painted House,JOHN GRISHAM,2001,Doubleday,038550120X,37.0,houston,arkansas,usa
1,16877,034539657X,7,Dark Rivers of the Heart,Dean R. Koontz,1995,Ballantine Books,034539657X,37.0,houston,arkansas,usa
2,16877,743211383,3,Dreamcatcher,Stephen King,2001,Scribner,61083259,37.0,houston,arkansas,usa
3,16877,786868716,10,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,786868716,37.0,houston,arkansas,usa
4,16877,440159016,10,Motherhood: The Second Oldest Profession,Erma Bombeck,1987,Dell,70064547,37.0,houston,arkansas,usa


#### Create a column for Ratings count and group by Book Title

In [6]:
# Count the non-null ratings each title received and expose the result as a
# two-column frame: book_title, total_rating_count.
book_total_rating = (
    reduced_books_users_ratings_locations
    .groupby('book_title')['book_rating']
    .count()
    .rename('total_rating_count')
    .reset_index()
)
book_total_rating.head()

Unnamed: 0,book_title,total_rating_count
0,'Salem's Lot,19
1,10 Lb. Penalty,15
2,100 Selected Poems by E. E. Cummings,6
3,101 Dalmatians,9
4,11-Sep,13


#### Combine both datasets

In [11]:
# Left-join the per-title totals onto every individual rating row, so each
# rating carries the popularity of its book.
book_rating_count_user = pd.merge(
    reduced_books_users_ratings_locations,
    book_total_rating,
    on='book_title',
    how='left',
)
book_rating_count_user.head()

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,age,city,state,country,total_rating_count
0,16877,038550120X,9,A Painted House,JOHN GRISHAM,2001,Doubleday,038550120X,37.0,houston,arkansas,usa,237
1,16877,034539657X,7,Dark Rivers of the Heart,Dean R. Koontz,1995,Ballantine Books,034539657X,37.0,houston,arkansas,usa,32
2,16877,743211383,3,Dreamcatcher,Stephen King,2001,Scribner,61083259,37.0,houston,arkansas,usa,150
3,16877,786868716,10,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,786868716,37.0,houston,arkansas,usa,168
4,16877,440159016,10,Motherhood: The Second Oldest Profession,Erma Bombeck,1987,Dell,70064547,37.0,houston,arkansas,usa,6


In [12]:
# Keep only the columns the recommender uses. The wanted columns are selected
# explicitly instead of dropping the unwanted ones from the same variable:
# a drop-based cell raises KeyError when it is run a second time (the columns
# are already gone), whereas this selection can be re-run safely and yields
# the same five-column frame.
book_rating_count_user = book_rating_count_user[
    ['user_id', 'book_rating', 'book_title', 'unique_isbn', 'total_rating_count']
]
book_rating_count_user.head()

Unnamed: 0,user_id,book_rating,book_title,unique_isbn,total_rating_count
0,16877,9,A Painted House,038550120X,237
1,16877,7,Dark Rivers of the Heart,034539657X,32
2,16877,3,Dreamcatcher,61083259,150
3,16877,10,The Five People You Meet in Heaven,786868716,168
4,16877,10,Motherhood: The Second Oldest Profession,70064547,6


#### Look at statistics of the total ratings count

In [13]:
# Render floats with three decimals from here on, then summarise how many
# ratings each title has.
pd.set_option('display.float_format', '{:.3f}'.format)
print(book_total_rating['total_rating_count'].describe())

count   5580.000
mean      20.843
std       25.542
min        1.000
25%        9.000
50%       13.000
75%       22.000
max      431.000
Name: total_rating_count, dtype: float64


In [14]:
# Upper tail of the per-title rating counts, in 1% steps starting at the 90th percentile.
print(
    book_total_rating['total_rating_count']
    .quantile(q=np.arange(.9, 1, .01))
)

0.900    41.000
0.910    44.000
0.920    48.000
0.930    52.000
0.940    57.000
0.950    62.000
0.960    71.000
0.970    83.000
0.980    99.420
0.990   135.210
Name: total_rating_count, dtype: float64


In [48]:
# Minimum number of ratings a title needs to be kept. In the quantile table
# above, 135 sits at the 0.99 quantile, so this keeps roughly the top 1% of
# titles (not the top 5%; the 0.95 quantile is 62).
popularity_threshold = 135
rating_by_popularity = book_rating_count_user.loc[
    book_rating_count_user['total_rating_count'] >= popularity_threshold
]
print(rating_by_popularity.shape)
rating_by_popularity.head()

(10976, 5)


Unnamed: 0,user_id,book_rating,book_title,unique_isbn,total_rating_count
0,16877,9,A Painted House,038550120X,237
2,16877,3,Dreamcatcher,61083259,150
3,16877,10,The Five People You Meet in Heaven,786868716,168
6,20806,6,A Painted House,038550120X,237
10,21340,9,A Painted House,038550120X,237


#### Implement Nearest Neighbors - Brute Force algorithm
* Transform the rating values of the pivoted dataframe into a SciPy sparse matrix for more efficient calculations.
* The model compares rating vectors with the cosine metric (cosine distance = 1 − cosine similarity) and finds neighbors with the brute-force algorithm.

In [49]:
# A (user, title) pair must appear only once, because pivot() needs unique
# index/column combinations; the first occurrence is kept.
rating_by_popularity = rating_by_popularity.drop_duplicates(subset=['user_id', 'book_title'])

# Books as rows, users as columns; a book the user did not rate becomes 0 so
# every row is a complete numeric vector.
rating_by_popularity_pivot = (
    rating_by_popularity
    .pivot(index='book_title', columns='user_id', values='book_rating')
    .fillna(0)
)

# Store the mostly-zero table in Compressed Sparse Row format
rating_by_popularity_matrix = csr_matrix(rating_by_popularity_pivot.values)

# Unsupervised nearest-neighbour search: brute-force comparison with the cosine metric
knn_nn = NearestNeighbors(algorithm='brute', metric='cosine')

# Fit on the book-by-user matrix (one sample per book)
knn_nn.fit(rating_by_popularity_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

#### Test the model's performance and make recommendations
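The model fitted above measures how close two books are with the cosine distance (1 − cosine similarity) between their rating vectors, not the Euclidean distance; the smaller the distance, the more similar the books.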

In [50]:
# Pick a random book (a row position in the pivot table) to get recommendations for.
query_index = np.random.choice(rating_by_popularity_pivot.shape[0])

# Ask for 6 neighbours: the query book is part of the fitted data, so it normally
# comes back as its own nearest neighbour and 5 real recommendations remain.
# iloc selects the query book's rating vector by position.
distances, indices = knn_nn.kneighbors(
    rating_by_popularity_pivot.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors = 6,
)

print('Recommendations for {0}:\n'.format(rating_by_popularity_pivot.index[query_index]))

# Exclude the query book explicitly instead of assuming it sits at position 0:
# when another book has an identical (or tied) rating vector, the query can be
# returned at a later position, and skipping position 0 blindly would then list
# the query as its own recommendation and drop a real neighbour.
recommendations = [
    (book_position, distance)
    for book_position, distance in zip(indices.flatten(), distances.flatten())
    if book_position != query_index
][:5]

for rank, (book_position, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, rating_by_popularity_pivot.index[book_position], distance))

Recommendations for Harry Potter and the Goblet of Fire (Book 4):

1: Harry Potter and the Prisoner of Azkaban (Book 3), with distance of 0.36073555423459214:
2: Harry Potter and the Chamber of Secrets (Book 2), with distance of 0.42312316132775796:
3: Harry Potter and the Order of the Phoenix (Book 5), with distance of 0.5377408875559387:
4: Harry Potter and the Sorcerer's Stone (Book 1), with distance of 0.5840574465910896:
5: Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)), with distance of 0.7428749617065913:


#### Cross - Validation
* kNN has no traditional training and testing phases: it stores the entire dataset and computes the nearest neighbors for each new observation, so its accuracy is hard to measure directly.
* We use k-fold and leave-one-out cross-validation (LOOCV) splitters, which automatically partition the dataset into train and test indices. The cells below only print those index splits; no model is scored on them yet.

In [26]:
# K-fold Cross-Validation
# Split the rows of the book matrix into two folds and print, for each split,
# the train row positions followed by the test row positions.
kf = KFold(n_splits=2)
for train, test in kf.split(rating_by_popularity_matrix):
    print("{0} {1}".format(train, test))

[147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
 291 292 293] [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98 

In [27]:
# LOO Cross-Validation
# Each split holds out exactly one row of the book matrix as the test set;
# print the train row positions followed by the single held-out position.
loo = LeaveOneOut()
for train, test in loo.split(rating_by_popularity_matrix):
    print("{0} {1}".format(train, test))

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 246 24

 289 290 291 292 293] [119]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 24

 289 290 291 292 293] [255]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 23