In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics as metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [2]:
books = pd.read_csv('csv/BX-Books.csv', error_bad_lines=False, sep=';')
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)

users = pd.read_csv('csv/BX-Users.csv', error_bad_lines=False, sep=';')

ratings = pd.read_csv('csv/BX-Book-Ratings.csv', error_bad_lines=False, sep=';')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [3]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [4]:
books.drop(books.loc[books.ISBN == '0789466953'].index, inplace = True )
books.drop(books.loc[books.ISBN == '078946697X'].index, inplace = True )
books.drop(books.loc[books.ISBN == '2070426769'].index, inplace = True )

In [5]:
books['Year-Of-Publication']=pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
books.loc[(books['Year-Of-Publication'] > 2020) | (books['Year-Of-Publication'] == 0),'Year-Of-Publication'] = np.NAN
books['Year-Of-Publication'].fillna(round(books['Year-Of-Publication'].mean()), inplace=True)
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int32)

In [6]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
users.loc[(users['Age'] > 100) | (users['Age'] < 5), 'Age'] = np.nan
users['Age'] = users['Age'].fillna(users['Age'].mean())
users['Age'] = users['Age'].astype(np.int32)


In [8]:
ratings_new = ratings[ratings['ISBN'].isin(books['ISBN'])]

In [9]:
n_users = users.shape[0]
n_books = books.shape[0]
1.0-len(ratings_new)/float(n_users*n_books)

0.9999863733178015

In [10]:
ratings_implicit = ratings_new[ratings_new['Book-Rating'] == 0]
ratings_explicit = ratings_new[ratings_new['Book-Rating'] != 0]


In [11]:
counts1 = ratings_explicit['User-ID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['User-ID'].isin(counts1[counts1 >= 20].index)]
counts = ratings_explicit['Book-Rating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['Book-Rating'].isin(counts[counts >= 20].index)]

In [12]:
ratings_matrix = ratings_explicit.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix

(3305, 108380)


ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B0000T6KIM,B0000VZEH8,B0000VZEJQ,B0000X8HIE,B00011SOXI,B00013AX9E,B0001FZGRQ,B0001GMSV2,B0001I1KOG,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
242,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
507,,,,,,,,,,,...,,,,,,,,,,
638,,,,,,,,,,,...,,,,,,,,,,
643,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278221,,,,,,,,,,,...,,,,,,,,,,
278356,,,,,,,,,,,...,,,,,,,,,,
278418,,,,,,,,,,,...,,,,,,,,,,
278582,,,,,,,,,,,...,,,,,,,,,,


In [13]:
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)

In [14]:
k=20
metric='correlation'
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric)
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)

    return 1-distances.flatten(), indices

def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    similarities, indices=findksimilaritems(item_id, ratings)
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == ratings.columns.get_loc(item_id):
            continue
        else:
            product = ratings.iloc[ratings.index.get_loc(user_id),indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/np.sum(similarities)-1))

    prediction = 10 if prediction > 10 else prediction
    prediction = 1 if prediction <= 0 else prediction

    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))   
    
    return prediction

In [None]:
np.corrcoef(ratings_matrix) 

In [26]:
np.corrcoef(ratings_matrix.T)

MemoryError: Unable to allocate 33.0 GiB for an array with shape (66574, 66574) and data type float64

In [None]:
cosine_similarity(ratings_matrix.T)

In [43]:
pd.set_option('display.max_colwidth', -1)
ratings_matrix.dtypes

ISBN
0000913154    int32
0001046438    int32
000104687X    int32
0001047213    int32
0001047973    int32
              ...  
B0000VZEJQ    int32
B0000X8HIE    int32
B00013AX9E    int32
B0001I1KOG    int32
B000234N3A    int32
Length: 66574, dtype: object

In [137]:
def recommendItem(user_id, ratings, metric=metric):
    prediction = []            
    for i in range(ratings.shape[1]):
        if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already
            prediction.append(predict_itembased(user_id, str(ratings.columns[i]) ,ratings, metric))
        else:                    
            prediction.append(-1) #for already rated items
    prediction = pd.Series(prediction)
    prediction = prediction.sort_values(ascending=False)
    recommended = prediction[:10]
    ret_value = []
    for i in range(len(recommended)):
            print ("{0}. {2} ====> {1}".format(i+1, books['Book-Title'][recommended.index[i]].encode('utf-8'), books['ISBN'][recommended.index[i]]))         
            ret_value.append(books['ISBN'][recommended.index[i]])
    return ret_value    


In [138]:
my_recom = recommendItem(4385, ratings_matrix)

1. 1592247938 ====> b'The Red Badge of Courage'
2. 0380784130 ====> b'Fair Peril'
3. 1551664380 ====> b'Real Thing'
4. 0060154969 ====> b'Love, Medicine, and Miracles'
5. 0380013207 ====> b'The Lathe of Heaven'
6. 1888766034 ====> b"Hologram of Liberty: The Constitution's Shocking Alliance With Big Government"
7. 039585993X ====> b'An American Requiem : God, My Father, and the War That Came Between Us'
8. 0385189508 ====> b"Grimm's Tales for Young and Old"
9. 0440228433 ====> b"One Thousand Paper Cranes: The Story of Sadako and the Children's Peace Statue"
10. 0888012748 ====> b'Macaws of Death'


In [139]:
all_rates = ratings_explicit.loc[ratings_explicit['User-ID'] == 4385]


In [150]:
tp=0
tn=0
fp=0
fn=0
for index, row in all_rates.sort_values(by=['Book-Rating'], ascending=False).iterrows():
    if row['ISBN'] in my_recom:
        tp+=1
    else:
        fn+=1
    
print(tp, tn, fp, fn)



0 0 0 212


In [151]:
all_rates

Unnamed: 0,User-ID,ISBN,Book-Rating
21689,4385,0061083402,10
21691,4385,0061093343,9
21692,4385,0061096156,10
21694,4385,0312956762,10
21695,4385,0312980353,10
...,...,...,...
22013,4385,1551660016,10
22016,4385,155166531X,10
22017,4385,1551666014,10
22018,4385,1551667436,10
