In [24]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics as metrics


In [4]:
books = pd.read_csv('csv/BX-Books.csv', error_bad_lines=False, sep=';')
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)

users = pd.read_csv('csv/BX-Users.csv', error_bad_lines=False, sep=';')

ratings = pd.read_csv('csv/BX-Book-Ratings.csv', error_bad_lines=False, sep=';')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [7]:
books['Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 1968, 1961, 1958, 1974, 1976,
       1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960, 1966,
       1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954, 1950,
       1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011, 1925,
       1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 1911, 1904,
       1949, 1932, 1928, 1929, 1927, 1931, 1914, 1934, 1910, 1933, 1902,
       1924, 1921, 1900, 1944, 1917, 1901, 2010, 1908, 1906, 1935, 1806,
       2012, 2006, 1909, 2008, 1378, 1919, 1922, 1897, 1376], dtype=int32)

In [6]:
books.loc[books.ISBN == '0789466953','Year-Of-Publication'] = 2000
books.loc[books.ISBN == '0789466953','Book-Author'] = "James Buckley"
books.loc[books.ISBN == '0789466953','Publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','Book-Title'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"
books.loc[books.ISBN == '078946697X','Year-Of-Publication'] = 2000
books.loc[books.ISBN == '078946697X','Book-Author'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','Publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','Book-Title'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"
books.loc[books.ISBN == '2070426769','Year-Of-Publication'] = 2003
books.loc[books.ISBN == '2070426769','Book-Author'] = "Jean-Marie Gustave Le ClÃ?Â©zio"
books.loc[books.ISBN == '2070426769','Publisher'] = "Gallimard"
books.loc[books.ISBN == '2070426769','Book-Title'] = "Peuple du ciel, suivi de 'Les Bergers"

books['Year-Of-Publication']=pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
books.loc[(books['Year-Of-Publication'] > 2020) | (books['Year-Of-Publication'] == 0),'Year-Of-Publication'] = np.NAN
books['Year-Of-Publication'].fillna(round(books['Year-Of-Publication'].mean()), inplace=True)
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int32)

In [10]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
users.loc[(users['Age'] > 100) | (users['Age'] < 5), 'Age'] = np.nan
users['Age'] = users['Age'].fillna(users['Age'].mean())
users['Age'] = users['Age'].astype(np.int32)


In [11]:
ratings_new = ratings[ratings['ISBN'].isin(books['ISBN'])]

In [12]:
n_users = users.shape[0]
n_books = books.shape[0]
1.0-len(ratings_new)/float(n_users*n_books)

0.9999863734155897

In [13]:
ratings_implicit = ratings_new[ratings_new['Book-Rating'] == 0]
ratings_explicit = ratings_new[ratings_new['Book-Rating'] != 0]


In [116]:
counts1 = ratings_explicit['User-ID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['User-ID'].isin(counts1[counts1 >= 15].index)]
counts = ratings_explicit['Book-Rating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['Book-Rating'].isin(counts[counts >= 15].index)]

In [129]:
ratings_matrix = ratings_explicit.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix

(449, 66574)


ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B000092Q0A,B00009EF82,B00009NDAN,B0000DYXID,B0000T6KHI,B0000VZEJQ,B0000X8HIE,B00013AX9E,B0001I1KOG,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2033,,,,,,,,,,,...,,,,,,,,,,
2110,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
4017,,,,,,,,,,,...,,,,,,,,,,
4385,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274061,,,,,,,,,,,...,,,,,,,,,,
274301,,,,,,,,,,,...,,,10.0,,,,,,,
275970,,,,,,,,,,,...,,,,,,,,,,
277427,,,,,,,,,,,...,,,,,,,,,,


In [131]:
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)

In [150]:
asd = ratings_matrix.loc[2033]
for i in range(len(list(asd))):
    if asd[i] != 0:
        print(asd[i], ratings_matrix.columns[i])

7 515
10 1189
10 1200
10 1203
10 2052
7 2402
6 2403
10 2595
8 2596
9 2597
7 2642
10 3822
8 4982
10 5624
7 5712
6 6224
10 6654
8 9401
9 10293
10 10419
10 10420
5 11520
10 11543
10 17030
10 17885
6 21842
9 23307
9 23350
9 23356
9 23462
7 24813
10 28910
10 28912
8 28928
10 28934
10 28956
10 28984
10 29010
10 29484
10 29749
6 31009
10 33643
8 34069
9 35085
2 35889
2 35893
10 37055
10 37088
10 37671
6 37708
10 38293
7 41610
8 41700
10 41753
7 42357
7 42603
7 42604
7 45302
10 45634
10 45643
8 46735
8 46857
9 49090
7 49109
10 49219
10 49236
10 49251
10 49273
10 49280
10 49337
8 49339
8 49340
8 49341
8 49348
8 49349
10 49353
10 49358
9 49359
10 49361
7 49378
10 49596
10 51235
9 51300
10 51318
10 51342
10 51351
9 53202
10 53288
5 53832
8 53994
10 54191
10 54192
10 54196
9 54197
10 54201
10 54215
10 54218
10 54230
5 54231
8 54236
9 54237
9 54252
8 54258
10 54262
8 54272
9 54288
9 54290
8 54300
8 54306
9 54308
10 54309
8 54321
8 54322
10 54326
10 54352
8 54515
9 54954
5 55051
6 55175
10 55608
10 

In [97]:
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()

    return similarities,indices

def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == item_loc:
            continue
        else:
            product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    
    #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings
    #which are handled here as below //code has been validated without the code snippet below, below snippet is to avoid negative
    #predictions which might arise in case of very sparse datasets when using correlation metric
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10

    print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))   
    
    return prediction

In [151]:
k=20
metric='correlation'
prediction = predict_itembased(2033,'63245',ratings_matrix)

KeyError: '63245'

IndexError: single positional indexer is out-of-bounds