In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics as metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [2]:
books = pd.read_csv('csv/BX-Books.csv', error_bad_lines=False, sep=';')
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)

users = pd.read_csv('csv/BX-Users.csv', error_bad_lines=False, sep=';')

ratings = pd.read_csv('csv/BX-Book-Ratings.csv', error_bad_lines=False, sep=';')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [3]:
books['Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [4]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc', :]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [5]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...


In [6]:
books.drop(books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc'].index, inplace = True )
books.drop(books.loc[books['Year-Of-Publication'] == 'Gallimard'].index, inplace = True )

In [7]:
books['Year-Of-Publication']=pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
books.loc[(books['Year-Of-Publication'] > 2021) | (books['Year-Of-Publication'] == 0),'Year-Of-Publication'] = np.NAN
books['Year-Of-Publication'].fillna(round(books['Year-Of-Publication'].mean()), inplace=True)
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int32)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [8]:
ratings['Book-Rating'].unique()

array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

In [9]:
users.loc[(users['Age'] > 100) | (users['Age'] < 5), 'Age'] = np.nan
users['Age'] = users['Age'].fillna(users['Age'].mean())
users['Age'] = users['Age'].astype(np.int32)


In [10]:
ratings_new = ratings[ratings['ISBN'].isin(books['ISBN'])]

In [11]:
n_users = users.shape[0]
n_books = books.shape[0]
1.0-len(ratings_new)/float(n_users*n_books)

0.9999863733178015

In [12]:
ratings_implicit = ratings_new[ratings_new['Book-Rating'] == 0]
ratings_explicit = ratings_new[ratings_new['Book-Rating'] != 0]
print(len(ratings_new))
print(len(ratings_implicit))
print(len(ratings_explicit))

1031132
647291
383841


In [13]:
counts1 = ratings_explicit['User-ID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['User-ID'].isin(counts1[counts1 >= 100].index)]
counts = ratings_explicit['Book-Rating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['Book-Rating'].isin(counts[counts >= 100].index)]

In [14]:
ratings_matrix = ratings_explicit.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix

(449, 66574)


ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B000092Q0A,B00009EF82,B00009NDAN,B0000DYXID,B0000T6KHI,B0000VZEJQ,B0000X8HIE,B00013AX9E,B0001I1KOG,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2033,,,,,,,,,,,...,,,,,,,,,,
2110,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
4017,,,,,,,,,,,...,,,,,,,,,,
4385,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274061,,,,,,,,,,,...,,,,,,,,,,
274301,,,,,,,,,,,...,,,10.0,,,,,,,
275970,,,,,,,,,,,...,,,,,,,,,,
277427,,,,,,,,,,,...,,,,,,,,,,


In [15]:
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)
ratings_matrix.head()

ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B000092Q0A,B00009EF82,B00009NDAN,B0000DYXID,B0000T6KHI,B0000VZEJQ,B0000X8HIE,B00013AX9E,B0001I1KOG,B000234N3A
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2276,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4385,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
k=20
metric='correlation'
def find_similar_items(item_id, ratings, metric=metric, neighbor_count=k):
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric)
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = neighbor_count+1)

    return 1-distances.flatten(), indices

def predict_itembased(user_id, item_id, ratings, metric = metric, neighbor_count=k):
    prediction= wtd_sum =0
    similarities, indices = find_similar_items(item_id, ratings, metric, neighbor_count)
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == ratings.columns.get_loc(item_id):
            continue
        else:
            product = ratings.iloc[ratings.index.get_loc(user_id),indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/np.sum(similarities)-1))

    prediction = 10 if prediction > 10 else prediction
    prediction = 1 if prediction <= 0 else prediction

    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))   
    
    return prediction

In [35]:
def recommend_item(user_id, ratings, metric=metric):
    prediction = []            
    for i in range(ratings.shape[1]):
        if (ratings[str(ratings.columns[i])][user_id] !=0): #not rated already
            prediction.append(predict_itembased(user_id, str(ratings.columns[i]) ,ratings, metric))
    prediction = pd.Series(prediction)
    prediction = prediction.sort_values(ascending=False)
    recommended = prediction[:10]
    ret_value = []
    for i in range(len(recommended)):
            print ("{0}. {2} ====> {1}".format(i+1, books['Book-Title'][recommended.index[i]].encode('utf-8'), books['ISBN'][recommended.index[i]]))         
            ret_value.append(books['ISBN'][recommended.index[i]])
    return ret_value

In [36]:
my_recom = recommend_item(4385, ratings_matrix)

1. 3442446937 ====> b'Tage der Unschuld.'
2. 0440223571 ====> b'This Year It Will Be Different: And Other Stories'
3. 0061076031 ====> b'Mary-Kate &amp; Ashley Switching Goals (Mary-Kate and Ashley Starring in)'
4. 0553280333 ====> b'Getting Well Again'
5. 3498020862 ====> b'Die Korrekturen.'
6. 0439095026 ====> b"Tell Me This Isn't Happening"
7. 0689821166 ====> b'Flood : Mississippi 1927'
8. 0671623249 ====> b'LONESOME DOVE'
9. 0679810307 ====> b'Shabanu: Daughter of the Wind (Border Trilogy)'
10. 0679865691 ====> b'Haveli (Laurel Leaf Books)'


In [37]:
all_rates = ratings_explicit.loc[ratings_explicit['User-ID'] == 4385]
tp=0
tn=0
fp=0
fn=0
for index, row in all_rates.sort_values(by=['Book-Rating'], ascending=False).iterrows():
    if row['ISBN'] in my_recom:
        tp+=1
    else:
        fn+=1
    
print(tp, tn, fp, fn)
all_rates

0 0 0 212


Unnamed: 0,User-ID,ISBN,Book-Rating
21689,4385,0061083402,10
21691,4385,0061093343,9
21692,4385,0061096156,10
21694,4385,0312956762,10
21695,4385,0312980353,10
...,...,...,...
22013,4385,1551660016,10
22016,4385,155166531X,10
22017,4385,1551666014,10
22018,4385,1551667436,10


In [19]:
np.corrcoef(ratings_matrix)

array([[ 1.        , -0.00167007,  0.04681143, ..., -0.00197429,
        -0.00200219, -0.0016626 ],
       [-0.00167007,  1.        , -0.0020535 , ..., -0.00175832,
        -0.00178316, -0.00148072],
       [ 0.04681143, -0.0020535 ,  1.        , ..., -0.00242757,
        -0.00246187, -0.00204431],
       ...,
       [-0.00197429, -0.00175832, -0.00242757, ...,  1.        ,
         0.00452585, -0.00175045],
       [-0.00200219, -0.00178316, -0.00246187, ...,  0.00452585,
         1.        , -0.00177519],
       [-0.0016626 , -0.00148072, -0.00204431, ..., -0.00175045,
        -0.00177519,  1.        ]])

In [32]:
np.corrcoef(ratings_matrix.T)

MemoryError: Unable to allocate 33.0 GiB for an array with shape (66574, 66574) and data type float64

In [33]:
cosine_similarity(ratings_matrix)

array([[1.        , 0.        , 0.04900176, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04900176, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.00661989,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00661989, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [34]:
cosine_similarity(ratings_matrix.T)

MemoryError: Unable to allocate 33.0 GiB for an array with shape (66574, 66574) and data type float64