In [107]:
from datetime import date

import pandas as pd
from scipy.sparse import coo_matrix

In [51]:
MIN_USERS_OVERLAP_COEFF = 0.5 # The minimum percentage of co-read books between current_user and overlapped_user to be used to create a recommendation
MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF = 2 # The minimum number of books read and rated by current_user before the MIN_USERS_OVERLAP_COEFF factor will be used to filter overlapped_users

## Read and check datasets

In [2]:
books_df = pd.read_csv('../data/processed_data/Books.csv')
users_df = pd.read_csv('../data/processed_data/Users.csv')
ratings_df = pd.read_csv('../data/processed_data/Ratings.csv')

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


In [3]:
missing_info_df = pd.DataFrame({
    'missing_count': books_df.isnull().sum(),
    'missing_percentage': (books_df.isnull().sum() / len(books_df)) * 100
}).reset_index().rename(columns={'index': 'column'})

# author - 0, image-url-l - 0
missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


In [6]:
# Should be zero records
books_df[books_df['publication_year'] > date.today().year]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [7]:
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [8]:
users_df.head(3)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [10]:
ratings_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


## Find users with similar book preferences as the current user

In [15]:
books_user_like = [
    '0590353403',   # Harry Potter and the Sorcerer's Stone (Book 1)
    '0439064872',   # Harry Potter and the Chamber of Secrets (Book 2)
    '0345350499'    # The Mists of Avalon
]

books_user_like_titles = [
    "Harry Potter and the Sorcerer's Stone (Book 1)", 
    "Harry Potter and the Chamber of Secrets (Book 2)",
    "The Mists of Avalon"
]

In [99]:
current_user_id = -1

books_user_like_dict = {
    'user_id': [current_user_id for _ in range(len(books_user_like))],
    'isbn': books_user_like,
    'title': books_user_like_titles,
    'rating': [10, 9, 10]
}

books_user_like_df = pd.DataFrame.from_dict(books_user_like_dict)
books_user_like_df

Unnamed: 0,user_id,isbn,title,rating
0,-1,590353403,Harry Potter and the Sorcerer's Stone (Book 1),10
1,-1,439064872,Harry Potter and the Chamber of Secrets (Book 2),9
2,-1,345350499,The Mists of Avalon,10


In [100]:
books_user_like_set = set(books_user_like_df['isbn'])   # Because theoretically, a user could give several ratings to a book with the same title
print(f"books_user_like_set = {books_user_like_set}")

books_user_like_set = {'0439064872', '0590353403', '0345350499'}


In [29]:
# Users who read the same books as the current_user
overlap_users = {}

for index, row in ratings_df.iterrows():
    user_id = row['user_id']
    book_isbn = row['isbn']
    if book_isbn in books_user_like_set:
        if user_id not in overlap_users:
            overlap_users[user_id] = 1
        else:
            overlap_users[user_id] += 1 # Track how many books read and rated by current_user have been read and rated by the current user in a loop

print(f"len(overlap_users) = {len(overlap_users)}")

len(overlap_users) = 664


In [35]:
# To check the results
overlap_user_ids_list = list(overlap_users.keys())
print(f"len(overlap_user_ids_list) = {len(overlap_user_ids_list)}")
print(f"overlap_user_ids_list[:10] = {overlap_user_ids_list[:10]}")

len(overlap_user_ids_list) = 664
overlap_user_ids_list[:10] = [277427, 278356, 278582, 254, 1725, 2033, 2179, 2337, 2793, 2855]


In [43]:
# Check the results for one user
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] == 277427 and row['isbn'] in books_user_like_set, axis=1)
)
ratings_df[result]

Unnamed: 0,user_id,isbn,rating
1629,277427,439064872,0


In [44]:
# Check the results for 5 users
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in overlap_user_ids_list[:5] and row['isbn'] in books_user_like_set, axis=1)
)
ratings_df[result]

Unnamed: 0,user_id,isbn,rating
1629,277427,439064872,0
4257,278356,439064872,8
9138,278582,345350499,10
10106,254,345350499,0
10209,254,439064872,9
10279,254,590353403,9
13327,1725,345350499,0


In [83]:
""" 
    Filter users, leaving only those who have read all the books of current_user 
        (if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF) 
    or MIN_USERS_OVERLAP_COEFF a part of the books read by current_user 
        (if len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF)
"""

filtered_overlap_user_ids_set = None
if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 
    print(f"len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} <= {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = len(books_user_like)
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] >= books_thresh])
else:
    print(f"len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} > {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = int(len(books_user_like) * MIN_USERS_OVERLAP_COEFF)
    print(f"books_thresh = {books_thresh}")
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] > books_thresh])

print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")

len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 3 > 2
books_thresh = 1
len(filtered_overlap_user_ids_set) = 32


### Check the results

In [84]:
# To check the results
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")
print(f"list(filtered_overlap_user_ids_set)[:10] = {list(filtered_overlap_user_ids_set)[:10]}")

len(filtered_overlap_user_ids_set) = 32
list(filtered_overlap_user_ids_set)[:10] = [148744, 131594, 30735, 240144, 272786, 226965, 21014, 11676, 252829, 36256]


In [85]:
# Check the results
filtered_ratings_df = ratings_df[result]
filtered_ratings_df = (
    filtered_ratings_df
    .groupby('user_id')
    .count()
    .reset_index()
    .rename(columns={'rating': 'ratings_no'})
    .sort_values(by='ratings_no')
)

print(f"len(filtered_ratings_df) = {len(filtered_ratings_df)}")

filtered_ratings_df

len(filtered_ratings_df) = 32


Unnamed: 0,user_id,isbn,ratings_no
15,100906,2,2
29,240144,2,2
28,238120,2,2
27,235842,2,2
26,229741,2,2
25,226965,2,2
24,225996,2,2
23,211426,2,2
22,190448,2,2
20,150124,2,2


## Find similar user book ratings

In [88]:
# For filtered users, user_id, book_isbn and rating will be stored in interactions_list
interactions_arr = ratings_df[(
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in filtered_overlap_user_ids_set and row['isbn'] in books_user_like_set, axis=1)
)].values

print(f"len(interactions_arr) = {len(interactions_arr)}")

len(interactions_arr) = 68


In [89]:
interactions_arr[:3]

array([[254, '0345350499', 0],
       [254, '0439064872', 9],
       [254, '0590353403', 9]], dtype=object)

In [91]:
# Check the correctness
print(f"sum(filtered_ratings_df['ratings_no'].values) = {sum(filtered_ratings_df['ratings_no'].values)})")
print(f"len(interactions_arr) = {len(interactions_arr)}")

sum(filtered_ratings_df['ratings_no'].values) = 68)
len(interactions_arr) = 68


The results show that the previous operation successfully selected all records from filtered_overlap_user_ids_set and their ratings for books from books_user_like_set.  
However, in order to make recommendations, users in filtered_overlap_user_ids_set need to select **all the books they have read**, **not just books in books_user_like_set** (since the task is to recommend new books to the user (books they haven't read yet)).

In [94]:
# For filtered users, user_id, book_isbn and rating will be stored in interactions_list
interactions_arr = ratings_df[(
    ratings_df['user_id']
    .apply(lambda user_id: user_id in filtered_overlap_user_ids_set) # Maybe also filter: isbn not in books_user_like_set - no need, because it will be used later to find the users with the same book tastes as current_user 
)].values

print(f"len(interactions_arr) = {len(interactions_arr)}")
print(f"interactions_arr[:5] = {interactions_arr[:5]}")

len(interactions_arr) = 35418
interactions_arr[:5] = [[254 '006000438X' 0]
 [254 '0060013117' 0]
 [254 '0060199563' 0]
 [254 '0060391448' 0]
 [254 '0060502320' 7]]


In [97]:
interactions_df = pd.DataFrame(data=interactions_arr, columns=['user_id', 'isbn', 'rating'])

# Check the results
print(f"len(interactions_df) = {len(interactions_df)}")
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")

len(interactions_df) = 35418
interactions_df['user_id'].nunique() = 32


In [98]:
interactions_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,254,006000438X,0
1,254,0060013117,0
2,254,0060199563,0


## Create a user/book matrix

In [102]:
interactions_df = pd.concat([books_user_like_df[['user_id', 'isbn', 'rating']], interactions_df])
interactions_df

Unnamed: 0,user_id,isbn,rating
0,-1,0590353403,10
1,-1,0439064872,9
2,-1,0345350499,10
0,254,006000438X,0
1,254,0060013117,0
...,...,...,...
35413,272786,0446672211,8
35414,272786,0465017606,0
35415,272786,059035342X,9
35416,272786,0679723161,0


In [103]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35421 entries, 0 to 35417
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  35421 non-null  object
 1   isbn     35421 non-null  object
 2   rating   35421 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [105]:
interactions_df['user_id'] = pd.to_numeric(interactions_df['user_id'])
interactions_df['rating'] = pd.to_numeric(interactions_df['rating'])
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35421 entries, 0 to 35417
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  35421 non-null  int64 
 1   isbn     35421 non-null  object
 2   rating   35421 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [106]:
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")
print(f"interactions_df['isbn'].nunique() = {interactions_df['isbn'].nunique()}")

interactions_df['user_id'].nunique() = 33
interactions_df['isbn'].nunique() = 28928


The obtained results show that there are far fewer users than books (this is logical, because different users could read different books). This indicates the need to use a **sparse matrix to save memory**.

In [109]:
interactions_df['user_index'] = interactions_df['user_id'].astype('category').cat.codes
interactions_df['book_index'] = interactions_df['isbn'].astype('category').cat.codes

# To check the correctness of creating cols 'user_index' and 'book_index'
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")
print(f"interactions_df['user_index'].nunique() = {interactions_df['user_index'].nunique()}\n")

print(f"interactions_df['isbn'].nunique() = {interactions_df['isbn'].nunique()}")
print(f"interactions_df['book_index'].nunique() = {interactions_df['book_index'].nunique()}\n")

interactions_df.head(3)

interactions_df['user_id'].nunique() = 33
interactions_df['user_index'].nunique() = 33

interactions_df['isbn'].nunique() = 28928
interactions_df['book_index'].nunique() = 28928



Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,-1,590353403,10,0,15720
1,-1,439064872,9,0,10025
2,-1,345350499,10,0,4603


In [110]:
ratings_matrix_coo = coo_matrix((interactions_df['rating'], (interactions_df['user_index'], interactions_df['book_index'])))
ratings_matrix_coo

<33x28928 sparse matrix of type '<class 'numpy.int64'>'
	with 35421 stored elements in COOrdinate format>

In [111]:
ratings_matrix = ratings_matrix_coo.tocsr()