In [7]:
import numpy as np
from scipy.sparse.linalg import svds
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
ratings = pd.read_csv('../Ratings.csv')
users = pd.read_csv('../Users.csv')
dtype_spec = {
    'ISBN': str,
    'Book-Title': str,
    'Book-Author': str,
    'Year-Of-Publication': str,
    'Publisher': str,
    'Image-URL-S': str,
    'Image-URL-M': str,
    'Image-URL-L': str,
    'Description': str,
    'Categories': str
}
books = pd.read_csv('../updated_books_progress.csv', dtype=dtype_spec, low_memory=False)


In [3]:
ratings_head = ratings.head()
books_head = books.head()
users_head = users.head()

ratings_head, books_head, users_head

(   User-ID        ISBN  Book-Rating
 0   276725  034545104X            0
 1   276726  0155061224            5
 2   276727  0446520802            0
 3   276729  052165615X            3
 4   276729  0521795028            6,
          ISBN                                         Book-Title  \
 0  0195153448                                Classical Mythology   
 1  0002005018                                       Clara Callan   
 2  0060973129                               Decision in Normandy   
 3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
 4  0393045218                             The Mummies of Urumchi   
 
             Book-Author Year-Of-Publication                   Publisher  \
 0    Mark P. O. Morford                2002     Oxford University Press   
 1  Richard Bruce Wright                2001       HarperFlamingo Canada   
 2          Carlo D'Este                1991             HarperPerennial   
 3      Gina Bari Kolata                1999        Farra

In [4]:
def preprocess_users(users):
    users['Age'].fillna(users['Age'].median(), inplace=True)
    users['Age'] = users['Age'].astype(int)
    return users

def preprocess_books(books):
    books.fillna('', inplace=True)
    books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors='coerce').fillna(0).astype(int)
    return books


In [8]:
users = preprocess_users(users)
books = preprocess_books(books)

users.head(), books.head()

(   User-ID                            Location  Age
 0        1                  nyc, new york, usa   32
 1        2           stockton, california, usa   18
 2        3     moscow, yukon territory, russia   32
 3        4           porto, v.n.gaia, portugal   17
 4        5  farnborough, hants, united kingdom   32,
          ISBN                                         Book-Title  \
 0  0195153448                                Classical Mythology   
 1  0002005018                                       Clara Callan   
 2  0060973129                               Decision in Normandy   
 3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
 4  0393045218                             The Mummies of Urumchi   
 
             Book-Author  Year-Of-Publication                   Publisher  \
 0    Mark P. O. Morford                 2002     Oxford University Press   
 1  Richard Bruce Wright                 2001       HarperFlamingo Canada   
 2          Carlo D'Este          

In [9]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

user_id_mapping = {id: idx for idx, id in enumerate(ratings['User-ID'].unique())}
book_id_mapping = {id: idx for idx, id in enumerate(ratings['ISBN'].unique())}

ratings['User-ID'] = ratings['User-ID'].map(user_id_mapping)
ratings['ISBN'] = ratings['ISBN'].map(book_id_mapping)
ratings.dropna(subset=['Book-Rating'], inplace=True)
all_user_ids = ratings['User-ID'].unique()
all_book_ids = ratings['ISBN'].unique()

train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

train_user_ids = set(train_data['User-ID'])
train_book_ids = set(train_data['ISBN'])
missing_users = set(all_user_ids) - train_user_ids
missing_books = set(all_book_ids) - train_book_ids

missing_data = ratings[ratings['User-ID'].isin(missing_users) | ratings['ISBN'].isin(missing_books)]
train_data = pd.concat([train_data, missing_data]).drop_duplicates()

n_users = ratings['User-ID'].nunique()
n_items = ratings['ISBN'].nunique()
train_matrix = csr_matrix((train_data['Book-Rating'], (train_data['User-ID'], train_data['ISBN'])), shape=(n_users, n_items))
test_matrix = csr_matrix((test_data['Book-Rating'], (test_data['User-ID'], test_data['ISBN'])), shape=(n_users, n_items))
print("-------matrix finished---------")

if np.any(np.isnan(train_matrix.data)):
    print("NaN values found in training matrix")
else:
    print("No NaN values in training matrix")

-------matrix finished---------
No NaN values in training matrix


In [10]:
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20, use_gpu=False, calculate_training_loss=True)


In [11]:


als_model.fit(train_matrix.T, show_progress=True)
print("-----------------fitting finished-----------------")

100%|██████████| 20/20 [01:53<00:00,  5.68s/it, loss=6.2e-5] 

-----------------fitting finished-----------------





In [12]:
def evaluate_model(test_matrix, als_model):
    test_user_items = test_matrix.nonzero()
    predictions = []
    ground_truth = []
    for user, item in zip(test_user_items[0], test_user_items[1]):
        if user < als_model.user_factors.shape[0] and item < als_model.item_factors.shape[0]:
            prediction = als_model.user_factors[user, :].dot(als_model.item_factors[item, :].T)
            predictions.append(prediction)
            ground_truth.append(test_matrix[user, item])
    if len(predictions) == 0:
        return float('inf')  
    return np.sqrt(mean_squared_error(ground_truth, predictions))

In [13]:
rmse = evaluate_model(train_matrix, als_model)
print(f"Test RMSE: {rmse}")

Test RMSE: 7.844746002414569


In [14]:
def recommend_books_als(user_id, num_recommendations=5):
    if user_id not in user_id_mapping:
        return [] 
    user_index = user_id_mapping[user_id]
    user_ratings = train_matrix[user_index]
    recommended_books = als_model.recommend(user_index, user_ratings, N=num_recommendations, filter_already_liked_items=True)
    # recommended_book_ids = [list(book_id_mapping.keys())[list(book_id_mapping.values()).index(i)] for i, _ in recommended_books]
    # return books[books['ISBN'].isin(recommended_book_ids)]
    return recommended_books

In [15]:
print(test_data)

         User-ID    ISBN  Book-Rating
178554     15560   99291            0
533905     49582   59185            8
1091374    99796  121427            0
1036247    94309  320740            0
309523     28854   32411            0
...          ...     ...          ...
1146893   104997    3548            0
359252     33613   22124           10
461682     42867  157812            0
921902     85478   35302            9
461476     42867   18262            0

[229956 rows x 3 columns]


In [16]:
print(recommend_books_als(9713, 5))

(array([89434, 12730, 72327, 89335, 87658], dtype=int32), array([0.07200544, 0.05262126, 0.05110756, 0.04840216, 0.04637698],
      dtype=float32))
