In [1]:
import numpy as np
from scipy.sparse.linalg import svds
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
ratings = pd.read_csv('../Ratings.csv')
users = pd.read_csv('../Users.csv')
dtype_spec = {
    'ISBN': str,
    'Book-Title': str,
    'Book-Author': str,
    'Year-Of-Publication': str,
    'Publisher': str,
    'Image-URL-S': str,
    'Image-URL-M': str,
    'Image-URL-L': str,
    'Description': str,
    'Categories': str
}
books = pd.read_csv('../updated_books_progress.csv', dtype=dtype_spec, low_memory=False)


In [3]:
def preprocess_users(users):
    users['Age'].fillna(users['Age'].median(), inplace=True)
    users['Age'] = users['Age'].astype(int)
    users['Location'] = users['Location'].astype('category').cat.codes
    return users

def preprocess_books(books):
    books.fillna('', inplace=True)
    books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
    books['Year-Of-Publication'].fillna(books['Year-Of-Publication'].median(), inplace=True)
    books['Publisher'] = books['Publisher'].astype('category').cat.codes
    books['Book-Author'] = books['Book-Author'].astype('category').cat.codes
    return books

users = preprocess_users(users)
books = preprocess_books(books)

users.head(), books.head()

(   User-ID  Location  Age
 0        1     36772   32
 1        2     48830   18
 2        3     33483   32
 3        4     40562   17
 4        5     16902   32,
          ISBN                                         Book-Title  Book-Author  \
 0  0195153448                                Classical Mythology        65202   
 1  0002005018                                       Clara Callan        81481   
 2  0060973129                               Decision in Normandy        12671   
 3  0374157065  Flu: The Story of the Great Influenza Pandemic...        34304   
 4  0393045218                             The Mummies of Urumchi        25095   
 
    Year-Of-Publication  Publisher  \
 0               2002.0      10974   
 1               2001.0       6667   
 2               1991.0       6670   
 3               1999.0       5292   
 4               1999.0      15844   
 
                                          Image-URL-S  \
 0  http://images.amazon.com/images/P/0195153448.0...   

In [4]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

user_id_mapping = {id: idx for idx, id in enumerate(ratings['User-ID'].unique())}
book_id_mapping = {id: idx for idx, id in enumerate(ratings['ISBN'].unique())}

ratings['User-ID'] = ratings['User-ID'].map(user_id_mapping)
ratings['ISBN'] = ratings['ISBN'].map(book_id_mapping)
ratings.dropna(subset=['Book-Rating'], inplace=True)
all_user_ids = ratings['User-ID'].unique()
all_book_ids = ratings['ISBN'].unique()

train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

train_user_ids = set(train_data['User-ID'])
train_book_ids = set(train_data['ISBN'])
missing_users = set(all_user_ids) - train_user_ids
missing_books = set(all_book_ids) - train_book_ids

missing_data = ratings[ratings['User-ID'].isin(missing_users) | ratings['ISBN'].isin(missing_books)]
train_data = pd.concat([train_data, missing_data]).drop_duplicates()

n_users = ratings['User-ID'].nunique()
n_items = ratings['ISBN'].nunique()
train_matrix = csr_matrix((train_data['Book-Rating'], (train_data['User-ID'], train_data['ISBN'])), shape=(n_users, n_items))
test_matrix = csr_matrix((test_data['Book-Rating'], (test_data['User-ID'], test_data['ISBN'])), shape=(n_users, n_items))
print("-------matrix finished---------")

if np.any(np.isnan(train_matrix.data)):
    print("NaN values found in training matrix")
else:
    print("No NaN values in training matrix")


-------matrix finished---------
No NaN values in training matrix


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
users[['Age']] = scaler.fit_transform(users[['Age']])
books[['Year-Of-Publication']] = scaler.fit_transform(books[['Year-Of-Publication']])

In [6]:
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20, use_gpu=False, calculate_training_loss=True)

print("Training ALS model...")
als_model.fit(train_matrix.T, show_progress=True)
print("Model training completed.")

Training ALS model...


100%|██████████| 20/20 [01:51<00:00,  5.55s/it, loss=6.2e-5] 

Model training completed.





In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class EmbeddingNet(nn.Module):
    # def __init__(self, num_users, num_books, embedding_dim, num_locations, num_authors, num_publishers):
    def __init__(self, num_users, num_books, embedding_dim, num_authors):
        super(EmbeddingNet, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        # self.location_embedding = nn.Embedding(num_locations, embedding_dim)
        self.author_embedding = nn.Embedding(num_authors, embedding_dim)
        # self.publisher_embedding = nn.Embedding(num_publishers, embedding_dim)
        self.user_age = nn.Linear(1, embedding_dim)
        self.book_year = nn.Linear(1, embedding_dim)
    
    # def forward(self, user_id, book_id, location_id, age, author_id, year, publisher_id):
    def forward(self, user_id, book_id, age, author_id, year):
        user_embed = self.user_embedding(user_id).squeeze()
        book_embed = self.book_embedding(book_id).squeeze()
        # location_embed = self.location_embedding(location_id).squeeze()
        author_embed = self.author_embedding(author_id).squeeze()
        # publisher_embed = self.publisher_embedding(publisher_id).squeeze()
        age_embed = self.user_age(age).squeeze()
        year_embed = self.book_year(year).squeeze()
        # return torch.cat([user_embed, location_embed, age_embed, book_embed, author_embed, year_embed, publisher_embed], dim=-1)
        return torch.cat([user_embed, age_embed, book_embed, author_embed, year_embed], dim=-1)

In [8]:
embedding_dim = 10
num_users = len(user_id_mapping)
num_books = len(book_id_mapping)
num_locations = users['Location'].nunique()
num_authors = books['Book-Author'].nunique()
num_publishers = books['Publisher'].nunique()

# model = EmbeddingNet(num_users, num_books, embedding_dim, num_locations, num_authors, num_publishers)
model = EmbeddingNet(num_users, num_books, embedding_dim, num_authors)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_users, num_books, num_locations, num_authors, num_publishers

(105283, 340556, 57339, 102023, 16808)

In [9]:
print(len(train_data))

975277


In [26]:
num_epochs = 10
for epoch in range(num_epochs):
    print(f'epoch{epoch} starts:')
    model.train()
    epoch_loss = 0
    print(f'epoch{epoch} train finished:')
    for user_id, book_id, rating in zip(train_data['User-ID'], train_data['ISBN'], train_data['Book-Rating']):
        # if user_id not in user_id_mapping.values() or book_id not in book_id_mapping.values():
        #     continue
        # location_id = torch.tensor(users.loc[users['User-ID'] == user_id, 'Location'].values[0]).long()
        age = torch.tensor(users.loc[users['User-ID'] == user_id, 'Age'].values[0]).float().unsqueeze(0)
        author_id = torch.tensor(books.loc[books['ISBN'] == book_id, 'Book-Author'].values[0]).long()
        year = torch.tensor(books.loc[books['ISBN'] == book_id, 'Year-Of-Publication'].values[0]).float().unsqueeze(0)
        # publisher_id = torch.tensor(books.loc[books['ISBN'] == book_id, 'Publisher'].values[0]).long()
        
        user_id = torch.tensor(user_id).long()
        book_id = torch.tensor(book_id).long()
        
        # embedding = model(user_id, book_id, location_id, age, author_id, year, publisher_id)
        embedding = model(user_id, book_id, age, author_id, year)
        rating = torch.tensor(rating).float().unsqueeze(0)
        
        optimizer.zero_grad()
        output = embedding.dot(embedding)
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_data)}')


epoch0 starts:
epoch0 train finished:


KeyboardInterrupt: 