In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, coo_matrix, identity, save_npz
from lightfm import LightFM
import joblib
from custom_precision_at_k import custom_precision_at_k
from pathlib import Path

In [11]:
# Load the two raw CSV files: book metadata and the per-user reviews.
books_data_path = Path('datasets/amazon/books_data.csv')
books_rating_path = Path('datasets/amazon/books_rating.csv')

data_df = pd.read_csv(books_data_path)
ratings_df = pd.read_csv(books_rating_path)

In [12]:
# Keep only the books that have a title, as an independent copy of those rows.
has_title = data_df['Title'].notna()
p_data_df = data_df.loc[has_title].copy()

In [14]:
# Build the (user, title, score) interaction table from the raw reviews:
# a review is usable only when both its title and its user id are present.
review_columns = ['User_id', 'Title', 'review/score']
has_title_and_user = ratings_df['Title'].notna() & ratings_df['User_id'].notna()
y = ratings_df.loc[has_title_and_user, review_columns]

# Discard reviews whose title does not occur in the book metadata.
title_is_known = y['Title'].isin(p_data_df['Title'])
y = y[title_is_known]

# The same (user, title) review can be repeated many times (up to 32 copies
# in this dataset); keep only the first occurrence of each pair.
y = y.drop_duplicates(subset=['User_id', 'Title'])

In [15]:
# Dropping review rows can leave some books in p_data_df without any review,
# so restrict the book metadata to titles that still occur in y.
title_has_reviews = p_data_df['Title'].isin(y['Title'])
# Tuple holding one array of row positions (same value np.where(mask) returns).
indices = title_has_reviews.to_numpy().nonzero()
p1_data_df = p_data_df.iloc[indices[0]]

In [16]:
# Encode raw user ids and titles as integer codes: users index the rows and
# titles index the columns of the rating matrix.
user_id_encoder = LabelEncoder()
title_encoder = LabelEncoder()
row_indices = user_id_encoder.fit_transform(y['User_id'])
col_indices = title_encoder.fit_transform(y['Title'])

# Sparse user x title matrix of review scores, saved to disk for reuse.
review_scores = y['review/score']
y_csr = csr_matrix((review_scores, (row_indices, col_indices)))
reviews_csr_path = 'datasets/amazon/reviews_csr.npz'
save_npz(reviews_csr_path, y_csr)

In [17]:
# Reorder the book metadata by encoded title so that its row order follows
# the column order produced by title_encoder.
encoded_titles = title_encoder.transform(p1_data_df['Title'])
sorted_indices = np.argsort(encoded_titles)
p2_data_df = p1_data_df.iloc[sorted_indices]

# Sanity check: the reordered titles must match the encoder's classes
# position by position.
titles_match = p2_data_df['Title'] == title_encoder.classes_
assert titles_match.all()

reordered_books_path = "datasets/amazon/books_data_reordered.csv"
p2_data_df.to_csv(reordered_books_path, index=False)

In [8]:
def _binary_interactions(rows, cols):
    """Return an integer COO matrix shaped like y_csr with a stored 1 for every (row, col) pair."""
    ones = np.ones(len(rows))
    return coo_matrix((ones, (rows, cols)), shape=y_csr.shape, dtype=int)


# Treat every review scored 4 or higher as a positive interaction;
# r / c are the row and column positions of those entries in y_csr.
r, c = (y_csr >= 4).nonzero()

# Hold out 1,000 positives for the test set, then 1,000 of the remaining
# positives for validation; everything left is used for training.
r_train, r_test, c_train, c_test = train_test_split(
    r, c, test_size=1_000, random_state=42
)
r_train, r_val, c_train, c_val = train_test_split(
    r_train, c_train, test_size=1_000, random_state=42
)

y_train = _binary_interactions(r_train, c_train)
y_val = _binary_interactions(r_val, c_val)
y_test = _binary_interactions(r_test, c_test)

In [None]:
# Identity user-feature matrix: one row and one column per user row of y_train.
# NOTE(review): it is not passed to model.fit below; it is only handed to
# custom_precision_at_k.
n_users = y_train.shape[0]
user_features = identity(n_users, format='csr')

# Fit a WARP-loss LightFM model on the interactions alone (no item features)
# for a single epoch, then persist it.
model = LightFM(
    no_components=100,
    learning_rate=0.05,
    loss='warp',
    random_state=42,
)
model.fit(y_train, epochs=1, num_threads=120, verbose=True)
joblib.dump(model, 'models/amazon-book-reviews-no-item-features-model.pkl')
print("Precision without item features")

# Train precision is computed on a seeded random sample of i user rows
# rather than on every user.
i = 1000
np.random.seed(42)
perm = np.random.permutation(n_users)[:i]
user_features_perm = user_features[perm]
# COO matrices cannot be row-indexed, so convert to CSR before sampling rows.
y_train_perm = y_train.tocsr()[perm]

train_precision = custom_precision_at_k(
    model,
    y_train_perm,
    user_features=user_features_perm,
    num_threads=1000,
)
print(f'Train precision : {train_precision.mean() * 100}%')

# NOTE(review): y_train is passed as the third positional argument; presumably
# these are the training interactions to exclude when scoring validation —
# confirm against custom_precision_at_k.
val_precision = custom_precision_at_k(
    model,
    y_val,
    y_train,
    user_features=user_features,
    num_threads=1000,
)
print(f'Val precision : {val_precision.mean() * 100}%')