In [6]:
from models.sansa import SANSA

sansa_config = {
       "l2": 20.0,
        "target_density": 5e-5,
        "ainv_params": {
            "umr_scans": 3,
            "umr_finetune_steps": 10,
            "umr_loss_threshold": 1e-4,
        },
        "ldlt_method": "icf"
}
     
sansa = SANSA.from_config(sansa_config)


In [7]:
# Load amazon books data

from datasets.amazonbook import Amazonbook

amazonbooks_data_config = {
    "name": "amazonbook",
    "rewrite": False,
}

amazonbooks_data = Amazonbook.from_config(amazonbooks_data_config)

2024-04-27 19:56:38,443 : [1/3] DATASET : Loading processed dataset datasets/data/amazonbook/dataset.parquet.


In [8]:
amazon_split_config = {
    "seed": 42,
    "val_target_proportion": 0.0,
}

(amazon_train, amazon_val, amazon_test), amazon_split_time = amazonbooks_data.create_splits(amazon_split_config)

2024-04-27 19:57:35,351 : [1/3] DATASET : Dataframe lengths | train_df: 2380730, val_df: 2380730, test_df: 2984108
2024-04-27 19:58:22,998 : [1/3] DATASET : Removing users [50736, 52234, 41589, 13647] from test inputs.
2024-04-27 19:58:23,631 : [1/3] DATASET : Splits information:
2024-04-27 19:58:23,632 : [1/3] DATASET : Train split info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-27 19:58:23,633 : [1/3] DATASET : Validation split info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-27 19:58:23,634 : [1/3] DATASET : Test split info | n_users = 52639, n_items = 91599, n_ratings = 2380661, sparsity = 99.95%
2024-04-27 19:58:23,635 : [1/3] DATASET : Execution of create_splits took at 99.545 seconds.


In [9]:
# Train Sansa
sansa.train(amazon_train)

2024-04-27 19:59:32,901 : [2/3] TRAINING : Train user-item matrix info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-27 19:59:32,903 : [2/3] TRAINING : Item-item matrix info | shape = (91599,91599)
2024-04-27 19:59:32,904 : [2/3] TRAINING : Training SANSA with L2=20.0, target density=0.005000%, LDL^T method=icf, approx. inverse method=umr...
2024-04-27 19:59:32,906 : [2/3] TRAINING : Loading item-user matrix...
2024-04-27 19:59:33,052 : [2/3] TRAINING : Constructing weights:
2024-04-27 19:59:42,984 : [2/3] TRAINING : Constructing A...
2024-04-27 19:59:45,343 : [2/3] TRAINING : A info | nnz: 330335853, size: 3964.4 MB
2024-04-27 19:59:59,701 : [2/3] TRAINING : Computing incomplete LL^T decomposition...
2024-04-27 20:00:30,668 : [2/3] TRAINING : L info | nnz: 419506, size: 5.400 MB, density: 0.005000%
2024-04-27 20:00:30,670 : [2/3] TRAINING : Scaling columns and creating D (LL^T -> L'DL'^T)
2024-04-27 20:00:30,684 : [2/3] TRAINING : Execution of ldlt

In [10]:
import pandas as pd

# Evaluate 
# Get all users
users = list(amazon_test.user_encoder.classes_)
# Get rated items of users
users_rated = amazon_test.get_rated_items(users)
targets = amazon_test.get_target_items(users)
target_ids_dict = (
    targets.groupby("user_id", group_keys=True)["item_id"]
    .apply(list)
    .to_dict()
)
keys = list(target_ids_dict.keys())
users_to_arange = {user: i for i, user in enumerate(keys)}
pd.options.mode.chained_assignment = None  # suppress irrelevant warning
users_rated["user_id"] = users_rated["user_id"].map(users_to_arange)
pd.options.mode.chained_assignment = "warn"
top_maxk_ids, top_maxk_scores = sansa.recommend(users_rated, 20)

2024-04-27 20:00:33,382 : [3/3] EVALUATION : Execution of _matmat took at 0.033 seconds.
2024-04-27 20:00:33,714 : [3/3] EVALUATION : Execution of _matmat took at 0.331 seconds.
2024-04-27 20:00:36,075 : [3/3] EVALUATION : Execution of _predict took at 2.726 seconds.


In [12]:
import array
import pandas as pd
import gzip

def readImageFeatures(path):
  f = open(path, 'rb')
  while True:
    asin = f.read(10)
    if asin == '': break
    a = array.array('f')
    a.fromfile(f, 4096)
    yield asin, a.tolist()

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

amazon_book_2014_data = getDF('datasets/metadata/amazonbook/meta_Books.json')

In [1]:
# Get original amazon ids so we can get metadata about book description
book_ids = []
custom_ids = []

with open("datasets/metadata/amazonbook/item_list.txt") as f:
    book_lines = f.read().splitlines()

for line in book_lines:
    entries = line.split()
    book_id = entries[:1]
    custom_id = entries[1:]
    book_ids += book_id
    custom_ids += custom_id


book_ids = book_ids[1:]

book_ids_map = {}

for i in range(len(book_ids)):
    book_ids_map[book_ids[i]] = i


FileNotFoundError: [Errno 2] No such file or directory: '/xtra/chan1846/item_list.txt'

In [32]:
# Test data has 91598 books which are all in 

amazon_book_2014_data_filtered = amazon_book_2014_data[amazon_book_2014_data['asin'].isin(book_ids)]

# Map the custom ids the authors use to asin
amazon_book_2014_data_filtered['custom_id'] = amazon_book_2014_data_filtered['asin'].map(book_ids_map)
amazon_book_2014_data_filtered.fillna('missing_description', inplace=True)
amazon_book_2014_data_filtered.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_book_2014_data_filtered['custom_id'] = amazon_book_2014_data_filtered['asin'].map(book_ids_map)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_book_2014_data_filtered.fillna('missing_description', inplace=True)


Unnamed: 0,asin,salesRank,imUrl,categories,title,description,price,related,brand,custom_id
13,0001055178,{'Books': 14149327},http://ecx.images-amazon.com/images/I/51ZSC6TK...,[[Books]],Master Georgie,Beryl Bainbridge seems drawn to disaster. Firs...,16.95,"{'also_viewed': ['0349116156', '0307947726', '...",missing_description,26186
63,000100039X,{'Books': 587803},http://ecx.images-amazon.com/images/I/81ZKLPiv...,[[Books]],The Prophet,"In a distant, timeless place, a mysterious pro...",3.99,"{'also_bought': ['1851686274', '0785830618', '...",missing_description,6379
169,0002005395,{'Books': 10681705},http://ecx.images-amazon.com/images/I/415HPT70...,[[Books]],Deafening,missing_description,9.39,"{'also_viewed': ['B00D9TM0WK', '1602861803', '...",missing_description,9371
190,0002051850,{'Books': 1100694},http://ecx.images-amazon.com/images/I/5122XJRJ...,[[Books]],For Whom the Bell Tolls,missing_description,8.99,"{'also_bought': ['0684801469', '0743297334', '...",missing_description,1180
193,0002113570,{'Books': 571745},http://ecx.images-amazon.com/images/I/51iMi0zY...,[[Books]],In the Shadow of Man,"""An instant animal classic."" --Time""Apart fr...",12.1,"{'also_bought': ['0395500818', 'B004X8W72O', '...",missing_description,27863
376,0002185385,{'Books': 1571618},http://ecx.images-amazon.com/images/I/41JB1K72...,[[Books]],Harvey Penick's Little Red Golf Book: Lessons ...,"Before titanium drivers, before oversized head...",10.99,"{'also_bought': ['0671612972', '0767903447', '...",missing_description,28930
377,000215725X,{'Books': 767068},http://ecx.images-amazon.com/images/I/51xVoZwH...,[[Books]],City of Djinns: A Year in Delhi,"Delhi has a richly layered past, and Dalrymple...",10.7,"{'also_bought': ['0307272826', '0002555107', '...",missing_description,1468
426,0002007770,{'Books': 2296729},http://ecx.images-amazon.com/images/I/511VYFJM...,[[Books]],Water For Elephants,missing_description,8.52,"{'also_bought': ['0399155341', '1573222453', '...",missing_description,6830
447,0002219417,{'Books': 4010051},http://ecx.images-amazon.com/images/I/41qvc1Xm...,[[Books]],The Winds of War,Herman Wouk&#x2019;s acclaimed novels include ...,4.99,"{'also_bought': ['0316955019', 'B0001NBNGQ', '...",missing_description,7738
480,0002226901,{'Books': 2857031},http://ecx.images-amazon.com/images/I/51cqfx8f...,[[Books]],The Partisans,'A magnificent storyteller' Sunday Mirror 'The...,6.99,"{'also_bought': ['B0059I6O44', '0002215470', '...",missing_description,34840


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english', min_df=0.0005)  # Initialize a TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(amazon_book_2014_data_filtered['description'])

feature_names = vectorizer.get_feature_names_out()  # Get the vocabulary 

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=feature_names, 
                        index=amazon_book_2014_data_filtered['custom_id'])

print(tfidf_df)

            00  000   01   02   03   04   05   09   10  100  ...  zeal  \
custom_id                                                    ...         
26186      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
6379       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
9371       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1180       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
27863      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
...        ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
91076      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
91077      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
91360      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
85676      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
89218      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   

           zealand  zen  zero  zest  

In [34]:
tfidf_df.describe()

Unnamed: 0,00,000,01,02,03,04,05,09,10,100,...,zeal,zealand,zen,zero,zest,zoe,zombie,zombies,zone,zoo
count,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,...,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0,91599.0
mean,0.000111,0.001431,0.000341,0.000168,0.000128,0.000111,0.000115,8.6e-05,0.001815,0.000885,...,8.1e-05,0.00025,0.0002,0.000234,0.000103,0.000291,0.00069,0.00038,0.000329,0.000134
std,0.005307,0.013347,0.011744,0.007684,0.006505,0.005387,0.005979,0.004534,0.014437,0.011237,...,0.003077,0.007704,0.008068,0.006103,0.004068,0.01015,0.01381,0.008645,0.007125,0.005417
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.501399,0.457315,0.929752,0.886885,0.810061,0.760539,0.68394,0.539392,1.0,0.463837,...,0.328551,0.689819,0.741508,0.559063,0.383457,0.638328,0.62755,0.468269,0.476611,0.502877


In [50]:
top_maxk_ids_list = top_maxk_ids.tolist()

In [52]:
print(top_maxk_ids_list[0])

[7540, 138, 294, 537, 22060, 428, 420, 373, 1427, 670, 12395, 76, 429, 5595, 114, 2374, 7202, 56, 7807, 374]


In [53]:
import recmetrics

amazon_book_ils = recmetrics.intra_list_similarity(top_maxk_ids_list, tfidf_df)

print(amazon_book_ils)

0.11840303495146995
