In [1]:
import numpy as np
# Single global seed: seeds NumPy's legacy global RNG here and is reused further
# down as `random_state` for the train/validation splits.
seed = 42069
np.random.seed(seed)
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse, stats
from typing import Iterable, Union, Callable
from sklearn.model_selection import train_test_split, ParameterGrid
import altair as alt
from time import time, sleep
from tqdm import tqdm

# Own files
import utils 
import StanClasses

In [2]:
# Define constants
# Data location handed to utils.get_ml100k_data
# (presumably the MovieLens 100k folder, relative to the working directory — verify).
DATA_DIR = 'ml-100k'

In [3]:
df, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=50,
    subsample_top_items=20,
)

# Shift both id columns down by one
id_columns = ['user_id', 'item_id']
df[id_columns] = df[id_columns] - 1

# The timestamp is not used anywhere below, so remove the column
df = df.drop(columns='timestamp')

In [4]:
def column_relabler(df: pd.DataFrame, column: str):
    """Relabel the distinct values of ``df[column]`` as integers ``0..n-1``.

    The column is overwritten **in place** on ``df``; pass a copy if the
    original labels must be kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column to relabel (mutated by this call).
    column : str
        Name of the column whose values are replaced.

    Returns
    -------
    id2num : dict
        Maps each original value to its new integer label.
    num2id : dict
        Inverse mapping, from integer label back to the original value.
    """
    # Series.value_counts yields the same distinct values as the top-level
    # pd.value_counts helper, which is deprecated in recent pandas releases.
    uniques = df[column].value_counts(sort=False).index.values

    # Labels are assigned in the order the distinct values are reported.
    num2id = dict(enumerate(uniques))
    id2num = {id_: num_ for num_, id_ in num2id.items()}

    df[column] = df[column].map(id2num)
    return id2num, num2id

# Relabel on a copy: column_relabler overwrites the column it is given
df_num = df.copy()
user2num, num2user = column_relabler(df_num, 'user_id')
item2num, num2item = column_relabler(df_num, 'item_id')

# p, q: number of distinct users and items, i.e. the shape the rating matrix
# would have if it were stored densely
p, q = map(len, (user2num, item2num))

In [5]:
# Hold out 10% of the rows for validation; the split is seeded for repeatability
df_train, df_val = train_test_split(
    df_num,
    test_size=0.1,
    random_state=seed,
)

In [6]:
# NOTE(review): looks like a leftover debugging cell — it only prints a literal
# and defines nothing that later cells use; consider deleting it.
print('test')

test


In [7]:
# Model classes to compare
models = [
    StanClasses.NormalFactorizer,
    StanClasses.NonNegativeFactorizer,
    StanClasses.ARD_Factorizer,
]

# Values searched over per model (3 classes x 5 values = the 15 fits reported
# by the progress bar)
init_kwargs = {'n_components': list(range(1, 6))}
# Settings shared by every fit
static_kwargs = dict(chains=1, iter=1000, control=dict(max_treedepth=15))

# Time the whole search (wall clock, seconds)
t0 = time()
hist = utils.fit_and_evaluate_models(
    models=models,
    X_train=df_train,
    X_val=df_val,
    candidate_kwargs=init_kwargs,
    static_kwargs=static_kwargs,
)
evaltime = time() - t0

Fitting models:   0%|          | 0/15 [00:00<?, ?model/s]

Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


Using cached StanModel


Fitting models:  13%|█▎        | 2/15 [00:42<03:10, 14.63s/model]

Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


Using cached StanModel


Fitting models:  27%|██▋       | 4/15 [00:57<02:04, 11.32s/model]

Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


Using cached StanModel


Fitting models:  40%|████      | 6/15 [01:13<01:29,  9.99s/model]

Using cached StanModel




Using cached StanModel




Using cached StanModel




Using cached StanModel




Using cached StanModel


Fitting models: 100%|██████████| 15/15 [02:22<00:00,  9.52s/model]


In [8]:
# Tabulate the evaluation history returned by utils.fit_and_evaluate_models.
# Judging by the table rendered further down, the columns are
# model, params, fit_time, train_mae and val_mae — verify against utils.
df_hist = pd.DataFrame(hist)

In [9]:
# Presentation copy of the history: expose each model's number of components
# as column `k` and replace the model objects by their class names.
fitted_models = df_hist['model']
df_latex = df_hist.assign(
    k=[m.n_components for m in fitted_models],
    model=[type(m).__name__ for m in fitted_models],
)
df_latex

Unnamed: 0,model,params,fit_time,train_mae,val_mae,k
0,NormalFactorizer,{'n_components': 1},4.349305,0.594971,0.70754,1
1,NormalFactorizer,{'n_components': 2},42.148864,0.561117,0.73133,2
2,NonNegativeFactorizer,{'n_components': 1},4.434446,0.594616,0.709152,1
3,NormalFactorizer,{'n_components': 3},57.37672,0.524337,0.745389,3
4,NonNegativeFactorizer,{'n_components': 2},14.613461,0.562867,0.743365,2
5,NormalFactorizer,{'n_components': 4},73.408981,0.504713,0.795345,4
6,NonNegativeFactorizer,{'n_components': 3},17.827789,0.528855,0.748551,3
7,ARD_Factorizer,{'n_components': 1},6.763555,0.595341,0.707628,1
8,NonNegativeFactorizer,{'n_components': 4},24.370525,0.51155,0.781266,4
9,NormalFactorizer,{'n_components': 5},97.735129,0.48669,0.810384,5


In [17]:
# Group rows by model name and order them by number of components within each
# model. Sorting on 'model' alone uses a non-stable algorithm by default, so the
# within-model row order would not be guaranteed; the secondary key fixes it.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
df_latex.sort_values(['model', 'k'])

Unnamed: 0,model,params,fit_time,train_mae,val_mae,k
7,ARD_Factorizer,{'n_components': 1},6.763555,0.595341,0.707628,1
11,ARD_Factorizer,{'n_components': 2},34.78427,0.563698,0.724609,2
12,ARD_Factorizer,{'n_components': 3},34.504919,0.531398,0.725559,3
13,ARD_Factorizer,{'n_components': 4},35.005784,0.523318,0.735387,4
14,ARD_Factorizer,{'n_components': 5},33.575886,0.518658,0.74121,5
2,NonNegativeFactorizer,{'n_components': 1},4.434446,0.594616,0.709152,1
4,NonNegativeFactorizer,{'n_components': 2},14.613461,0.562867,0.743365,2
6,NonNegativeFactorizer,{'n_components': 3},17.827789,0.528855,0.748551,3
8,NonNegativeFactorizer,{'n_components': 4},24.370525,0.51155,0.781266,4
10,NonNegativeFactorizer,{'n_components': 5},35.234457,0.502775,0.78381,5


In [10]:
# Select the candidate with the lowest validation MAE. The history rows are not
# ordered by score, so taking row 0 would only be right by coincidence.
best_row = df_hist.loc[df_hist['val_mae'].idxmin()]
best_model = best_row['model']

# Copy the winning parameters before merging so the dict stored in df_hist is
# not modified, then add the settings shared by every fit.
best_params = dict(best_row['params'])
best_params.update(static_kwargs)

In [11]:
# Reload with larger subsample settings (60 users / 30 items) for the final fit
df_full, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=60,
    subsample_top_items=30,
)

# Shift both id columns down by one
id_columns = ['user_id', 'item_id']
df_full[id_columns] = df_full[id_columns] - 1

# Relabel ids in place; this rebinds the notebook-level lookup dictionaries
user2num, num2user = column_relabler(df_full, 'user_id')
item2num, num2item = column_relabler(df_full, 'item_id')

# The timestamp is not used, so remove the column
df_full = df_full.drop(columns='timestamp')

# Keep 5% of the rows aside for validation
df_full_train, df_full_val = train_test_split(
    df_full,
    test_size=0.05,
    random_state=seed,
)

In [12]:
# Sanity check: row/column counts of the full frame and of its two splits
for frame in (df_full, df_full_train, df_full_val):
    print(frame.shape)

(1639, 3)
(1557, 3)
(82, 3)


In [13]:
# Refit the selected model class with the selected parameters on the larger
# data set; utils.fit_and_evaluate takes its arguments as a single tuple.
fit_args = (type(best_model), best_params, df_full_train, df_full_val)
final_model_object, fit_time, train_mae, val_mae = utils.fit_and_evaluate(fit_args)

Using cached StanModel
