In [16]:
import numpy as np
# Seed NumPy's global random number generator so that NumPy-based randomness is repeatable
seed = 42069
np.random.seed(seed)
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse, stats
from typing import Iterable, Union, Callable
from sklearn.model_selection import train_test_split, ParameterGrid
import altair as alt
from time import time, sleep
from tqdm import tqdm

# Own files
import utils 
import StanClasses

# Load and preprocess data

The data we have is essentially a matrix, where each row corresponds to a person and each column corresponds to a movie. However, the matrix is very sparse, and thus the data is stored in a sparse format (i.e. specified by indices and their corresponding values).

In [5]:
# Define constants
# Data directory handed to utils.get_ml100k_data (presumably the MovieLens 100k files -- confirm)
DATA_DIR = 'ml-100k'

In [6]:
# Load the ratings frame; the two other return values are not needed here
df, _, _ = utils.get_ml100k_data(DATA_DIR, subsample_top_users=150, subsample_top_items=20)

# Shift both id columns down by one
df[['user_id', 'item_id']] = df[['user_id', 'item_id']] - 1

# The timestamp is never used, so leave that column out
df = df.drop(columns='timestamp')

The user ids and item (movie) ids are essentially integer ranges, running from 1 to the number of users and items respectively. When subsampling users and movies, we no longer have all of the unique ids, so the remaining ids are no longer contiguous. It becomes problematic ... TODO: Write this

In [7]:
def column_relabler(df: pd.DataFrame, column: str):
    """Relabel the values of ``df[column]`` as consecutive integers, in place.

    Every distinct (non-missing) value in the column is assigned an integer in
    ``0 .. n_uniques - 1`` and the column is overwritten with those integers.

    Note that the new labels are zero-based. Stan indexes arrays starting at 1;
    no shift to one-based indices is applied here.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column to relabel. It is modified in place.
    column : str
        Name of the column to relabel.

    Returns
    -------
    id2num : dict
        Maps each original value to its new integer label.
    num2id : dict
        Inverse mapping, from integer label back to the original value.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # sort=False means the distinct values are not ordered by frequency
    uniques = df[column].value_counts(sort=False).index.values

    num2id = dict(enumerate(uniques))
    id2num = {id_: num_ for num_, id_ in num2id.items()}

    df[column] = df[column].map(id2num)
    return id2num, num2id

# Relabel on a copy so that `df` keeps its original ids
df_num = df.copy()

# Each call overwrites its column of df_num and returns the forward and inverse id mappings
user2num, num2user = column_relabler(df_num, 'user_id')
item2num, num2item = column_relabler(df_num, 'item_id')

# p x q is the shape the rating matrix would have if it were stored densely
p = len(user2num)
q = len(item2num)

In [8]:
# Hold out 10% of the rows, then cut that hold-out in half to get validation and test sets.
# random_state is passed explicitly so that re-running this cell reproduces the same split
# regardless of how much of the global NumPy random state has been consumed beforehand.
df_train, df_valtest = train_test_split(df_num, test_size=0.1, random_state=seed)
df_val, df_test = train_test_split(df_valtest, test_size=0.5, random_state=seed)
del df_valtest

# Matrix Factorization 
We want to factorize the dense matrix $X_{n\times m} \approx U_{n\times k}V_{k\times m}$, where the subscripts denote the matrix shapes. The dimension $k$ is the user-specified embedding dimension. We use different probabilistic models for the components.


## Model 1: Simple
Ultra simple

$$ U_{ij} \sim N(\mu_u, \sigma_u) $$
$$ V_{ij} \sim N(\mu_v, \sigma_v) $$
$$ X_{ij}\sim N((UV)_{ij}, \sigma_x)$$

User defined variables:
$\mu_u, \sigma_u, \mu_v, \sigma_v, \sigma_x$

## Model 2: Non-negative Matrix Factorization

$$ U_{ij}\sim Gamma(a_u, b_u) $$
$$ V_{ij}\sim Gamma(a_v, b_v) $$
$$ X_{ij}\sim Normal((UV)_{ij}, \beta)$$
$$ \beta \sim Gamma(a_\beta, b_\beta) $$

User defined variables:
$a_u, b_u, a_v, b_v, a_\beta, b_\beta$

## Model 3: ARD

$$ U_{ij} \sim N(\mu_u, \alpha_j) $$
$$ V_{ij} \sim N(\mu_v, \alpha_j) $$
$$ X_{ij}\sim N((UV)_{ij}, \beta)$$
$$ \beta \sim Gamma(a_\beta, b_\beta) $$

$$ \alpha_{j} \sim Gamma(a_\alpha, b_\alpha) $$

User defined variables:
$\mu_u, \mu_v, a_\alpha, b_\alpha, a_\beta, b_\beta$

2K samples, 1 chain, 5 thin

X_hat:            2min 30s, 2min 35s, 2min 23s

Array of vectors: 4min 21s, 4min 19s

Matrix, no X_hat: 6min 14s, 6min 5s

In [21]:
def fit_and_evaluate(model: 'StanFactorizer', init_kwargs: dict, X_train, X_val=None):
    """Build one model, fit it on the training data and measure its error.

    Parameters
    ----------
    model : callable
        Factorizer class (or factory); it is called as ``model(**init_kwargs)``.
    init_kwargs : dict
        Keyword arguments used to construct the model.
    X_train
        Training data, passed to the model's ``fit`` and ``mae`` methods.
    X_val : optional
        Validation data, passed to the model's ``mae`` method. When ``None``
        the validation error is not computed.

    Returns
    -------
    tuple
        ``(model_object, fit_time, train_mae, val_mae)``. ``fit_time`` is the
        wall-clock duration of ``fit`` in seconds and ``val_mae`` is ``None``
        when no validation data was given.
    """
    model_object = model(**init_kwargs)

    t0 = time()
    model_object.fit(X_train)
    fit_time = time() - t0

    train_mae = model_object.mae(X_train)

    # Decide on the X_val argument itself, not on a notebook-level variable
    val_mae = model_object.mae(X_val) if X_val is not None else None

    return model_object, fit_time, train_mae, val_mae

def fit_and_evaluate_models(models: Iterable, X_train, X_val=None, candidate_kwargs: Union[dict, None]=None,
                            static_kwargs: Union[dict, None]=None, verbose=True):
    """Fit every model on every combination of candidate parameters.

    The grid is the Cartesian product of ``models`` and the value lists in
    ``candidate_kwargs``. Each grid point is fitted and scored with
    ``fit_and_evaluate``.

    Parameters
    ----------
    models : Iterable
        Model classes to try.
    X_train
        Training data forwarded to ``fit_and_evaluate``.
    X_val : optional
        Validation data forwarded to ``fit_and_evaluate``.
    candidate_kwargs : dict, optional
        Maps a constructor argument name to the list of values to try.
        Defaults to no candidate parameters.
    static_kwargs : dict, optional
        Constructor arguments applied unchanged to every grid point. On a name
        clash these take precedence over the candidate value. Defaults to none.
    verbose : bool
        Show a progress bar when true.

    Returns
    -------
    dict
        Lists of equal length under the keys ``'model'`` (fitted objects),
        ``'params'`` (the candidate parameters of the grid point, without the
        static ones), ``'fit_time'``, ``'train_mae'`` and ``'val_mae'``.
    """
    # None defaults avoid sharing a single mutable dict object between calls
    candidate_kwargs = {} if candidate_kwargs is None else candidate_kwargs
    static_kwargs = {} if static_kwargs is None else static_kwargs

    hist = {'model':[], 'params':[], 'fit_time':[], 'train_mae':[], 'val_mae':[]}

    param_gen = tqdm(ParameterGrid({'model':models, **candidate_kwargs}),
                     desc='Fitting models', disable=not verbose, unit='model', position=0)

    for paramdict in param_gen:
        # What remains after removing the model is exactly the candidate parameters
        model = paramdict.pop('model')
        hist['params'].append(paramdict)

        # Merge into a new dict so the recorded candidate parameters stay untouched
        init_kwargs = {**paramdict, **static_kwargs}

        model_object, fit_time, train_mae, val_mae = fit_and_evaluate(
            model=model,
            init_kwargs=init_kwargs,
            X_train=X_train,
            X_val=X_val,
        )

        hist['model'].append(model_object)
        hist['fit_time'].append(fit_time)
        hist['train_mae'].append(train_mae)
        hist['val_mae'].append(val_mae)

    return hist
    
# Factorizer classes to compare
models = [
    StanClasses.SimpleFactorizer,
    StanClasses.NonNegativeFactorizer,
    StanClasses.ARD_Factorizer,
]

# Candidate values: every model is tried with each number of components
init_kwargs = dict(n_components=[5, 10, 15, 20])

# Keyword arguments given unchanged to every model constructor alongside the candidate values
static_kwargs = dict(
    chains=1,
    n_jobs=1,
    iter=1000,
    thin=5,
    control=dict(max_treedepth=20),
)

hist = fit_and_evaluate_models(
    models,
    df_train,
    df_val,
    candidate_kwargs=init_kwargs,
    static_kwargs=static_kwargs,
)

Fitting models:   8%|▊         | 1/12 [01:15<13:52, 75.66s/model]

Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
Fitting models:  25%|██▌       | 3/12 [05:06<14:31, 96.82s/model]

Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
Fitting models:  42%|████▏     | 5/12 [08:20<10:35, 90.80s/model] 

Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
Fitting models:  58%|█████▊    | 7/12 [10:42<06:46, 81.32s/model]

Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
Fitting models:  75%|███████▌  | 9/12 [13:32<04:08, 82.90s/model]

Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)


Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
Fitting models: 100%|██████████| 12/12 [24:43<00:00, 123.62s/model]

Using cached StanModel





In [39]:
# Tabulate the results, replacing each fitted model object by the name of its class
df_hist = pd.DataFrame(hist).assign(
    model=lambda frame: frame['model'].map(lambda fitted: type(fitted).__name__)
)

# Store the table on disk
df_hist.to_pickle('histpickle.p')

In [None]:
from importlib import reload
# Re-execute the local modules so that edits made to them since the first import are picked up
_ = reload(utils)
_ = reload(StanClasses)

# Non-negative factorizer built with argument 10 (presumably n_components -- confirm)
sm_nmf = StanClasses.NonNegativeFactorizer(10)

# Fit on the training split and print the elapsed wall-clock seconds
t0 = time()
sm_nmf.fit(df_train, chains=1, n_jobs=1, iter=1000, thin=5, control={'max_treedepth':20})
print(time()-t0)

In [None]:
# Simple factorizer built with argument 10 (presumably n_components -- confirm)
sm_simple = StanClasses.SimpleFactorizer(10)

# Fit on the training split and print the elapsed wall-clock seconds
t0 = time()
sm_simple.fit(df_train, chains=1, n_jobs=-1, iter=1000, thin=5, control={'max_treedepth':20})
print(time()-t0)

In [None]:
# Draw the ci() output of both fitted models on one shared axes
fig, axes = plt.subplots(figsize=(7,2))
plt.grid()
# What ci() draws is defined in StanClasses (presumably credible intervals -- confirm);
# the higher zorder puts the NMF artists on top of the Simple ones
sm_simple.ci(show=True, ax=axes, label='Simple', zorder=2)
sm_nmf.ci(show=True, ax=axes, label='NMF', zorder=3)
# One x tick per integer 0..19
plt.xticks(np.arange(20))
plt.legend()

In [None]:
# Print the simple model's mae() on the training, validation and test splits, in that order
print(sm_simple.mae(df_train))
print(sm_simple.mae(df_val))
print(sm_simple.mae(df_test))

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# Test Place

In [None]:
# Grid of x values on which the densities are evaluated
xrange = np.linspace(-0.5,4,1000)

# One figure per (shape a, rate b) pair; scipy takes the scale, i.e. 1/rate
for a, b in ((2, 4*2), (1, 0.08*1)):
    y = stats.gamma.pdf(xrange, a=a, scale=1/b)
    print(a, b)
    plt.plot(xrange, y)
    plt.show()

In [None]:
# NOTE(review): bare raise with no message -- presumably a deliberate stop so that
# "Run All" does not continue into the scratch cells below; confirm
raise ValueError

In [None]:
# Read the Stan program 'sanity.stan' and obtain a model for it under the name 'test'
# (presumably compiled once and cached by utils.StanModel_cache -- confirm)
sm_test_code = utils.get_stan_code('sanity.stan')
sm_test = utils.StanModel_cache(sm_test_code, 'test')

In [None]:
# Draw from the sanity model with the "Fixed_param" algorithm: 4 chains, iter=10
fit_test = sm_test.sampling(algorithm="Fixed_param", chains=4, n_jobs=-1, iter=10)

In [None]:
# Hard-coded 2x2 matrix (presumably copied from the sanity model's output -- confirm)
A = np.array([[0.674531,0.560879],[-1.82799,0.0132566]])

In [None]:
# Matrix product of A with itself
A@A

In [None]:
X_ = 