In [1]:
import numpy as np
seed = 42069
np.random.seed(seed)
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse, stats
from typing import Iterable, Union, Callable
from sklearn.model_selection import train_test_split, ParameterGrid
import altair as alt
from time import time, sleep
from tqdm import tqdm
from multiprocessing import Pool

# Own files
import utils 
import StanClasses

# Load and preprocess data

The data we have is essentially a matrix, where each row corresponds to a person and each column corresponds to a movie. However, the matrix is very sparse, and thus the data is stored in sparse format (i.e. specified with indices and the corresponding values). 

In [2]:
# Define constants
# Directory that is handed to utils.get_ml100k_data when loading the ratings
# (presumably the MovieLens 100k data set, judging by the name -- confirm)
DATA_DIR = 'ml-100k'

In [3]:
# Load the ratings; the two additional return values are not needed here.
# NOTE(review): presumably keeps only the 200 top users and 200 top items --
# confirm against utils.get_ml100k_data
df, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=200,
    subsample_top_items=200,
)

# Shift both id columns down by one
df[['user_id', 'item_id']] -= 1

# The timestamp is not used anywhere below, so remove it
df.drop(columns='timestamp', inplace=True)

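The user ids and item (movie) ids are essentially integer ranges, starting from 1 and running up to the number of users and items respectively. After subsampling users and movies, we no longer have all the unique ids, so the remaining ids do not form a contiguous range. This becomes problematic because the ids are used as row and column indices of the matrix, so below we relabel them to consecutive integers. TODO: Write this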

In [4]:
def column_relabler(df: pd.DataFrame, column: str):
    '''
    Relabels the values of a column to the consecutive integers
    0, 1, ..., n_uniques-1. The column is overwritten in place on df.

    Parameters
    ------------
    df: DataFrame holding the column to relabel, modified in place

    column: Name of the column to relabel

    Returns
    --------
    (id2num, num2id): dict from original id to new integer label, and the
                      inverse dict from integer label to original id
    '''
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    # Missing values are not counted, so they receive no label.
    uniques = df[column].value_counts(sort=False).index.values

    # Labels start at 0 (numpy-style indexing). No shift to 1-based indexing
    # is done here.
    id2num = {id_: num_ for num_, id_ in enumerate(uniques)}
    num2id = {num_: id_ for num_, id_ in enumerate(uniques)}

    df[column] = df[column].map(id2num)
    return id2num, num2id

# Relabel on a copy so that df keeps the ids it was loaded with
df_num = df.copy()
user2num, num2user = column_relabler(df_num, column='user_id')
item2num, num2item = column_relabler(df_num, column='item_id')

# p, q: number of distinct users and items, i.e. the shape the rating
# matrix would have if it were stored densely
p = len(user2num)
q = len(item2num)

In [5]:
# Hold out 10% of the ratings, then halve the held-out part into a
# validation set and a test set. Both splits reuse the global seed.
df_train, df_valtest = train_test_split(
    df_num,
    test_size=0.1,
    random_state=seed,
)
df_val, df_test = train_test_split(
    df_valtest,
    test_size=0.5,
    random_state=seed,
)

# The intermediate hold-out frame is no longer needed
del df_valtest

In [6]:
# Report the (rows, columns) shape of each of the three splits
print(f'''Dataframe dimensions:

    df_train: {df_train.shape}
    df_val: {df_val.shape}
    df_test: {df_test.shape}''')

Dataframe dimensions:

    df_train: (20699, 3)
    df_val: (1150, 3)
    df_test: (1150, 3)


# Matrix Factorization 
We want to factorize the dense matrix $X_{n\times m} \approx U_{n\times k}V_{k\times m}$, where the subscripts denote matrix shapes. The dimension $k$ is the user-specified embedding dimension. We use different probabilistic models for the components. 


## Model 1: Normal
Not as simple as the simple model above; this is analogous to the "regular" approach used in regression.

$$ U_{ij} \sim N(\mu_u, \sigma_u) $$
$$ V_{ij} \sim N(\mu_v, \sigma_v) $$
$$ X_{ij}\sim N((UV)_{ij}, \beta)$$
$$ \beta \sim Gamma(a_\beta, b_\beta) $$

User defined variables:
$\mu_u, \sigma_u, \mu_v, \sigma_v, a_\beta, b_\beta$

## Model 2: Non-negative Matrix Factorization

$$ U_{ij}\sim Gamma(a_u, b_u) $$
$$ V_{ij}\sim Gamma(a_v, b_v) $$
$$ X_{ij}\sim Normal((UV)_{ij}, \beta)$$
$$ \beta \sim Gamma(a_\beta, b_\beta) $$

User defined variables:
$a_u, b_u, a_v, b_v, a_\beta, b_\beta$

## Model 3: ARD

$$ U_{ij} \sim N(\mu_u, \alpha_j) $$
$$ V_{ij} \sim N(\mu_v, \alpha_j) $$
$$ X_{ij}\sim N((UV)_{ij}, \beta)$$
$$ \beta \sim Gamma(a_\beta, b_\beta) $$

$$ \alpha_{j} \sim Gamma(a_\alpha, b_\alpha) $$

User defined variables:
$\mu_u, \mu_v, a_\alpha, b_\alpha, a_\beta, b_\beta$

2K samples, 1 chain, 5 thin

X_hat:            2min 30s, 2min 35s, 2min 23s

Array of vectors: 4min 21s, 4min 19s

Matrix, no X_hat: 6min 14s, 6min 5s

In [None]:
# The three factorization model classes to compare
models = [
    StanClasses.NormalFactorizer,
    StanClasses.NonNegativeFactorizer,
    StanClasses.ARD_Factorizer,
]

# Candidate values tried for every model (3 models x 5 values = 15 fits)
init_kwargs = {'n_components': [5, 10, 15, 20, 25]}

# Settings shared by every fit
# NOTE(review): presumably forwarded to the Stan sampling call -- confirm
static_kwargs = {
    'chains': 1,
    'iter': 1000,
    'control': {'max_treedepth': 15},
}

# Wall-clock time of the whole fit-and-evaluate run ends up in evaltime
t0 = time()
hist = utils.fit_and_evaluate_models(
    models=models,
    X_train=df_train,
    X_val=df_val,
    candidate_kwargs=init_kwargs,
    static_kwargs=static_kwargs,
)
evaltime = time() - t0

Fitting models:   0%|          | 0/15 [00:00<?, ?model/s]

Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel


To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
To run all diagnostics call pystan.check_hmc_diagnostics(fit)
The relevant StanModel instance must be pi

In [None]:
# Collect the results returned by utils.fit_and_evaluate_models in a DataFrame
df_hist = pd.DataFrame(hist)
# df_hist['model'] = df_hist['model'].map(lambda x: type(x).__name__)

# Persist the history so the fits need not be repeated.
# NOTE: pickle files can execute arbitrary code when loaded; only read back
# files that were written by this notebook.
df_hist.to_pickle('histpickle_withmodels.p')

In [None]:
# df_hist = pd.read_pickle('histpickle_withmodels.p')

In [None]:
def _plot_ci(self, n, P, lower_bounds, upper_bounds, ax, *args,
             **kwargs):
    '''
    Plots credible intervals
    '''
    means = P.mean(axis=0)

    if ax is None:
        ax = plt.gca()

    ax.errorbar(range(n), means,
                yerr=[means-lower_bounds, upper_bounds-means],
                fmt='o', *args, **kwargs)

def ci(self, n_elements: int=20, row_inds: Iterable=None, 
       col_inds: Iterable=None, n_samples: int=1000, p=0.95, plot: bool=False, 
       ax: 'matplotlib.Axes'=None, *args, **kwargs):
    '''
    Computes credible intervals for elements of the predicted matrix, by
    default for its first n_elements elements in row-major order.

    Parameters
    ------------
    n_elements: Number of elements to calculate credible intervals for, 
                no effect if col_inds and row_inds are given.
    row_inds: Optional, which row indices in X to show CIs for

    col_inds: Optional, which column indices in X to show CIs for

    n_samples: Number of samples to sample from predictive distribution

    p: Optional, credible interval percentage in (0, 1], 0.95 by default

    plot: Optional, to plot credible intervals or not, False by default

    ax: Optional, plots on given ax, no effect if plot is False

    *args, **kwargs: Passed on to the plotting helper when plot is True

    Returns
    --------
    (lower_bounds, upper_bounds)

    Raises
    --------
    AssertionError if only one of row_inds and col_inds is given, if their
    lengths differ, or if p lies outside (0, 1]
    '''
    self.assert_fitted()
    assert 0 < p <= 1, "p must lie in the interval (0, 1]"

    # This is equivalent to Xs = np.array([U@V for U,V in zip(Us, Vs)])
    Xs = self.Us@self.Vs

    if (row_inds is None) or (col_inds is None):
        # Test both arguments explicitly: `(row_inds and col_inds) is None`
        # never fails when exactly one of them is None
        assert (row_inds is None) and (col_inds is None),\
            "Either row_inds and col_inds are both None, or both Iterables"
        # Used to extract first n_elements from predicted Xs
        row_inds, col_inds = np.unravel_index(range(n_elements), Xs.shape[1:])

    row_inds, col_inds = np.asarray(row_inds), np.asarray(col_inds)
    assert len(row_inds) == len(col_inds),\
        "Length mismatch between row_inds and col_inds"

    # Sample from predictive distribution. Indexing all three axes at once
    # gives the same (n_samples, n_elements) array as
    # Xs[picks][:, row_inds, col_inds], without first copying n_samples
    # complete matrices.
    picks = np.random.randint(0, len(Xs), n_samples)
    P = Xs[picks[:, None], row_inds, col_inds]
    P = self._likelihood_sample(P, picks)
    P.sort(axis=0)

    # Get credible intervals of samples from predictive distribution
    half_p = (1-p)/2
    lb = np.floor((half_p*n_samples)).astype(int)
    # The rounded-up index can equal n_samples (e.g. for p=1), which is one
    # past the last sorted sample, so cap it at the last valid row
    ub = min(np.ceil((p+half_p)*n_samples).astype(int), n_samples-1)

    lower_bounds, upper_bounds = P[lb], P[ub]

    if plot:
        _plot_ci(self, len(row_inds), P, lower_bounds, upper_bounds, ax, *args,
                    **kwargs)

    return lower_bounds, upper_bounds

def ci_df(self, df):
    '''
    Plots credible intervals for the (user_id, item_id) entries listed in df
    and overlays the ratings stored in df as crosses.

    Parameters
    ------------
    self: Object providing a ci method that accepts row_inds, col_inds,
          plot and ax

    df: DataFrame with the columns user_id, item_id and rating
    '''
    fig, ax = plt.subplots(figsize=(10,5))

    # Intervals are drawn at zorder 0, the crosses on top of them at zorder 1
    interval_style = {'zorder': 0, 'c': 'firebrick', 'ecolor': 'goldenrod'}
    self.ci(row_inds=df.user_id, col_inds=df.item_id, plot=True, ax=ax,
            **interval_style)

    x_positions = range(len(df))
    ax.scatter(x_positions, df.rating, marker='x', c='orangered', zorder=1)
    plt.show()

# Credible intervals of the first entry in hist['model'] against the
# training ratings, ordered by rating
ci_df(
    hist['model'][0],
    df_train.sort_values(by='rating'),
)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# Test Place

In [None]:
# Plot Gamma densities for a few (shape a, rate b) pairs. scipy's gamma is
# parameterized by scale, hence scale=1/b.
xrange = np.linspace(-0.5,4,1000)

# Each pair is (a, factor), with the rate set to b = factor*a
for a, rate_factor in [(2, 3), (2, 4), (1, 0.08)]:
    b = rate_factor*a
    y = stats.gamma.pdf(xrange, a=a, scale=1/b)
    print(a, b)
    plt.plot(xrange, y)
    plt.show()

In [None]:
# NOTE(review): presumably a deliberate stop so that running all cells does
# not continue into the scratch cells below -- confirm, and consider removing
# this cell once the scratch cells are cleaned up
raise ValueError

In [None]:
# Fetch the Stan program stored under 'sanity.stan' and obtain a model for it
# NOTE(review): StanModel_cache presumably compiles the code or loads a cached
# StanModel (cf. the "Using cached StanModel" output above) -- confirm in utils
sm_test_code = utils.get_stan_code('sanity.stan')
sm_test = utils.StanModel_cache(sm_test_code, 'test')

In [None]:
# Draw 10 iterations on each of 4 chains with the Fixed_param algorithm
# NOTE(review): presumably sanity.stan only produces generated quantities,
# which is why no parameter sampling is needed -- confirm
fit_test = sm_test.sampling(algorithm="Fixed_param", chains=4, n_jobs=-1, iter=10)

In [None]:
# Small 2x2 matrix used for the manual check in the next cell
A = np.array([
    [0.674531, 0.560879],
    [-1.82799, 0.0132566],
])

In [None]:
# Matrix product of A with itself, shown as the cell output
A@A

In [None]:
X_ = 