In [30]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse

# Own files
import utils 

# Loading data

In [31]:
# Directory passed to utils.get_ml100k_data when loading the ratings.
DATA_DIR = 'ml-100k'
# File name of the Stan program handed to utils.get_stan_code.
SM_NORMALNORMAL_NAME = 'sm_normalnormal.stan'

In [32]:
from importlib import reload
# Re-import the local helper module so edits to it take effect without
# restarting the kernel.
reload(utils)

# Only the first of the three values returned by the loader is kept.
# NOTE(review): the subsample_top_* arguments presumably keep the 200 most
# active users and the 100 most rated items -- confirm in utils.
df_users, _, _ = utils.get_ml100k_data(DATA_DIR, subsample_top_users=200, subsample_top_items=100)
# Subtract 1 from both id columns in place (presumably 1-based -> 0-based ids).
df_users[['user_id', 'item_id']] -= 1

In [33]:
# Distinct raw ids, in the order produced by an unsorted value count.
# Series.value_counts is used instead of the top-level pd.value_counts,
# which is deprecated in recent pandas releases; the resulting index is the same.
unique_users = df_users.user_id.value_counts(sort=False).index.values
unique_items = df_users.item_id.value_counts(sort=False).index.values

# Dense index (0..k-1) -> raw id.
num2user = dict(enumerate(unique_users))
num2item = dict(enumerate(unique_items))

# Raw id -> dense index (inverse of the dictionaries above).
user2num = {raw_id: idx for idx, raw_id in num2user.items()}
item2num = {raw_id: idx for idx, raw_id in num2item.items()}

# New frame with both id columns re-expressed as dense indices; df_users
# itself is left untouched because assign returns a new object.
df_num = df_users.assign(
    user_id=df_users.user_id.map(user2num),
    item_id=df_users.item_id.map(item2num),
)

In [34]:
# Dense user-by-item rating matrix: rows are dense user indices, columns are
# dense item indices, and (user, item) pairs absent from df_num become 0.
X = sparse.csc_matrix(
    (
        df_num.rating,
        (df_num.user_id, df_num.item_id),
    )
).toarray()
X.shape

(200, 100)

In [35]:
# Fraction of entries of X that are nonzero.
np.count_nonzero(X) / np.prod(X.shape)

0.6844

In [37]:
# Display the loaded ratings frame (ids already shifted by -1 in the load cell).
df_users

Unnamed: 0,user_id,item_id,rating,timestamp
8,304,450,3,886324817
12,199,221,5,876042340
22,298,143,4,877881320
23,290,117,2,874833878
24,307,0,4,887736532
...,...,...,...,...
99951,129,120,5,876250746
99965,933,215,1,891191511
99982,278,63,1,875308510
99987,659,228,2,891406212


# Fit the Stan model and run PCA

In [28]:
# Fetch the Stan program text through the project helper, using the file
# name configured in SM_NORMALNORMAL_NAME.
sm_normalnormal_code = utils.get_stan_code(SM_NORMALNORMAL_NAME)
# Obtain the model object; the recorded output "Using cached StanModel"
# indicates the helper can reuse a previously built model.
# NOTE(review): 'normalnormal' is presumably the cache/model name -- confirm in utils.
sm = utils.StanModel_cache(sm_normalnormal_code, 'normalnormal')

Using cached StanModel


In [29]:
# Fixed seed for the sampler so that re-running this cell reproduces the same
# draws (the original call passed no seed).
SAMPLING_SEED = 42

# Values handed to the Stan program.
# NOTE(review): the keys must match the data block of the Stan file, which is
# not visible here; mu_*/sigma_* are presumably prior locations/scales for the
# user and item factors, and sigma_x the observation scale -- confirm.
data = dict(
    n_components = 2,
    n = X.shape[0],   # number of rows of X
    m = X.shape[1],   # number of columns of X
    X = X,
    mu_u = 2,
    sigma_u = 10,
    mu_v = 1,
    sigma_v = 10,
    sigma_x = 10
)

# Sampler tuning: raise the tree-depth limit to 20.
control = dict(
    max_treedepth=20
)

# 4 chains in 4 parallel jobs, 1000 iterations per chain.
# NOTE(review): assumes sm is a PyStan 2 StanModel, whose sampling() accepts seed=.
fit = sm.sampling(
    data,
    chains=4,
    n_jobs=4,
    iter=1000,
    control=control,
    seed=SAMPLING_SEED,
)

In [31]:
from sklearn.decomposition import PCA

In [43]:
# First row of X, i.e. the ratings stored for dense user index 0.
X[0]

array([3, 0, 1, ..., 0, 0, 0], dtype=int64)

In [42]:
# Project X onto 4 principal components, map the scores back to the original
# space, and show the reconstruction of the first row.
# NOTE(review): the recorded output reproduces X[0] almost exactly, which is
# surprising for 4 components -- it may come from a different X; re-run to confirm.
P = PCA(n_components=4)
F = P.fit_transform(X)
P.inverse_transform(F)[0]

array([ 3.00000000e+00, -2.19824159e-14,  1.00000000e+00, ...,
        3.33066907e-16,  3.33066907e-16,  6.66133815e-16])

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

# Scratch area: sanity checks

In [None]:
# Fetch the Stan program text for 'sanity.stan' through the project helper.
sm_test_code = utils.get_stan_code('sanity.stan')
# Obtain the model object via the caching helper.
# NOTE(review): 'test' is presumably the cache/model name -- confirm in utils.
sm_test = utils.StanModel_cache(sm_test_code, 'test')

In [None]:
# Run the sanity model with algorithm="Fixed_param" and no data argument:
# 4 chains, 10 iterations each, n_jobs=-1.
# NOTE(review): no seed is passed, so draws may differ between runs.
fit_test = sm_test.sampling(algorithm="Fixed_param", chains=4, n_jobs=-1, iter=10)

In [None]:
# 2x2 scratch matrix, written one row per line.
A = np.array(
    [
        [0.674531, 0.560879],
        [-1.82799, 0.0132566],
    ]
)

In [None]:
# Matrix product of A with itself (the @ operator, not an element-wise square).
A@A