In [1]:
import numpy as np
# Seed NumPy's global RNG before anything else runs; the same `seed` value is
# reused further down as `random_state` for the train/validation splits.
seed = 42069
np.random.seed(seed)
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse, stats
from typing import Iterable, Union, Callable
from sklearn.model_selection import train_test_split, ParameterGrid
import altair as alt
from time import time, sleep
from tqdm import tqdm

# Own files
import utils 
import StanClasses

In [2]:
# Define constants
# Directory handed to utils.get_ml100k_data as the data location
# (presumably the MovieLens 100k download — confirm against utils).
DATA_DIR = 'ml-100k'

In [3]:
# Load the ratings frame; the two other return values are not needed here.
df, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=50,
    subsample_top_items=20,
)

# Shift both id columns down by one.
df[['user_id', 'item_id']] = df[['user_id', 'item_id']] - 1

# The timestamp column is not used below, so leave it out of the frame.
df = df.drop(columns='timestamp')

In [4]:
def column_relabler(df: pd.DataFrame, column: str):
    """Relabel the values of ``df[column]`` as consecutive integers starting at 0.

    The column is overwritten in place on ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column is relabelled (mutated in place).
    column : str
        Name of the column to relabel.

    Returns
    -------
    id2num : dict
        Maps each original value to its new integer label.
    num2id : dict
        Inverse mapping, from integer label back to the original value.
    """
    # Use the Series method: the top-level ``pd.value_counts`` is deprecated in
    # recent pandas releases. The index of the result holds the distinct values.
    uniques = df[column].value_counts(sort=False).index.values

    # Labels are assigned in the order the distinct values are returned above.
    id2num = {id_: num_ for num_, id_ in enumerate(uniques)}
    num2id = dict(enumerate(uniques))

    df[column] = df[column].map(id2num)
    return id2num, num2id

# Work on a copy so that `df` keeps its original ids.
df_num = df.copy()

# Relabel both id columns of df_num in place and keep the lookup tables
# for both directions.
user2num, num2user = column_relabler(df_num, 'user_id')
item2num, num2item = column_relabler(df_num, 'item_id')

# p, q: shape the rating matrix would have if it were stored densely
# (distinct users x distinct items).
p = len(user2num)
q = len(item2num)

In [5]:
# Hold out 10% of the relabelled ratings for validation; passing `seed` as
# random_state keeps the split identical between runs.
df_train, df_val = train_test_split(df_num, test_size=0.1, random_state=seed)

In [6]:
# NOTE(review): looks like a leftover smoke-test cell — it only prints a fixed
# string and nothing below depends on it; consider deleting.
print('test')

test


In [7]:
# Model classes handed to the search
models = [
    StanClasses.NormalFactorizer,
    StanClasses.NonNegativeFactorizer,
    StanClasses.ARD_Factorizer,
]

# Passed as candidate_kwargs: the n_components values 1..5
init_kwargs = {
    'n_components': list(range(1, 6)),
}

# Passed as static_kwargs; also merged into best_params further down
static_kwargs = {
    'chains': 1,
    'iter': 1000,
    'control': {'max_treedepth': 15},
}

# Wall-clock duration of the whole search, in seconds, ends up in `evaltime`
t0 = time()
hist = utils.fit_and_evaluate_models(
    models=models,
    X_train=df_train,
    X_val=df_val,
    candidate_kwargs=init_kwargs,
    static_kwargs=static_kwargs,
)
evaltime = time() - t0

Fitting models:   0%|          | 0/15 [00:00<?, ?model/s]

Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel insta

In [8]:
# Collect the search results into a frame (the tables displayed below show the
# columns model, params, fit_time, train_mae and val_mae).
df_hist = pd.DataFrame(hist)

In [9]:
# Display-friendly copy of the results: `k` holds each fitted model's
# n_components, and the model objects are replaced by their class names.
df_latex = df_hist.copy()
df_latex['k'] = df_latex['model'].map(lambda fitted: fitted.n_components)
df_latex['model'] = df_latex['model'].map(lambda fitted: type(fitted).__name__)
df_latex

Unnamed: 0,model,params,fit_time,train_mae,val_mae,k
0,NonNegativeFactorizer,{'n_components': 1},2.671167,0.594762,0.706365,1
1,NormalFactorizer,{'n_components': 1},4.589451,0.594665,0.707214,1
2,ARD_Factorizer,{'n_components': 1},15.007355,0.59452,0.707767,1
3,NonNegativeFactorizer,{'n_components': 2},15.1635,0.563302,0.740666,2
4,NonNegativeFactorizer,{'n_components': 3},20.120343,0.528878,0.745138,3
5,NonNegativeFactorizer,{'n_components': 4},21.366143,0.511991,0.780344,4
6,NonNegativeFactorizer,{'n_components': 5},25.425939,0.504013,0.784889,5
7,ARD_Factorizer,{'n_components': 4},27.780926,0.52233,0.737953,4
8,ARD_Factorizer,{'n_components': 2},27.9503,0.563505,0.723465,2
9,NormalFactorizer,{'n_components': 2},30.508262,0.560526,0.731154,2


In [10]:
# Show the rows grouped by model class name; sort_values returns a sorted copy,
# so df_latex itself keeps its original row order.
df_latex.sort_values('model')

Unnamed: 0,model,params,fit_time,train_mae,val_mae,k
2,ARD_Factorizer,{'n_components': 1},15.007355,0.59452,0.707767,1
7,ARD_Factorizer,{'n_components': 4},27.780926,0.52233,0.737953,4
8,ARD_Factorizer,{'n_components': 2},27.9503,0.563505,0.723465,2
10,ARD_Factorizer,{'n_components': 3},32.759497,0.530216,0.726246,3
11,ARD_Factorizer,{'n_components': 5},35.501988,0.516672,0.743765,5
0,NonNegativeFactorizer,{'n_components': 1},2.671167,0.594762,0.706365,1
3,NonNegativeFactorizer,{'n_components': 2},15.1635,0.563302,0.740666,2
4,NonNegativeFactorizer,{'n_components': 3},20.120343,0.528878,0.745138,3
5,NonNegativeFactorizer,{'n_components': 4},21.366143,0.511991,0.780344,4
6,NonNegativeFactorizer,{'n_components': 5},25.425939,0.504013,0.784889,5


In [11]:
# Pick the winner by validation error rather than by row position: df_hist is
# in whatever order the search returned its results, so its first row is not
# guaranteed to be the fit with the lowest val_mae. Sorting into a new frame
# leaves df_hist (already displayed above) unchanged.
ranked_hist = df_hist.sort_values('val_mae')
best_model = ranked_hist['model'].values[0]

# Copy before merging so the params dict stored in df_hist is left untouched,
# then add the static sampler settings used during the search.
best_params = ranked_hist['params'].values[0].copy()
best_params.update(static_kwargs)

In [12]:
# Reload the ratings with larger subsample settings than the search used.
df_full, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=60,
    subsample_top_items=30,
)
df_full[['user_id', 'item_id']] = df_full[['user_id', 'item_id']] - 1

# Relabel in place; this overwrites the lookup tables built for the search data.
# NOTE(review): p and q are not recomputed here — confirm nothing downstream
# relies on them matching df_full.
user2num, num2user = column_relabler(df_full, 'user_id')
item2num, num2item = column_relabler(df_full, 'item_id')

# The timestamp column is not used, so leave it out of the frame.
df_full = df_full.drop(columns='timestamp')

# Hold out 5% for validation, with the same seed as the earlier split.
df_full_train, df_full_val = train_test_split(
    df_full,
    test_size=0.05,
    random_state=seed,
)

In [13]:
# Shapes of the full frame and of its train / validation parts, one per line
print(df_full.shape, df_full_train.shape, df_full_val.shape, sep='\n')

(1639, 3)
(1557, 3)
(82, 3)


In [14]:
# Fit the class of the selected model on the larger split. The helper takes a
# single tuple: (model class, keyword arguments, training frame, validation frame).
final_model_object, fit_time, train_mae, val_mae = utils.fit_and_evaluate(
    (type(best_model), best_params, df_full_train, df_full_val)
)

Using cached StanModel


In [None]:
import numpy as np
# Seed NumPy's global RNG before anything else runs; the same `seed` value is
# reused further down as `random_state` for both train/validation splits.
seed = 42069
np.random.seed(seed)
import pandas as pd
from matplotlib import pyplot as plt
import arviz
import pystan
from scipy import sparse, stats
from typing import Iterable, Union, Callable
from sklearn.model_selection import train_test_split, ParameterGrid
import altair as alt
from time import time, sleep
from tqdm import tqdm
from multiprocessing import Pool
import pickle

# Own files
import utils 
import StanClasses

# Define constants
# Directory handed to utils.get_ml100k_data as the data location
# (presumably the MovieLens 100k download — confirm against utils).
DATA_DIR = 'ml-100k'

# Load the ratings frame; the two other return values are not needed here.
df, _, _ = utils.get_ml100k_data(
    DATA_DIR,
    subsample_top_users=25,
    subsample_top_items=25,
)

# Shift both id columns down by one.
df[['user_id', 'item_id']] = df[['user_id', 'item_id']] - 1

# The timestamp column is not used below, so leave it out of the frame.
df = df.drop(columns='timestamp')

def column_relabler(df: pd.DataFrame, column: str):
    """Relabel the values of ``df[column]`` as consecutive integers starting at 0.

    The column is overwritten in place on ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column is relabelled (mutated in place).
    column : str
        Name of the column to relabel.

    Returns
    -------
    id2num : dict
        Maps each original value to its new integer label.
    num2id : dict
        Inverse mapping, from integer label back to the original value.
    """
    # Use the Series method: the top-level ``pd.value_counts`` is deprecated in
    # recent pandas releases. The index of the result holds the distinct values.
    uniques = df[column].value_counts(sort=False).index.values

    # Labels are 0-based (0 .. n_uniques - 1). Any shift to 1-based indexing
    # for Stan has to happen elsewhere; it is not done in this function.
    id2num = {id_: num_ for num_, id_ in enumerate(uniques)}
    num2id = dict(enumerate(uniques))

    df[column] = df[column].map(id2num)
    return id2num, num2id

# Work on a copy so that `df` keeps its original ids.
df_num = df.copy()

# Relabel both id columns of df_num in place and keep the lookup tables
# for both directions.
user2num, num2user = column_relabler(df_num, 'user_id')
item2num, num2item = column_relabler(df_num, 'item_id')

# p, q: shape the rating matrix would have if it were stored densely
# (distinct users x distinct items).
p = len(user2num)
q = len(item2num)

# Hold out 10% of the relabelled ratings for validation; passing `seed` as
# random_state keeps the split identical between runs.
df_train, df_val = train_test_split(df_num, test_size=0.1, random_state=seed)

# Report the (rows, columns) shape of both parts.
print(f'''Dataframe dimensions:

    df_train: {df_train.shape}
    df_val: {df_val.shape}
    ''')

# Model classes handed to the search
models = [
    StanClasses.NormalFactorizer,
    StanClasses.NonNegativeFactorizer,
    StanClasses.ARD_Factorizer,
]

# Passed as candidate_kwargs: the n_components values 1..5
init_kwargs = {
    'n_components': list(range(1, 6)),
}

# Passed as static_kwargs; also merged into best_params further down
static_kwargs = {
    'chains': 1,
    'iter': 1200,
    'control': {'max_treedepth': 15},
}

# Wall-clock duration of the whole search, in seconds, ends up in `evaltime`
t0 = time()
hist = utils.fit_and_evaluate_models(
    models=models,
    X_train=df_train,
    X_val=df_val,
    candidate_kwargs=init_kwargs,
    static_kwargs=static_kwargs,
    ascii=True,
)
evaltime = time() - t0
print('evaltime: ', evaltime)

# Rank the search results so the lowest validation MAE comes first.
df_hist = pd.DataFrame(hist).sort_values('val_mae')
# df_hist.to_pickle('histpickle_withmodels4.pkl')

# After sorting, the first row is the winner. Copy its params before merging in
# the static sampler settings so the dict stored in df_hist stays untouched.
best_model = df_hist['model'].iloc[0]
best_params = df_hist['params'].iloc[0].copy()
best_params.update(static_kwargs)

# The final fit reuses the relabelled search data; its timestamp column was
# already dropped right after loading, so no further drop is needed here.
df_full = df_num.copy()

# final_dict4 uses 0.1 test_size, while previous ones use 0.05
# NOTE(review): same frame contents, test_size and seed as the df_train/df_val
# split above, so this validation part matches the one used for model
# selection — confirm that is intended.
df_full_train, df_full_val = train_test_split(
    df_full,
    test_size=0.1,
    random_state=seed,
)

# The helper takes a single tuple:
# (model class, keyword arguments, training frame, validation frame).
final_model_object, fit_time, train_mae, val_mae = utils.fit_and_evaluate(
    (type(best_model), best_params, df_full_train, df_full_val)
)

# Summary of the final fit, using the same keys as the search results.
hist2 = dict(
    model=final_model_object,
    params=best_params,
    fit_time=fit_time,
    train_mae=train_mae,
    val_mae=val_mae,
)

# with open('final_dict4.pkl', 'wb') as f:
#     pickle.dump(hist2, f)

Dataframe dimensions:

    df_train: (532, 3)
    df_val: (60, 3)
    


Fitting models:   0%|          | 0/15 [00:00<?, ?model/s]

Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel
Using cached StanModel


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel insta