In [1]:
import numpy as np
import pandas as pd
import fastFM as fm
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
import util

In [3]:
# Ratings from the ml-100k `ua.base` file: tab-separated with no header row,
# so the four column names are supplied explicitly.
df_train = pd.read_csv(
    '../data/ml-100k/ua.base',
    sep='\t',
    names=['uid', 'sid', 'score', 'ts'],
    header=None,
)

In [4]:
# Ratings from the ml-100k `ua.test` file, parsed with the same layout as the
# training file (tab-separated, no header row, explicit column names).
df_test = pd.read_csv(
    '../data/ml-100k/ua.test',
    sep='\t',
    names=['uid', 'sid', 'score', 'ts'],
    header=None,
)

In [5]:
# Alias (not a copy) of the training frame; `df` and `df_train` are the same object.
df = df_train

In [6]:
# Preview the first rows to confirm the four columns parsed as expected.
df.head()

Unnamed: 0,uid,sid,score,ts
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [7]:
# Line count of the training file; this is the value passed as `entry_cnt`
# when the file is loaded with util.load_dataset_mf.
!wc -l ../data/ml-100k/ua.base

   90570 ../data/ml-100k/ua.base


In [8]:
# Line count of the test file; this is the value passed as `entry_cnt`
# when the file is loaded with util.load_dataset_mf.
!wc -l ../data/ml-100k/ua.test

    9430 ../data/ml-100k/ua.test


In [9]:
import importlib

# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement. The name `imp` is kept bound so that any cell still
# calling `imp.reload(...)` keeps working.
imp = importlib

# Re-import util so edits to util.py are picked up without restarting the kernel.
util = importlib.reload(util)

# Build the design matrix and rating vector for each split.
# user_cnt / item_cnt are the dataset's user and item counts; entry_cnt is the
# number of rating lines in the file (matches the `wc -l` output).
# NOTE(review): the returned X appears to be a scipy sparse matrix (it is later
# converted with .tocsr()) and Y a numpy array of ratings -- confirm in util.py.
X_base, Y_base = util.load_dataset_mf('../data/ml-100k/ua.base', user_cnt=943, item_cnt=1682, entry_cnt=90570)
X_test, Y_test = util.load_dataset_mf('../data/ml-100k/ua.test', user_cnt=943, item_cnt=1682, entry_cnt=9430)

In [10]:
### classification

In [11]:
# Binarise the ratings: 1 when the rating is at or above the mean training
# rating, 0 otherwise.
#
# The threshold is computed once, from the training ratings only, and applied
# to both sets. Previously the test labels were cut with mean(Y_test) for the
# "< threshold" side and mean(Y_base) for the ">= threshold" side, which is
# inconsistent and lets test-set statistics define the test labels.
rating_threshold = np.mean(Y_base)
y_base = (Y_base >= rating_threshold).astype(Y_base.dtype)
y_test = (Y_test >= rating_threshold).astype(Y_test.dtype)

# Hold out part of the base data as a dev set. A fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel restarts.
X_train, X_dev, y_train, y_dev = train_test_split(X_base.tocsr(), y_base, random_state=0)

In [12]:
# check balance: fraction of positive (label == 1) examples in each split.
# Positives are counted with `== 1` rather than count_nonzero, because the
# remapping below turns the negatives into -1 (also non-zero); counting
# non-zeros made this cell print 1.0 for every split when it was run again.
print('threshold', np.mean(Y_base))
print(np.count_nonzero(y_train == 1)/len(y_train))
print(np.count_nonzero(y_dev == 1)/len(y_dev))
print(np.count_nonzero(y_test == 1)/len(y_test))

# Re-encode the negative class from 0 to -1 (fastFM classifiers expect labels
# in {-1, +1}). The assignment is in place and harmless to repeat.
y_train[y_train == 0] = -1
y_dev[y_dev == 0] = -1
y_test[y_test == 0] = -1

threshold 3.5238268
0.5500610949990431
0.5539018681270149
0.5799575821845175


In [13]:
# Sanity-check the dimensions of the training and test inputs/targets,
# one shape per line.
print(
    *(arr.shape for arr in (X_train, y_train, X_test, Y_test)),
    sep='\n',
)

(67927, 2625)
(67927,)
(9430, 2625)
(9430,)


In [14]:
from fastFM import sgd

# fastFM classifier from the `sgd` solver module.
# NOTE: this rebinds `fm`, which the import cell bound to the fastFM package.
fm = sgd.FMClassification(
    rank=5,
    n_iter=100000,
    step_size=0.01,
    init_stdev=0.1,
    l2_reg_w=0.1,
    l2_reg_V=0.1,
)
fm.fit(X_train, y_train)

# Report acc / auc on the train, dev and test splits, in that order.
util.evaluate_classification(fm, X_train, y_train)
util.evaluate_classification(fm, X_dev, y_dev)
# Last expression of the cell, so its return value is also echoed as the cell output.
util.evaluate_classification(fm, X_test, y_test)

acc: 0.6755487508648991
auc: 0.7532831520857672
acc: 0.6666519454135936
auc: 0.7388112009384533
acc: 0.6535524920466595
auc: 0.7029857161447352


array([1., 1., 1., ..., 1., 1., 1.])

In [15]:
from fastFM import als

# fastFM classifier from the `als` solver module, with the same rank,
# init_stdev and regularisation values as the SGD classifier.
# NOTE: this rebinds `fm`, replacing the previously fitted model.
fm = als.FMClassification(
    rank=5,
    n_iter=100,
    init_stdev=0.1,
    l2_reg_w=0.1,
    l2_reg_V=0.1,
)
fm.fit(X_train, y_train)

# Report acc / auc on the train, dev and test splits, in that order.
util.evaluate_classification(fm, X_train, y_train)
util.evaluate_classification(fm, X_dev, y_dev)
# Last expression of the cell, so its return value is also echoed as the cell output.
util.evaluate_classification(fm, X_test, y_test)

acc: 0.8343663050039013
auc: 0.9136794702512214
acc: 0.6932385284635428
auc: 0.7368201283446061
acc: 0.6590668080593849
auc: 0.68987378263725


array([ 1., -1.,  1., ..., -1., -1., -1.])

In [None]:
### regression

In [17]:
# Regression uses the raw ratings (Y_base / Y_test) as targets instead of the
# binarised labels. This rebinds X_train, X_dev, y_train, y_dev and y_test,
# replacing the classification versions.
#
# The matrix is converted with .tocsr() to match the classification split, and
# random_state is fixed so the split and the reported MSE values are
# reproducible across kernel restarts.
X_train, X_dev, y_train, y_dev = train_test_split(X_base.tocsr(), Y_base, random_state=0)
y_test = Y_test

In [31]:
from fastFM import sgd

# fastFM regressor from the `sgd` solver module, fitted on the raw ratings.
fm = sgd.FMRegression(
    rank=2,
    n_iter=1000000,
    step_size=0.01,
    init_stdev=0.1,
    l2_reg_w=1.2,
    l2_reg_V=1.5,
)
fm.fit(X_train, y_train)

# Each call prints the MSE for one split; the return values are kept so they
# can be compared against the true ratings in a later cell.
ytr = util.evaluate_regression(fm, X_train, y_train)
yde = util.evaluate_regression(fm, X_dev, y_dev)
yte = util.evaluate_regression(fm, X_test, Y_test)

mse: 0.9542099082863685
mse: 0.9939525598383108
mse: 1.0078336292812073


In [32]:
from fastFM import als

# fastFM regressor from the `als` solver module, with the same rank,
# init_stdev and regularisation values as the SGD regressor.
fm = als.FMRegression(
    rank=2,
    n_iter=100,
    init_stdev=0.1,
    l2_reg_w=1.2,
    l2_reg_V=1.5,
)
fm.fit(X_train, y_train)

# Each call prints the MSE for one split; the return values overwrite the SGD
# results in ytr / yde / yte and are inspected in a later cell.
ytr = util.evaluate_regression(fm, X_train, y_train)
yde = util.evaluate_regression(fm, X_dev, y_dev)
yte = util.evaluate_regression(fm, X_test, Y_test)

mse: 0.6722046720000865
mse: 0.8670755264890571
mse: 0.9064960047646371


In [None]:
# Re-import util so edits to util.py are picked up without restarting the kernel.
# importlib.reload replaces imp.reload (`imp` was removed in Python 3.12), and
# the import is local so this cell does not depend on another cell's imports.
import importlib
util = importlib.reload(util)

In [33]:
# Spot-check: the first five (value returned by evaluate_regression, true rating)
# pairs for the train, dev and test splits, one list per line.
print(
    *(
        list(zip(returned[:5], actual[:5]))
        for returned, actual in ((ytr, y_train), (yde, y_dev), (yte, y_test))
    ),
    sep='\n',
)

[(4.081599716916964, 3.0), (3.945920063344455, 5.0), (2.375715365264723, 5.0), (3.3877351550626464, 4.0), (4.094932353902069, 4.0)]
[(4.0294320745238155, 5.0), (4.0528359714475695, 4.0), (3.4942065305762795, 4.0), (1.5253912814993058, 1.0), (3.842871572846442, 4.0)]
[(4.083728730517474, 4.0), (3.7472232287873086, 4.0), (4.153650721247238, 4.0), (3.254656074497097, 3.0), (2.517268886213949, 2.0)]
