In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    roc_auc_score,
    log_loss,
)

In [3]:
from spotlight.datasets.movielens import get_movielens_dataset

# Which MovieLens release to fetch through Spotlight's bundled loader.
MOVIELENS_VARIANT = '100K'

dataset = get_movielens_dataset(variant=MOVIELENS_VARIANT)

# Printing the object gives a one-line summary (users x items x interactions).
print(dataset)

<Interactions dataset (944 users x 1683 items x 100000 interactions)>


In [4]:
# Peek at the `user_ids` array exposed by the loaded dataset.
dataset.user_ids

array([196, 186,  22, ..., 276,  13,  12])

In [5]:
# Peek at the `item_ids` array exposed by the loaded dataset.
dataset.item_ids

array([ 242,  302,  377, ..., 1090,  225,  203])

В этом методе используется встроенный способ загрузки данных; все дальнейшие функции завязаны на него, и изменить это мне не удалось. Затем я загрузила свои данные, которые использую в CTR и автоэнкодере, — они совпадают со встроенными.

In [6]:
import os
from pathlib import Path

# Directory holding the MovieLens 100K files. Set the ML100K_DIR environment
# variable to point at your own copy; the fallback is the original author's
# local path, so existing runs on that machine keep working unchanged.
ML100K_DIR = Path(os.environ.get("ML100K_DIR", "C:\\Users\\Lina\\ml-100k\\ml-100k"))

# u.data is read as tab-separated with no header row, so the column names are
# supplied explicitly here.
df = pd.read_csv(ML100K_DIR / "u.data", sep="\t", names=['user_ids', 'item_ids', 'rating','timestamp'])

# Show only the first rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,user_ids,item_ids,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


We can feed our dataset to the ExplicitFactorizationModel class - an sklearn-like object that allows us to train and evaluate explicit factorization models.

Internally, the model uses the BilinearNet class to represent users and items. It's composed of four embedding layers:

* a (num_users x latent_dim) embedding layer to represent users,
* a (num_items x latent_dim) embedding layer to represent items,
* a (num_users x 1) embedding layer to represent user biases, and
* a (num_items x 1) embedding layer to represent item biases.

Together, these give us the predictions. Their accuracy is evaluated using one of the Spotlight losses. In this case, we'll use the regression loss, which is simply the squared difference between the true and the predicted rating.

In [7]:
import torch

from spotlight.factorization.explicit import ExplicitFactorizationModel

# Explicit-feedback factorization model trained with the squared-error
# ('regression') loss. A fixed random_state is passed so that repeated runs
# start from the same state (per the Spotlight docs it is the model's random
# number source); the seed matches the one used for the train/test split.
model = ExplicitFactorizationModel(loss='regression',
                                   embedding_dim=128,  # latent dimensionality
                                   n_iter=10,  # number of epochs of training
                                   batch_size=1024,  # minibatch size
                                   l2=1e-9,  # strength of L2 regularization
                                   learning_rate=1e-3,
                                   use_cuda=torch.cuda.is_available(),  # GPU only when one is present
                                   random_state=np.random.RandomState(42))

In [8]:
from spotlight.cross_validation import random_train_test_split
from sklearn.model_selection import train_test_split

# Seeded generator so the same interactions land in train/test on every run.
split_rng = np.random.RandomState(42)
train, test = random_train_test_split(dataset, random_state=split_rng)
# The Spotlight splitter is used on `dataset` (rather than sklearn's
# train_test_split on the raw DataFrame) so both parts stay in the form the
# model is fitted and evaluated on.

# Report the size of each part.
split_report = 'Split into \n {} and \n {}.'.format(train, test)
print(split_report)

Split into 
 <Interactions dataset (944 users x 1683 items x 85000 interactions)> and 
 <Interactions dataset (944 users x 1683 items x 15000 interactions)>.


In [9]:
# Train on the training split, printing the loss after each epoch.
# NOTE(review): Spotlight's fit() does not appear to return a training
# history, so `history` is presumably None - confirm before relying on it.
history = model.fit(interactions=train, verbose=True)

Epoch 0: loss 12.990236259642101
Epoch 1: loss 6.567769289016724
Epoch 2: loss 1.5581430537360055
Epoch 3: loss 1.0407726211207253
Epoch 4: loss 0.934924916142509
Epoch 5: loss 0.8922242465473357
Epoch 6: loss 0.8821919496570315
Epoch 7: loss 0.8685591639507384
Epoch 8: loss 0.8514696118377504
Epoch 9: loss 0.8441359652649789


In [10]:
from spotlight.evaluation import rmse_score

# Score the fitted model on both parts of the split, train first, then test.
train_rmse, test_rmse = (rmse_score(model, split) for split in (train, test))

# Report both values rounded to three decimals.
rmse_summary = 'Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse)
print(rmse_summary)

Train RMSE 0.907, test RMSE 0.944


In [11]:
 predictions = model.predict(test.user_ids, test.item_ids)

In [12]:
# Mean squared error of the predictions on the test split.
mean_squared_error(y_true=test.ratings, y_pred=predictions)

0.89154094

In [13]:
# Mean absolute error of the predictions on the test split.
mean_absolute_error(y_true=test.ratings, y_pred=predictions)

0.74426913

In [14]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=test.ratings, y_pred=predictions)

0.30168220183221695

In [15]:
# Explained-variance score of the predictions on the test split.
explained_variance_score(y_true=test.ratings, y_pred=predictions)

0.30174070596694946