In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.nn as nn
from torch.autograd import Variable

from netrex.netrex import FactorizationModel, SequenceModel, generate_sequences
from netrex.evaluation import auc_score, mrr_score

from netrex.utils import _cpu, _gpu, _minibatch  # deal with it
from netrex.netrex2 import ShitNet
import itertools
from functools import reduce
from operator import add, mul

from typing import List, Dict

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

embedding_dim = 10
minibatch_size = 2048
n_iter = 10
cuda = False

PATH_INTERACTIONS = '../data/snap_amazon/video_games/ratings_unary.msg'
PATH_USER_FEATS = '../data/snap_amazon/video_games/user_feats.msg'
PATH_ITEM_FEATS = '../data/snap_amazon/video_games/item_feats.msg'
PATH_ITEM_IMG_EMBS = '../data/snap_amazon/video_games/img_embs.msg'

user_col = 'user_id'
item_col = 'asin'
ts_col = 'timestamp'

context_cols = ['month', 'dayofweek']

SEED = 322

split_date = '2014-01-01'

# Load and some more Preproc

In [2]:
xns_df = pd.read_msgpack(PATH_INTERACTIONS).sample(100000, random_state=SEED)
user_feats_df = pd.read_msgpack(PATH_USER_FEATS)
item_feats_df = pd.read_msgpack(PATH_ITEM_FEATS)
item_imgs_df = pd.read_msgpack(PATH_ITEM_IMG_EMBS)

In [3]:
# Dummy context features
xns_df['month'] = xns_df['timestamp'].dt.month
xns_df['dayofweek'] = xns_df['timestamp'].dt.dayofweek

In [4]:
# Train-Val split
# in reality, user_feats_df and item_feats_df should have new categories removed too
train_df = xns_df.loc[xns_df[ts_col] < split_date].copy()
val_df = xns_df.loc[xns_df[ts_col] >= split_date].copy()
print(train_df.shape)
print(val_df.shape)

(81513, 5)
(18487, 5)


In [5]:
# Categorical dtype
for col in [user_col, item_col] + context_cols:
    train_df[col] = train_df[col].astype('category')
    val_df[col] = val_df[col].astype('category', categories=train_df[col].cat.categories)
    
cat_d = {
    col: pd.CategoricalIndex(train_df[col].cat.categories, name=col)
    for col in [user_col, item_col] + context_cols
}

In [6]:
(val_df[user_col].notnull() & val_df[item_col].notnull()).value_counts()

False    17826
True       661
dtype: int64

In [7]:
# Add User and Item Feats to catalog
for col in user_feats_df.columns:
    cat_d[col] = pd.CategoricalIndex(user_feats_df[col].cat.categories, name=col)
for col in item_feats_df.columns:
    cat_d[col] = pd.CategoricalIndex(item_feats_df[col].cat.categories, name=col)

# Book-keeping methods

In [78]:
def lookup(feature_fields_ids: Dict[str, Variable],
           feature_maps: Dict[str, nn.Embedding],
           col_dep: str) -> Dict[str, Variable]:
    """
    Lookup values with prerequisites in `feature_fields_ids`
    These results will eventually find their way in `feature_fields_ids`
    """
    return {
        feat_name: feat_map(feature_fields_ids[col_dep]).float().squeeze() if 'img' in feat_name else 
        feat_map(feature_fields_ids[col_dep]).long().squeeze()
        for feat_name, feat_map in feature_maps.items()
    }
        
    
def lookups_via_features(feats_df: pd.DataFrame, index_cats) -> Dict[str, nn.Embedding]:
    """
    feats_df: user or item features
        index should be user or item id such that the ordering matches user or item codes
        each column should be of dtype category
    # n_indices: number of users or number of items
    index_cats: user or item index catgories (cat_d[user_col].categories) 
    feature_maps: to be used to convert a user or item id into the corresponding feature code
    """
    n_indices = len(index_cats)
    feature_maps = {}
    for col in feats_df.columns:
        emb = nn.Embedding(n_indices, embedding_dim=1, sparse=False)
        emb.weight = nn.Parameter(torch.from_numpy(
            feats_df.loc[index_cats][col].cat.codes.values[:, None].astype('float32')))
        emb.weight.requires_grad = False
        feature_maps[col] = emb
    return feature_maps


user_feature_maps = lookups_via_features(user_feats_df, cat_d[user_col])
item_feature_maps = lookups_via_features(item_feats_df, cat_d[item_col])

# Img Embs

In [79]:
IMG_EMB_DIMS = item_imgs_df.shape[1]
n_indices = len(cat_d[item_col].categories)

img_in = nn.Embedding(n_indices, embedding_dim=IMG_EMB_DIMS, sparse=False)
img_in.weight = nn.Parameter(torch.from_numpy(
    item_imgs_df.loc[cat_d[item_col].categories].fillna(0).values.astype('float32')))
img_in.weight.requires_grad = False

# img_to_emb = nn.Linear(item_imgs_df.shape[1], embedding_dim)

In [80]:
item_feature_maps['img'] = img_in

In [59]:
item_feature_maps['img'](Variable(torch.from_numpy(np.array([0,1]))))

Variable containing:
  2.2406   0.0000   0.0000  ...    0.0000   0.6221   0.0000
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   0.0000
[torch.FloatTensor of size 2x4096]

# 

In [81]:
def get_fields_ids(user_ids: Variable,
                   item_ids: Variable,
                   contexts: Variable) -> Dict[str, Variable]:
    feature_fields_ids = {
        user_col: user_ids,
        item_col: item_ids,
    }
    feature_fields_ids.update(lookup(feature_fields_ids, user_feature_maps, user_col))
    feature_fields_ids.update(lookup(feature_fields_ids, item_feature_maps, item_col))
    feature_fields_ids.update({col: contexts[:, ii] for ii, col in enumerate(context_cols)})
    # Potentially more complex feature dependency graphs
    
    return feature_fields_ids

# Model

In [93]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from netrex.layers import ScaledEmbedding, ZeroEmbedding
from netrex.utils import _cpu, _gpu, _minibatch  # deal with it
from typing import Dict, List, Tuple
import pandas as pd
import itertools
from functools import reduce
from operator import add, mul
from time import time
CatsDict = Dict[str, pd.CategoricalIndex]

class FNet(nn.Module):
    
    def __init__(self, cats_d: CatsDict, embedding_dim: int, sparse=False):
        super().__init__()

        self.embedding_dim = embedding_dim

        # TODO: I think the book-keeping can be done within
        # just using named modules
        self.embeddings_d = {
            field: ScaledEmbedding(len(cat_index.categories), embedding_dim,
                                   sparse=sparse)
            for field, cat_index in cats_d.items()
        }
        self.biases_d = {
            field: ZeroEmbedding(len(cat_index.categories), 1,
                                 sparse=sparse)
            for field, cat_index in cats_d.items()
        }

        for field, module in self.embeddings_d.items():
            self.add_module('embedding_{}'.format(field), module)
        for field, module in self.biases_d.items():
            self.add_module('bias_{}'.format(field), module)
            
        self.add_module('img_to_emb', nn.Linear(IMG_EMB_DIMS, embedding_dim))
        self.add_module('img_to_bias', nn.Linear(IMG_EMB_DIMS, 1))

    def forward(self, user_ids, item_ids, contexts):

        feature_fields_ids = get_fields_ids(user_ids, item_ids, contexts)
        
        # TODO: case with img should be generalized
        
        # Iter of batch_size X embedding_dim tensors
        embeddings = [self.embeddings_d[fields](ids)
                      for fields, ids in feature_fields_ids.items() if fields != 'img']
        biases = [self.biases_d[fields](ids)
                  for fields, ids in feature_fields_ids.items()  if fields != 'img']
        
        embeddings.append(self.img_to_emb(feature_fields_ids['img']))
        biases.append(self.img_to_bias(feature_fields_ids['img']))
        

        contrib_dot = reduce(add, (
            mul(*pair).sum(1)
            for pair in itertools.combinations(embeddings, 2))
                             )
        contrib_bias = reduce(add, biases)

        return contrib_dot + contrib_bias

In [94]:
class FModel(object):

    def __init__(self,
                 col_cat_index_d,
                 loss='bpr',
                 embedding_dim=64,
                 n_iter=1,
                 batch_size=1024,
                 l2=0.0,
                 use_cuda=False,
                 sparse=False):

        self.col_cat_index_d = col_cat_index_d
        
        self._loss = loss
        self._embedding_dim = embedding_dim
        self._n_iter = n_iter
        self._batch_size = batch_size
        self._l2 = l2
        self._use_cuda = use_cuda
        self._sparse = sparse

        self._num_users = None
        self._num_items = None
#         self._net = None
        self._net = _gpu(
            FNet(self.col_cat_index_d, self._embedding_dim),
            self._use_cuda
        )


    def fit(self, interactions_df, verbose=False):
        self._num_users, self._num_items = (len(interactions_df[col].cat.categories)
                                            for col in [user_col, item_col])
        
        interactions = sp.csr_matrix(
            (np.ones(len(interactions_df)),
             (interactions_df[user_col].cat.codes,
              interactions_df[item_col].cat.codes)),
            shape=(self._num_users, self._num_items), dtype=np.float32)
        
        

        if self._sparse:
            optimizer = optim.Adagrad(self._net.parameters(),
                                      weight_decay=self._l2)
        else:
            optimizer = optim.Adam(self._net.parameters(),
                                   weight_decay=self._l2)

#         loss_fnc = self._bpr_loss
        
        for epoch_num in range(self._n_iter):
            # todo: shuffle in a more efficient way?
            interactions_df = interactions_df.sample(frac=1)
            # Storing in tensors
            user_ids_tensor = _gpu(torch.from_numpy(interactions_df[user_col].cat.codes.values.astype('int64')),
                                   cuda)
            item_ids_tensor = _gpu(torch.from_numpy(interactions_df[item_col].cat.codes.values.astype('int64')),
                                   cuda)
            context_codes_df = pd.concat([interactions_df[col].cat.codes for col in context_cols], axis=1)
            context_codes_df.columns = context_cols

            context_tensor = _gpu(torch.from_numpy(context_codes_df.values.astype('int64')),
                                  cuda)

            epoch_loss = 0.0
            ii = 0
            tic = time()
            for (batch_user, batch_item, batch_context) in zip(
                _minibatch(user_ids_tensor, self._batch_size),
                _minibatch(item_ids_tensor, self._batch_size),
                _minibatch(context_tensor, self._batch_size)):

                user_var = Variable(batch_user)
                pos_item_var = Variable(batch_item)
                context_var = Variable(batch_context)

                pos_score = self._net(user_var, pos_item_var, context_var)
                
                # TODO: there is some repeat work in `neg_feature_fields_ids`
                # that was already done in `pos_feature_fields_ids`
                neg_item_var = Variable(_gpu(
                    torch.from_numpy(np.random.randint(0,
                                                       self._num_items,
                                                       len(pos_item_var))),
                    self._use_cuda))
                neg_score = self._net(user_var, neg_item_var, context_var)

                optimizer.zero_grad()

                loss = (1.0 - F.sigmoid(pos_score -neg_score)).mean()
                epoch_loss += loss.data[0]

                loss.backward()
                optimizer.step()
                
                ii += 1
                if verbose and (ii%50)==0:
                    print('{}: {} \t {}'.format(ii, time()-tic, loss.mean().data[0]))

            if verbose:
                print('Epoch {}: loss {}'.format(epoch_num, epoch_loss))


In [99]:
model = FModel(cat_d, embedding_dim=10, batch_size=256)

In [100]:
val_in_train_df = val_df.loc[val_df[user_col].notnull() & val_df[item_col].notnull()]

In [90]:
# # Fit 1 epoch
# model.fit(train_df)

In [97]:
# How can we eval?
# eval will be slow because we cannot groupby user
# since we are considering context... oh well
# Let's use a ghetto mean avg precision

def score_model(model, val_in_train_df, k=100):

    user_ids_tensor = _gpu(torch.from_numpy(val_in_train_df[user_col].cat.codes.values.astype('int64')),
                           cuda)
    item_ids_tensor = _gpu(torch.from_numpy(val_in_train_df[item_col].cat.codes.values.astype('int64')),
                           cuda)
    context_codes_df = pd.concat([val_in_train_df[col].cat.codes for col in context_cols], axis=1)
    context_codes_df.columns = context_cols

    context_tensor = _gpu(torch.from_numpy(context_codes_df.values.astype('int64')),
                          cuda)

    items_all = Variable(
        _gpu(torch.arange(0, model._num_items).long(),
             model._use_cuda)
    )

    correct_in_k = []

    for (batch_user, batch_item, batch_context) in zip(
        _minibatch(user_ids_tensor, 1),
        _minibatch(item_ids_tensor, 1),  # Doesn't really need to be tensorized
        _minibatch(context_tensor, 1)):


        user_rep = Variable(batch_user.repeat(model._num_items))
        context_rep = Variable(batch_context.repeat(model._num_items, 1))
        preds = model._net(
            user_rep,
            items_all,
            context_rep
        )

        correct_in_k.append(batch_item.numpy() in np.argpartition(-preds.data.numpy(), kth=k, axis=0)[:k])
        gmap = np.mean(correct_in_k)
    return gmap

In [101]:
%%time
model = FModel(cat_d, embedding_dim=10, batch_size=1024)
scores = []
for ep in range(30):
    model.fit(train_df, verbose=True)
    if ep in {2, 10, 20, 30}:
        score = score_model(model, val_in_train_df, k=10)
        print(ep, score)
        scores.append(score)

50: 5.2072227001190186 	 0.33691588044166565
Epoch 0: loss 28.170246958732605
50: 5.7917163372039795 	 0.27147749066352844
Epoch 0: loss 23.173319905996323
50: 5.1279871463775635 	 0.25476980209350586
Epoch 0: loss 20.881321147084236
2 0.0257186081694
50: 5.141080379486084 	 0.2327200472354889
Epoch 0: loss 19.22147636115551
50: 5.16515588760376 	 0.23553845286369324
Epoch 0: loss 18.07867057621479
50: 5.158850193023682 	 0.20227046310901642
Epoch 0: loss 17.20361191034317
50: 5.168084383010864 	 0.2085639238357544
Epoch 0: loss 16.431586518883705
50: 5.087740182876587 	 0.2276940494775772
Epoch 0: loss 16.072343230247498
50: 5.078282833099365 	 0.1743677258491516
Epoch 0: loss 15.582215443253517
50: 5.12999963760376 	 0.17785198986530304
Epoch 0: loss 15.104512050747871
50: 5.25907301902771 	 0.1822538673877716
Epoch 0: loss 14.693519160151482
10 0.0302571860817
50: 5.393547534942627 	 0.16423487663269043
Epoch 0: loss 14.53006187081337
50: 5.259625434875488 	 0.18090477585792542
Epoc

In [None]:
plt.plot(scores)