In [83]:
import os 
import shutil
import sys
import numpy as np 
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sn 
import pandas as pd 
import torch
from torch import nn, optim
from torch.nn import functional as F 
from tqdm import tqdm
import bottleneck as bn 

In [85]:
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis=1) # top k
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0)
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

In [84]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk] * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(int(n), k)]).sum() for n in heldout_batch.sum(axis=1)])
    return DCG / IDCG

In [2]:
DATA_DIR = '/home/hieutk/Recommend/VAE_CF/ml-20m'

raw_data = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), header=0)

raw_data = raw_data[raw_data['rating'] > 3.5]

raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703


In [3]:
def get_count(tp, id) :
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [34]:
def filter_triplets(tp, min_uc=5, min_sc=0) :
    # Only keep the triplets for items which were clicked on by at least min_sc users.
    if min_sc > 0 :
        itemcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount['size'] >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc items
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0 :
        usercount = get_count(tp, 'userId') 
        tp = tp[tp['userId'].isin(usercount.index[usercount['size'] >= min_uc])]
    
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
    return tp, usercount, itemcount

In [35]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [36]:
sparsity = 1. * raw_data.shape[0] / user_activity.shape[0] / item_popularity.shape[0]

print(sparsity)

0.003498014804813546


In [44]:
unique_uid = user_activity.index

np.random.seed(42)

idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [45]:
# create train/test/val split

n_users = unique_uid.size
n_heldout_users = 10000

train_idx = n_users - n_heldout_users * 2
test_idx = train_idx + n_heldout_users

train_u = uid[: train_idx]
test_u = uid[train_idx : test_idx]
val_u = uid[test_idx:]

train_data = raw_data.loc[raw_data['userId'].isin(train_u)]

In [46]:
unique_iid = pd.unique(train_data['movieId'])

In [47]:
item2id = dict((iid, idx) for (idx, iid) in enumerate(unique_iid))
user2id = dict((uid, idx) for (idx, uid) in enumerate(unique_uid))

In [48]:
pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_iid.txt'), 'w') as f:
    for iid in unique_iid:
        f.write('%s\n' % iid)

In [54]:
def split_train_test(data, prop=0.2) :
    data_groubyuser = data.groupby('userId')
    train = []
    test = []

    for i, (_, group) in enumerate(data_groubyuser) :
        n_items_u = len(group)

        if n_items_u >= 5 :
            idx = np.zeros(n_items_u, dtype='bool')
            count = int(n_items_u * prop)
            idx[np.random.choice(n_items_u, size=count, replace=False).astype('int64')] = True

            train.append(group[np.logical_not(idx)])
            test.append(group[idx])
        else :
            train.append(group)
        
        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()
        
    data_train = pd.concat(train)
    data_test = pd.concat(test)
        
    return data_train, data_test

In [59]:
val_data = raw_data.loc[raw_data['userId'].isin(val_u)]
val_data = val_data.loc[val_data['movieId'].isin(unique_sid)]

In [60]:
val_train, val_test = split_train_test(val_data)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [61]:
test_data = raw_data.loc[raw_data['userId'].isin(test_u)]
test_data = test_data.loc[test_data['movieId'].isin(unique_sid)]

test_train, test_test = split_train_test(test_data)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [69]:
def numerize(df) :
    uid = list(map(lambda x: user2id[x], df['userId']))
    iid = list(map(lambda x: item2id[x], df['movieId']))
    return pd.DataFrame(data={'uid':uid, 'iid':iid}, columns=['uid','iid'])

In [70]:
train_data_ = numerize(train_data)
train_data_.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)

val_tr = numerize(val_train)
val_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)

val_te = numerize(val_test)
val_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)

test_tr = numerize(test_train)
test_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)

test_te = numerize(test_test)
test_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)

In [72]:
class Encoder(nn.Module) :
    def __init__(self, dropout=0.5, q_dims=[20108, 600, 200]) :
        super(Encoder, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout, in_place=False)
        self.q_dims = q_dims
        self.fc1 = nn.Linear(self.q_dims[0], self.q_dims[1], bias=True)
        self.fc2 = nn.Linear(self.q_dims[1], 2*self,q_dims[2], bias=True)
        self.tanh = nn.Tanh()

    def foward(self, x) :
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.fc2(x)

        mu, logvar = torch.chunk(x, chunks=2, dim=1)
        return mu, logvar

In [75]:
class Decoder(nn.Module) :
    def __init__(self, p_dims=[200, 600, 20108]) :
        super(Decoder, self).__init__()
        
        self.p_dims = p_dims
        self.fc1 = nn.Linear(self.p_dims[0], self.p_dims[1], bias=True)
        self.fc2 = nn.Linear(self.p_dims[1], self,p_dims[2], bias=True)
        self.tanh = nn.Tanh()

    def foward(self, x) :
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.fc2(x)
        return x

In [79]:
class MultiVAE(nn.Module) :
    def __init__(self, cuda, weight_decay=0.0, dropout=0.5, q_dims=[20108,600,200], p_dims=[200,600,20108], n_conditioned=0) :
        assert q_dims[0] == p_dims[-1] and q_dims[-1] == p_dims[0], 'dimmension of AE not match'

        self.weight_decay = weight_decay
        self.n_conditioned = n_conditioned
        self.q_dims = q_dims
        self.p_dims = p_dims
        self.q_dims[0] += self.n_conditioned
        self.p_dims[0] += self.n_conditioned

        self.enc = Encoder(dropout=dropout, q_dims=self.q_dims)
        self.dec = Decoder(p_dims=self.p_dims)
    
    def forward(self, x, c) :
        x = f.normalise(x, p=2, dim=1)
        if self.n_conditioned > 0 :
            x = torch.cat((x,c), dim=1)
        
        mu_q, logvar_q = self.enc(x)
        std_q = torch.exp(0.5 * logvar_q)
        kl = torch.mean(torch.sum(0.5 * (-logvar_q + torch.exp(logvar_q) + mu_q**2 - 1), dim=1))
        epsilon = torch.randn_like(std_q, requres_grad=False)

        if True :
            if self.training : 
                sampled_z = mu_q + epsilon * std_q
            else :
                sampled_z = mu_q
        else :
            sampled_z = mu_q + epsilon * std_q
        
        if self.n_conditioned > 0 :
            sampled_z = torch.cat((sampled_z, c), dim=1)
        logits = self.dec(sampled_z)
        
        return logits, kl, mu_q, std_q, epsilon, sampled_z
    
    def get_l2_reg(self) :
        l2_reg = torch.autograd.Variable(torch.FloatTensor(1), requires_grad=True)
        
        if self.weight_decay > 0 :
            for k, m in self.state_dict().items() :
                if k.endswith('.weight') :
                    l2_reg = l2_reg + torch.norm(m, p=2)**2
        if self.cuda :
            l2_reg = l2_reg.cuda()
        return self.weight_decay * l2_reg[0]

In [86]:
class Trainer(object) :
    def __init__(
        self, model, cuda, optimizer, train_loader, 
        test_loader, val_loader, interval_valid, 
        anneal_step=2e5, anneal_cap=0.2
    ) :
        super(Trainer, self).__init__()
        
        self.model = model
        self.cuda = cuda
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.val_loader = val_loader
        self.interval_valid = interval_valid
        self.step = 0
        self.anneal_step = anneal_step
        self.anneal_cap = anneal_cap
        self.n20_all = []
        self.n20_max_va, self.n100_max_va, self.r20_max_va, self.r50_max_va = 0, 0, 0, 0
        self.n20_max_te, self.n100_max_te, self.r20_max_te, self.r50_max_te = 0, 0, 0, 0

    #training epoch:
    def train_epoch(epoch) :
        #mark model as training
        self.model.train()
        
        for batch_id, (data_tr, data_te, prof) in tqdm(enumerate(self.train_loader)) :
            self.step += 1

            if self.cuda :
                data_tr = data_tr.cuda()
                prof = prof.cuda()
            
            data_tr = torch.autograd.Variable(data_tr)
            prof = torch.autograd.Variable(prof)

            logits, KL, mu_q, std_q, epsilon, sampled_z = self.model(data_tr, prof)
            log_softmax = f.log_softmax(logits, dim=1)
            neg_ll = torch.mean(torch.sum(log_softmax * data_tr, dim=1))
            l2_reg = self.model.get_l2_reg()

            if self.anneal_step > 0 :
                anneal = min(self.anneal_cap, 1. * self.step / self.anneal_step)
            else :
                anneal = self.anneal_cap
            
            loss = neg_ll + anneal * kl + l2_reg
            
            print(epoch, batch_idx, loss.item(), anneal, self.step, self.optimizer.param_groups[0]['lr'])
            print(neg_ll.cpu().detach().numpy(), KL.cpu().detach().numpy(), l2_reg.cpu().detach().numpy()/2)
            print('==================================')

            #backward pass
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            if self.interval_valid > 0 and (self.step + 1) % self.interval_valid == 0 :
                print("CALLING VALID train ", self.step)
                validate(epoch)
    
    def validate(epoch, cmd='valid') :
        # mark model as evaluating
        self.model.eval()
        
        if cmd == 'valid' :
            loader = self.val_loader
        else :
            loader = self.test_loader

        n20, n100, r20, r100 = [], [], [], []

        for batch_id, (data_tr, data_te, prof) in tqdm(enumerate(loader)) :
            
            if self.cuda :
                data_tr = data_tr.cuda()
                prof = prof.cuda()
            
            data_tr = torch.autograd.Variable(data_tr)
            prof = torch.autograd.Variable(prof)

            with torch.no_grad() :
                logits, KL, mu_q, std_q, epsilon, sampled_z = self.model(data_tr, prof)

                pred_val = logits.cpu().detach().numpy()
                pred_val[data_tr.cpu().detach().numpy().nonzero()] = -np.inf
                
                n20.append(utils.NDCG_binary_at_k_batch(pred_val, data_te.numpy(), k=20))
                n100.append(utils.NDCG_binary_at_k_batch(pred_val, data_te.numpy(), k=100))
                r20.append(utils.Recall_at_k_batch(pred_val, data_te.numpy(), k=20))
                r50.append(utils.Recall_at_k_batch(pred_val, data_te.numpy(), k=50))
        
        n20 = np.concatenate(n20, axis=0)
        n100 = np.concatenate(n100, axis=0)
        r20 = np.concatenate(r20, axis=0)
        r50 = np.concatenate(r50, axis=0)

        if cmd == 'valid' :
            self.n20_max_va = max(self.n20_max_va, n20.mean())
            self.n100_max_va = max(self.n100_max_va, n100.mean())
            self.r20_max_va = max(self.r20_max_va, r20.mean())
            self.r50_max_va = max(self.r50_max_va, r50.mean())
            max_metrics = "{},{},{},{:.5f},{:.5f},{:.5f},{:.5f}".format(
                cmd, epoch, self.step, 
                self.n20_max_va, self.n100_max_va, 
                self.r20_max_va, self.r50_max_va
            )
        else :
            self.n20_max_te = max(self.n20_max_te, n20.mean())
            self.n100_max_te = max(self.n100_max_te, n100.mean())
            self.r20_max_te = max(self.r20_max_te, r20.mean())
            self.r50_max_te = max(self.r50_max_te, r50.mean())
            max_metrics = "{},{},{},{:.5f},{:.5f},{:.5f},{:.5f}".format(
                cmd, epoch, self.step, 
                self.n20_max_te, self.n100_max_te, 
                self.r20_max_te, self.r50_max_te
            )

        metrics = []
        metrics.append(max_metrics)
        metrics.append("NDCG@20,{:.5f},{:.5f}".format(np.mean(n20), np.std(n20) / np.sqrt(len(n20))))
        metrics.append("NDCG@100,{:.5f},{:.5f}".format(np.mean(n100), np.std(n100) / np.sqrt(len(n100))))
        metrics.append("Recall@20,{:.5f},{:.5f}".format(np.mean(r20_list), np.std(r20) / np.sqrt(len(r20))))
        metrics.append("Recall@50,{:.5f},{:.5f}".format(np.mean(r50_list), np.std(r50) / np.sqrt(len(r50))))
        print('\n' + ",".join(metrics))

        # return model to training
        self.model.train()