In [1]:
import numpy as np
import torch
from matplotlib import pyplot as plt
import random

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Applies `seed` to Python's `random`, NumPy's global generator, and
    PyTorch's CPU and CUDA generators, in that order. Returns None.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(0)

In [3]:
# Experiment settings; each one is interpolated into the file paths below.
data_name = 'ciao'
# NOTE(review): `type` shadows the Python builtin. The name is kept because a
# later cell interpolates `{type}` into its output path.
type = 'mf'
emb_dim = 20
core = 10

# Load the user and item embedding matrices from their text files.
user_emb_mse_0, item_emb_mse_0 = (
    np.loadtxt(emb_path)
    for emb_path in (
        f'../para/{data_name}{core}{emb_dim}_{type}_user_emb.txt',
        f'../para/{data_name}{core}{emb_dim}_{type}_item_emb.txt',
    )
)

# Show each matrix together with its shape.
for label, emb in (('user_emb_mse_0', user_emb_mse_0),
                   ('item_emb_mse_0', item_emb_mse_0)):
    print(label, emb, emb.shape)


user_emb_mse_0 [[-0.30584052  0.31445277 -0.32085735 ...  0.38600147 -0.33172542
   0.36452895]
 [-0.37772697  0.43621007 -0.43697435 ...  0.44851348 -0.43678746
   0.43986902]
 [-0.53812033  0.52454209 -0.40070242 ...  0.44782859 -0.52930433
   0.49370912]
 ...
 [-0.35335532  0.45030597 -0.46539438 ...  0.39491549 -0.48133031
   0.45048186]
 [-0.56421506  0.42175665 -0.51859009 ...  0.58428836 -0.50152475
   0.54699063]
 [-0.52599502  0.55503333 -0.54048157 ...  0.55491346 -0.53933436
   0.5073179 ]] (12168, 20)
item_emb_mse_0 [[-0.44163632  0.36362165 -0.44438291 ...  0.44159007 -0.40201563
   0.41600874]
 [-0.51702827  0.55144864 -0.5751915  ...  0.58056724 -0.54933715
   0.57743794]
 [-0.26735637  0.31256163 -0.3312723  ...  0.28288558 -0.28170329
   0.32355788]
 ...
 [-0.25085846  0.29776239 -0.37920755 ...  0.29199412 -0.36623648
   0.32842368]
 [-0.54501808  0.49262151 -0.54724216 ...  0.57418019 -0.53476775
   0.46674618]
 [-0.47938412  0.51870382 -0.38730419 ...  0.56291556 -0

In [4]:
# Score matrix: entry [u, i] is the inner product of user row u and item row i.
rating = user_emb_mse_0 @ item_emb_mse_0.T

In [5]:
# Inspect the full score matrix, then the first row with its extremes.
print('rating', rating, rating.shape)
print('rating[0]', rating[0], np.max(rating[0]), np.min(rating[0]))

rating [[2.79188659 3.29754031 2.08250198 ... 1.59742743 3.17036708 3.44866888]
 [3.39963881 4.21646843 2.60880149 ... 2.28619839 4.11045544 4.26778243]
 [3.57152193 4.61355276 2.67696754 ... 2.45308259 4.28985024 4.35509722]
 ...
 [3.3405204  3.9298065  2.43234129 ... 2.10967855 4.11296801 3.98963189]
 [3.58720989 4.82405809 2.68270253 ... 2.92789754 4.86170722 4.33635009]
 [3.17551256 4.70217252 2.59247191 ... 3.28038786 4.52866465 3.98594138]] (12168, 11283)
rating[0] [2.79188659 3.29754031 2.08250198 ... 1.59742743 3.17036708 3.44866888] 3.8196557264455375 0.6819343443850602


In [6]:
# Mean over all entries of the score matrix.
np.mean(rating)

3.7430201281726387

In [7]:
# Load the processed rating file as integers. Later cells use columns 0 and 1
# as matrix indices and column 2 as the rating value.
rating_data = np.loadtxt(
    f'../../datasets/{data_name}/processed_rating{core}.txt'
).astype(int)

In [8]:
# Mean of the third column of the loaded rating file.
np.mean(rating_data[:, 2])

3.9085077056591357

In [9]:
def rating_out(user_index, item_index):
    """Return the inner product of one user embedding and one item embedding.

    Reads the module-level `user_emb_mse_0` and `item_emb_mse_0` arrays,
    selects row `user_index` and row `item_index`, multiplies them
    element-wise and sums the products.
    """
    user_vec = user_emb_mse_0[user_index]
    item_vec = item_emb_mse_0[item_index]
    return np.sum(np.multiply(user_vec, item_vec))


In [10]:
# Spot check: score for user row 100 and item row 100 computed directly.
rating_out(user_index=100, item_index=100)

3.342150205363568

In [11]:
# Same entry read from the precomputed matrix product, for comparison with
# rating_out(100, 100). Only comparable while `rating` still holds raw scores.
rating[100][100]

3.3421502053635685

In [12]:
# Load the coat test split as integers; column 2 is used below as the rating.
coat = np.loadtxt('../../datasets/coat/test.txt').astype(int)
print('coat', coat, coat.shape)

coat [[  0  12   4]
 [  0  17   3]
 [  0  74   4]
 ...
 [289 263   1]
 [289 273   1]
 [289 295   1]] (4640, 3)


In [13]:
# Histogram of the integer values in the third column of `coat`.
# np.bincount sizes its result by the largest value it sees. minlength=6 pins
# the histogram to indices 0..5, so it can be combined element-wise with the
# yahoo histogram even if the top rating never occurs in this split.
coat_rating_bin = np.bincount(coat[:, 2], minlength=6)
print('coat_rating_bin', coat_rating_bin)

coat_rating_bin [   0 1879  899 1002  641  219]


In [14]:
# Normalize the coat histogram so its entries sum to one.
coat_rating_prob = coat_rating_bin / np.sum(coat_rating_bin)
print('coat_rating_prob', coat_rating_prob)

coat_rating_prob [0.         0.4049569  0.19375    0.21594828 0.13814655 0.04719828]


In [15]:
# Load the yahooR3 random split as integers; column 2 is used below as the rating.
yahoo = np.loadtxt('../../datasets/yahooR3/random.txt').astype(int)
print('yahoo', yahoo, yahoo.shape)

yahoo [[   0   48    1]
 [   0  125    1]
 [   0  137    1]
 ...
 [5399  685    1]
 [5399  783    4]
 [5399  806    3]] (54000, 3)


In [16]:
# Histogram of the integer values in the third column of `yahoo`.
# np.bincount sizes its result by the largest value it sees. minlength=6 pins
# the histogram to indices 0..5, so it can be combined element-wise with the
# coat histogram even if the top rating never occurs in this split.
yahoo_rating_bin = np.bincount(yahoo[:, 2], minlength=6)
print('yahoo_rating_bin', yahoo_rating_bin)

yahoo_rating_bin [    0 28417 13064  7770  3371  1378]


In [17]:
# Normalize the yahoo histogram so its entries sum to one.
yahoo_rating_prob = yahoo_rating_bin / np.sum(yahoo_rating_bin)
print('yahoo_rating_prob', yahoo_rating_prob)

yahoo_rating_prob [0.         0.52624074 0.24192593 0.14388889 0.06242593 0.02551852]


In [18]:
# Unweighted average of the two rating distributions: each dataset counts
# equally regardless of how many rows it has.
full_observed_rating_prob = (coat_rating_prob + yahoo_rating_prob) * 0.5
print('full_observed_rating_prob', full_observed_rating_prob, full_observed_rating_prob.shape)

full_observed_rating_prob [0.         0.46559882 0.21783796 0.17991858 0.10028624 0.0363584 ] (6,)


In [19]:
# Rank every matrix entry from lowest to highest score. The flat argsort is
# converted back into a (row_indices, col_indices) pair.
sort_index = np.unravel_index(rating.argsort(axis=None), rating.shape)
print('sort_index', sort_index)

sort_index (array([ 9107,  9107,  5662, ..., 10298, 10298, 10298], dtype=int64), array([ 7403,  3270,  7403, ...,  8607,  7939, 10154], dtype=int64))


In [20]:
# Scores in ascending order, read through the (rows, cols) ranking.
rating[sort_index[0], sort_index[1]]

array([-0.42697264, -0.23321703, -0.18620327, ...,  6.19631384,
        6.19962802,  6.22653157])

In [21]:
# Running total of the averaged distribution; consecutive entries are used
# below as the fractional boundaries of each rating bucket.
cum_full_observed_rating_prob = full_observed_rating_prob.cumsum()
print('cum_full_observed_rating_prob', cum_full_observed_rating_prob)

cum_full_observed_rating_prob [0.         0.46559882 0.68343678 0.86335536 0.9636416  1.        ]


In [22]:
# Preview which (row, col) index pairs fall into each rating bucket.
n_entries = len(sort_index[0])
# Boundaries truncate toward zero exactly like int(cum * n). The final bound is
# pinned to n_entries because a floating-point cumsum can end a hair below 1.0,
# which would otherwise drop the top-ranked entries from the last bucket.
bucket_bounds = (cum_full_observed_rating_prob * n_entries).astype(np.int64)
bucket_bounds[-1] = n_entries
for i in range(len(bucket_bounds) - 1):
    lo, hi = bucket_bounds[i], bucket_bounds[i + 1]
    print((sort_index[0][lo:hi], sort_index[1][lo:hi]))


(array([ 9107,  9107,  5662, ...,  5584,  1635, 10840], dtype=int64), array([ 7403,  3270,  7403, ...,  3171,  3371, 10462], dtype=int64))
(array([  980,  4896,  5441, ...,   920, 12101,  1984], dtype=int64), array([5603, 2789,  977, ..., 4147, 9641, 5665], dtype=int64))
(array([   21,  9597,  2009, ...,  5733,  5132, 11721], dtype=int64), array([ 6540,  2935,  1477, ..., 10926,  1497,  9126], dtype=int64))
(array([ 1756, 10655,  1357, ..., 10549,  8907,  6571], dtype=int64), array([ 8938,  2924, 11164, ..., 11098,  5777,  2739], dtype=int64))
(array([ 2886,   365,  1886, ..., 10298, 10298, 10298], dtype=int64), array([10112,  8415,  2995, ...,  8607,  7939, 10154], dtype=int64))


In [23]:
# Replace each raw score by a discrete level (1, 2, ...) according to its rank:
# the lowest-ranked fraction gets level 1, the next fraction level 2, and so on,
# with fractions taken from consecutive entries of cum_full_observed_rating_prob.
# NOTE: this overwrites `rating` in place. Any cell that needs the raw scores
# (for example the argsort cell) must be re-run after recomputing the matrix.
n_entries = len(sort_index[0])
# Boundaries truncate toward zero exactly like int(cum * n). The final bound is
# pinned to n_entries because a floating-point cumsum can end a hair below 1.0;
# without the pin the highest-scoring entries would keep their raw float value.
bucket_bounds = (cum_full_observed_rating_prob * n_entries).astype(np.int64)
bucket_bounds[-1] = n_entries
for i in range(len(bucket_bounds) - 1):
    lo, hi = bucket_bounds[i], bucket_bounds[i + 1]
    rating[(sort_index[0][lo:hi], sort_index[1][lo:hi])] = i + 1


In [24]:
# Display the matrix after the loop above overwrote its entries in place with
# the bucket levels i + 1.
rating

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 3., 1., ..., 1., 2., 3.],
       [1., 4., 1., ..., 1., 3., 3.],
       ...,
       [1., 2., 1., ..., 1., 2., 2.],
       [1., 4., 1., ..., 1., 4., 3.],
       [1., 4., 1., ..., 1., 4., 2.]])

In [25]:
# Write the discretized matrix as integer text, one matrix row per line.
np.savetxt(
    fname=f'../../datasets/{data_name}/{data_name}{core}{emb_dim}_{type}_full_rating.txt',
    X=rating,
    fmt='%d',
)

In [26]:
# Mean of the matrix entries at the (column 0, column 1) positions listed in
# the rating file.
rating[rating_data[:, 0], rating_data[:, 1]].mean()

2.3011956959483606

In [27]:
# Mean over all entries of the matrix in its current (bucketed) state.
np.mean(rating)

2.023967455708707