In [115]:
import numpy as np
import torch
import random
import pandas as pd
from scipy.special import expit

In [116]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(0)

In [117]:
data_name = 'ciao'
core = 10

In [118]:
rating_data = np.loadtxt(f'../../datasets/{data_name}/processed_rating{core}.txt').astype(int)
trust_network = np.loadtxt(f'../../datasets/{data_name}/processed_trust{core}.txt').astype(int)
num_users = rating_data[:, 0].max() + 1
num_items = rating_data[:, 1].max() + 1
num_rating_data = len(rating_data)
print('rating_data', rating_data, rating_data.shape)
print('trust_network', trust_network, trust_network.shape)
print('num_users', num_users)
print('num_items', num_items)
print('num_rating_data', num_rating_data)

rating_data [[    0     0     2]
 [    0     1     2]
 [    0     2     2]
 ...
 [12167  5241     5]
 [12167   852     5]
 [12167  5228     4]] (352598, 3)
trust_network [[    1  2275]
 [    1 12034]
 [    1  1922]
 ...
 [12165  4215]
 [12166  4816]
 [12167  3224]] (123187, 2)
num_users 12168
num_items 11283
num_rating_data 352598


In [119]:
rating_data_user_count = np.bincount(rating_data[:, 0])
rating_data_item_count = np.bincount(rating_data[:, 1])
print('user act', rating_data_user_count.min(), rating_data_user_count.max())
print('item pop', rating_data_item_count.min(), rating_data_item_count.max())

user act 10 1018
item pop 10 1512


In [120]:
#########ciao10
emb_dim = 20
alpha1 = 0.1
alpha2 = 0.2
alpha3 = 1.0 - alpha1 - alpha2
# alpha1 = 1.0
# alpha2 = 0.0
# alpha3 = 1.0 - alpha1 - alpha2

completion_type = 'full_mf'
num_all_data = num_users * num_items
sparsity = float(num_rating_data) / float(num_all_data)
uniform_pui = sparsity
indi_scale = 2.0
print('sparsity', sparsity)

user_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_user_emb.txt')
item_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_item_emb.txt')

rating_ui = np.matmul(user_emb, item_emb.T)
prefer_pui = expit((rating_ui-3.0)*indi_scale)
print('prefer_pui1', prefer_pui, prefer_pui.max(), prefer_pui.min())
prefer_pui = prefer_pui.reshape(1, -1)[0]
print('prefer_pui2', prefer_pui, prefer_pui.max(), prefer_pui.min())
beta = num_users * num_items * sparsity / prefer_pui.sum()
prefer_pui = prefer_pui * beta
print('prefer_pui', prefer_pui, prefer_pui.max(), prefer_pui.min())


In [121]:
# emb_dim = 20
# alpha1 = 0.2
# alpha2 = 0.65
# alpha3 = 0.15
# # alpha1 = 1.0
# # alpha2 = 0.0
# # alpha3 = 1.0 - alpha1 - alpha2
# 
# completion_type = 'full_mf'
# num_all_data = num_users * num_items
# sparsity = float(num_rating_data) / float(num_all_data)
# uniform_pui = sparsity
# indi_scale = 2.0
# print('sparsity', sparsity)
# 
# user_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_user_emb.txt')
# item_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_item_emb.txt')
# 
# rating_ui = np.matmul(user_emb, item_emb.T)
# prefer_pui = expit((rating_ui-3.0)*indi_scale)
# print('prefer_pui1', prefer_pui, prefer_pui.max(), prefer_pui.min())
# prefer_pui = prefer_pui.reshape(1, -1)[0]
# print('prefer_pui2', prefer_pui, prefer_pui.max(), prefer_pui.min())
# beta = num_users * num_items * sparsity / prefer_pui.sum()
# prefer_pui = prefer_pui * beta
# print('prefer_pui', prefer_pui, prefer_pui.max(), prefer_pui.min())


In [122]:
# emb_dim = 20
# alpha1 = 0.2
# alpha2 = 0.6
# alpha3 = 0.2
# # alpha1 = 1.0
# # alpha2 = 0.0
# # alpha3 = 1.0 - alpha1 - alpha2

# completion_type = 'full_mf'
# num_all_data = num_users * num_items
# sparsity = float(num_rating_data) / float(num_all_data)
# uniform_pui = sparsity
# indi_scale = 2.0
# print('sparsity', sparsity)

# user_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_user_emb.txt')
# item_emb = np.loadtxt(f'../para/{data_name}{core}{emb_dim}_{completion_type}_item_emb.txt')

# rating_ui = np.matmul(user_emb, item_emb.T)
# prefer_pui = expit((rating_ui-3.0)*indi_scale)
# print('prefer_pui1', prefer_pui, prefer_pui.max(), prefer_pui.min())
# prefer_pui = prefer_pui.reshape(1, -1)[0]
# print('prefer_pui2', prefer_pui, prefer_pui.max(), prefer_pui.min())
# beta = num_users * num_items * sparsity / prefer_pui.sum()
# prefer_pui = prefer_pui * beta
# print('prefer_pui', prefer_pui, prefer_pui.max(), prefer_pui.min())


sparsity 0.0025682426588486762
prefer_pui1 [[0.02533876 0.01957742 0.01588365 ... 0.01588365 0.01940156 0.01764607]
 [0.01135168 0.42463093 0.01829775 ... 0.01829775 0.15816636 0.55260702]
 [0.02175605 0.79967196 0.017429   ... 0.017429   0.50106652 0.39368718]
 ...
 [0.01240548 0.11457519 0.01855035 ... 0.01855036 0.21174962 0.08599329]
 [0.02803675 0.94405845 0.01787913 ... 0.01787914 0.95465401 0.44128501]
 [0.01460277 0.9063856  0.01871988 ... 0.01871987 0.75303424 0.11448891]] 0.999449803359099 0.00356426736462196
prefer_pui2 [0.02533876 0.01957742 0.01588365 ... 0.01871987 0.75303424 0.11448891] 0.999449803359099 0.00356426736462196
prefer_pui [0.00026451 0.00020437 0.00016581 ... 0.00019542 0.00786092 0.00119515] 0.010433253046555062 3.720737471326283e-05


In [123]:
base_num_users_wrt_uni_pui = int(alpha1 * uniform_pui * num_all_data // num_users)
incre_num_users_wrt_uni_pui = int(alpha1 * uniform_pui * num_all_data % num_users)
users_wrt_uni_pui = [np.repeat(np.arange(num_users), base_num_users_wrt_uni_pui), np.random.choice(np.arange(num_users), size=incre_num_users_wrt_uni_pui, replace=False).astype(int)]
users_wrt_uni_pui = np.hstack(users_wrt_uni_pui)
np.random.shuffle(users_wrt_uni_pui)

base_num_items_wrt_uni_pui = int(alpha1 * uniform_pui * num_all_data // num_items)
incre_num_items_wrt_uni_pui = int(alpha1 * uniform_pui * num_all_data % num_items)
items_wrt_uni_pui = [np.repeat(np.arange(num_items), base_num_items_wrt_uni_pui), np.random.choice(np.arange(num_items), size=incre_num_items_wrt_uni_pui, replace=False).astype(int)]
print(items_wrt_uni_pui)
items_wrt_uni_pui = np.hstack(items_wrt_uni_pui)
np.random.shuffle(items_wrt_uni_pui)
data_wrt_uni_pui = np.hstack((users_wrt_uni_pui.reshape(-1, 1), items_wrt_uni_pui.reshape(-1, 1)))
print('data_wrt_uni_pui', data_wrt_uni_pui, data_wrt_uni_pui.shape, np.bincount(data_wrt_uni_pui[:, 0]).min(), np.bincount(data_wrt_uni_pui[:, 0]).max(), np.bincount(data_wrt_uni_pui[:, 1]).min(), np.bincount(data_wrt_uni_pui[:, 1]).max())

[array([    0,     0,     0, ..., 11282, 11282, 11282]), array([ 6801,  8206,   804, ...,  3931, 11145,  2038])]
data_wrt_uni_pui [[ 1751  2017]
 [ 4643  9945]
 [ 5112  1131]
 ...
 [ 6159 10029]
 [10985  4291]
 [ 8556  5734]] (70519, 2) 5 6 6 7


In [124]:
index_wrt_uni_pui = data_wrt_uni_pui[:, 0] * num_items + data_wrt_uni_pui[:, 1]
prefer_pui_for_choice = prefer_pui.copy()
prefer_pui_for_choice[index_wrt_uni_pui] = 0.0
prefer_pui_for_choice = prefer_pui_for_choice / prefer_pui_for_choice.sum()
print('len(np.arange(num_all_data))', len(np.arange(num_all_data)), len(prefer_pui_for_choice))
index_wrt_pref_pui = np.random.choice(np.arange(num_all_data), size=int(alpha2*prefer_pui.sum()), replace=False, p=prefer_pui_for_choice)

user_wrt_pref_pui = index_wrt_pref_pui // num_items
item_wrt_pref_pui = index_wrt_pref_pui % num_items
print('index_wrt_pref_pui', index_wrt_pref_pui)
data_wrt_pref_pui = np.hstack((user_wrt_pref_pui.reshape(-1, 1), item_wrt_pref_pui.reshape(-1, 1)))
print('data_wrt_pref_pui', data_wrt_pref_pui, data_wrt_pref_pui.shape)

len(np.arange(num_all_data)) 137291544 137291544
index_wrt_pref_pui [ 84386340  70865644  15684085 ...   6360619 118326881  73039835]
data_wrt_pref_pui [[ 7479   783]
 [ 6280  8404]
 [ 1390   715]
 ...
 [  563  8290]
 [10487  2060]
 [ 6473  4976]] (211558, 2)


In [125]:
temp_gen_data = np.vstack((data_wrt_uni_pui, data_wrt_pref_pui))

In [126]:
count_items_user_neighs = 0
item_list_per_user_neighs = []
target_user_list_per_user_neighs = []
for u in range(num_users):
    neighs = trust_network[:, 1][trust_network[:, 0]==u]
    if len(neighs) != 0:
        temp_item_list_per_user_neighs = []
        for v in neighs:
            temp_item_list_per_user_neighs.extend(temp_gen_data[:, 1][temp_gen_data[:, 0]==v])
        temp_item_list_per_user_neighs = np.array(temp_item_list_per_user_neighs)
        temp_item_list_per_user_neighs = np.unique(temp_item_list_per_user_neighs)
        item_list_per_user_neighs.append(temp_item_list_per_user_neighs)
        target_user_list_per_user_neighs.append(np.ones_like(temp_item_list_per_user_neighs, dtype=int)*u)
    else:
        item_list_per_user_neighs.append(np.empty(0))
    count_items_user_neighs = count_items_user_neighs + len(item_list_per_user_neighs[u])
item_list_per_user_neighs = np.hstack(item_list_per_user_neighs)
target_user_list_per_user_neighs = np.hstack(target_user_list_per_user_neighs)
print('item_list_per_user_neighs', item_list_per_user_neighs)
print('target_user_list_per_user_neighs', target_user_list_per_user_neighs)
print('count_items_user_neighs', count_items_user_neighs)

item_list_per_user_neighs [   27.   162.   201. ...  9063.  9182. 10753.]
target_user_list_per_user_neighs [    1     1     1 ... 12167 12167 12167]
count_items_user_neighs 2561290


In [127]:
neigh_pui = num_rating_data / count_items_user_neighs
print('neigh_pui', neigh_pui)
index_wrt_neigh_pui = np.random.choice(np.arange(count_items_user_neighs), size=int(alpha3*neigh_pui*count_items_user_neighs), replace=False)
user_wrt_neigh_pui = target_user_list_per_user_neighs[index_wrt_neigh_pui]
item_wrt_neigh_pui = item_list_per_user_neighs[index_wrt_neigh_pui]
data_wrt_neigh_pui = np.hstack((user_wrt_neigh_pui.reshape(-1, 1), item_wrt_neigh_pui.reshape(-1, 1)))
print('data_wrt_neigh_p_ui', data_wrt_neigh_pui, data_wrt_neigh_pui.shape)

neigh_pui 0.13766422388718186
data_wrt_neigh_p_ui [[1143. 3422.]
 [9846. 9975.]
 [1517. 3028.]
 ...
 [5675. 6871.]
 [ 790. 4204.]
 [5622. 1787.]] (70519, 2)


In [128]:
gen_data = np.vstack((temp_gen_data, data_wrt_neigh_pui))
gen_data = gen_data.astype(int)

In [129]:
gen_data_df = pd.DataFrame(gen_data, columns=['user_id', 'item_id'])
gen_data_unique = gen_data_df.drop_duplicates(subset=['user_id', 'item_id'])
gen_data = gen_data_unique.to_numpy()
print('gen_data', gen_data, gen_data.shape)


gen_data [[1751 2017]
 [4643 9945]
 [5112 1131]
 ...
 [5675 6871]
 [ 790 4204]
 [5622 1787]] (352383, 2)


In [130]:
user_act = np.bincount(gen_data[:, 0], minlength=num_users)
item_pop = np.bincount(gen_data[:, 1], minlength=num_items)
print('user_act', user_act)
print('item_pop', item_pop)

user_act [ 6 18 17 ... 21 30 13]
item_pop [12 61 10 ... 12 52 55]


In [131]:
user_act.max()

200

In [132]:
user_act.min()

5

In [133]:
item_pop.max()

111

In [134]:
item_pop.min()

6

In [135]:
np.savetxt(f'../../datasets/{data_name}/{data_name}{core}{emb_dim}{alpha1}{alpha2}{alpha3}_{completion_type}_exposed_data.txt', gen_data, fmt='%d')