In [23]:
import numpy as np
import scipy.io as scio
import pandas as pd
import torch
import random

In [24]:
def set_seed(seed):
    """Seed every random number generator this notebook can touch.

    Applies ``seed`` to Python's ``random`` module, NumPy's legacy global
    generator, and PyTorch (CPU generator, current CUDA device, all CUDA
    devices), in that order.

    Args:
        seed: Integer seed passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Fix the seed once, up front, so the notebook is reproducible.
set_seed(0)

In [25]:
# data_name: dataset to preprocess; the loading cell below branches on
#            'epinions', 'ciao' and 'filmtrust'.
# core:      minimum number of ratings a user / an item must have to be kept
#            by the iterative filtering cell; also part of the output file names.
data_name, core = 'ciao', 10

In [26]:
# Load the raw rating table and trust edge list for the selected dataset.
# After this cell `rating_data` has three columns (later cells label them
# user, item, rating) and `trust_data` has two columns of user ids.
if data_name in ('epinions', 'ciao'):
    rating_data_mat = scio.loadmat(f'../../datasets/{data_name}/rating.mat')
    trust_data_mat = scio.loadmat(f'../../datasets/{data_name}/trustnetwork.mat')

    rating_data = rating_data_mat['rating'].astype('long')
    # Keep only columns 0, 1 and 3 of the raw rating matrix.
    rating_data = rating_data[:, [0, 1, 3]]
    trust_data = trust_data_mat['trustnetwork'].astype('long')

elif data_name == 'filmtrust':
    # Ratings are left in the dtype np.loadtxt produces; only trust ids are cast.
    rating_data = np.loadtxt(f'../../datasets/{data_name}/ratings.txt')
    trust_data = np.loadtxt(f'../../datasets/{data_name}/trust.txt').astype('int')
    trust_data = trust_data[:, [0, 1]]

else:
    # Fail here with a clear message instead of a NameError on `rating_data`
    # (or silently reusing stale kernel state) in the cells that follow.
    raise ValueError(
        f"Unsupported data_name {data_name!r}; expected 'epinions', 'ciao' or 'filmtrust'."
    )


# Show the freshly loaded tables before any cleaning.
print('rating_data', rating_data, rating_data.shape)
print('trust_data', trust_data, trust_data.shape)

# Flag trust rows whose two user ids are identical, and report how many there
# are and where they sit (both as a np.where tuple and as an argwhere column).
is_self_loop = trust_data[:, 0] == trust_data[:, 1]
print(is_self_loop.sum())
print(np.where(is_self_loop))
print(np.argwhere(is_self_loop))

# Drop those rows; the remaining rows keep their original order.
trust_data = trust_data[~is_self_loop]
# Sanity check: the count of self-referencing rows should now print 0.
print((trust_data[:, 0] == trust_data[:, 1]).sum())
print('trust_data', trust_data, trust_data.shape)





rating_data [[    1     1     2]
 [    1     2     2]
 [    1     3     2]
 ...
 [22166 43538     5]
 [22166 38711     4]
 [22166 41790     3]] (922267, 3)
trust_data [[15373  9831]
 [ 4247  9831]
 [ 4644  9831]
 ...
 [13181 15645]
 [  897  8000]
 [ 8000   897]] (355754, 2)
224
(array([  1118,   2625,   3690,   6392,  10245,  12065,  13397,  15817,
        19127,  22816,  25560,  30113,  32514,  32973,  35371,  39877,
        44746,  46606,  48898,  52984,  57827,  68059,  70712,  72358,
        72809,  72922,  74135,  82883,  84110,  87138,  95457,  97295,
        97599, 107268, 107484, 110258, 119254, 123667, 125582, 126142,
       126278, 126992, 127235, 130044, 131075, 134137, 134684, 135505,
       137319, 137650, 137863, 138070, 143341, 143471, 149046, 149701,
       149759, 149929, 150159, 150380, 150525, 152332, 152948, 153432,
       153502, 156511, 156637, 158670, 159047, 159589, 159802, 160700,
       161884, 164373, 164932, 168502, 169586, 172927, 174288, 174651,
       175

In [27]:

# Remove repeated (user_id1, user_id2) rows from the trust table; the first
# occurrence of each pair is the one that is kept.
df_trust_for_unique = pd.DataFrame(data=trust_data, columns=['user_id1', 'user_id2'])
df_trust_unique = df_trust_for_unique.drop_duplicates(
    subset=['user_id1', 'user_id2'],
    keep='first',
)

# Back to a plain array for the NumPy-based cells that follow.
trust_data = df_trust_unique.to_numpy()
print(trust_data, trust_data.shape)

[[15373  9831]
 [ 4247  9831]
 [ 4644  9831]
 ...
 [13181 15645]
 [  897  8000]
 [ 8000   897]] (355503, 2)


In [28]:

# Keep a single row per (user_id, item_id) pair. When a pair occurs several
# times, the first row (and therefore its rating value) is the one retained.
df_rating_for_unique = pd.DataFrame(data=rating_data, columns=['user_id', 'item_id', 'value'])
df_unique = df_rating_for_unique.drop_duplicates(
    subset=['user_id', 'item_id'],
    keep='first',
)

# Back to a plain array for the cells that follow.
rating_data = df_unique.to_numpy()
print(rating_data, rating_data.shape)

[[    1     1     2]
 [    1     2     2]
 [    1     3     2]
 ...
 [22166 43538     5]
 [22166 38711     4]
 [22166 41790     3]] (912441, 3)


In [29]:
# Iterative `core` filtering: repeatedly drop users and items that have fewer
# than `core` ratings. Removing a user can push an item below the threshold
# (and vice versa), so the pruning is repeated until nothing more is removed.
df = pd.DataFrame(rating_data, columns=['user', 'item', 'rating'])
print('df', df)
while True:
    user_counts = df['user'].value_counts()
    item_counts = df['item'].value_counts()

    valid_users = user_counts[user_counts >= core].index
    valid_items = item_counts[item_counts >= core].index

    # Item validity is judged on counts taken BEFORE the user filter, so the
    # filtered frame can still contain under-threshold items; hence the
    # re-check below rather than assuming one pass is enough.
    filtered_df = df[df['user'].isin(valid_users)]
    filtered_df = filtered_df[filtered_df['item'].isin(valid_items)]

    # Nothing survived: there is no minimum count to compare, so stop here
    # instead of looping forever on an empty frame.
    if filtered_df.empty:
        break

    # Converged once every remaining user and item meets the threshold.
    # `>=` (not `==`) matters: if the least active survivor has MORE than
    # `core` ratings, an equality test is never true and the loop never ends.
    if (filtered_df['user'].value_counts().min() >= core
            and filtered_df['item'].value_counts().min() >= core):
        break

    df = filtered_df

# Take an owned, writable copy: later cells overwrite the id columns of
# `rating_data` in place, which fails on a read-only view of the frame.
rating_data = filtered_df.to_numpy(copy=True)
print('rating_data', rating_data, rating_data.shape)


df          user   item  rating
0           1      1       2
1           1      2       2
2           1      3       2
3           1      4       5
4           1      5       3
...       ...    ...     ...
912436  22166  83922       4
912437  22166  23442       4
912438  22166  43538       5
912439  22166  38711       4
912440  22166  41790       3

[912441 rows x 3 columns]
rating_data [[    1     1     2]
 [    1     2     2]
 [    1     3     2]
 ...
 [22164  1628     5]
 [22164 17247     4]
 [22164 17137     4]] (352598, 3)


In [30]:
# Verify the filtering result: ratings per user and per item, most frequent
# first, so the smallest counts appear at the tail of each printed series.
df_check = pd.DataFrame(data=rating_data, columns=['user', 'item', 'rating'])
user_counts, item_counts = (
    df_check[column].value_counts() for column in ('user', 'item')
)
print('user_counts', user_counts)
print('item_counts', item_counts)

user_counts 15827    1018
2760      788
5642      698
20241     680
2034      586
         ... 
13037      10
13023      10
13022      10
13020      10
22164      10
Name: user, Length: 12168, dtype: int64
item_counts 122      1512
633      1101
51       1015
666       778
386       697
         ... 
50927      10
50894      10
8616       10
50718      10
28249      10
Name: item, Length: 11283, dtype: int64


In [31]:
# Largest raw id seen anywhere in the trust table or among rating users;
# flag arrays are sized max_user + 1 so a raw id can be used as an index.
max_user = int(max(trust_data.max(), rating_data[:, 0].max()))

# trust_users_flag[u] == 1.0 iff user id u appears in either trust column.
trust_users_flag = np.zeros(max_user + 1)
trust_users_flag[np.unique(trust_data[:, [1, 0]])] = 1.0
print('trust_users_flag', trust_users_flag, trust_users_flag.sum())

trust_users_flag [0. 1. 1. ... 1. 1. 1.] 18069.0


In [32]:
# Ids of users that appear in the trust table, derived two independent ways:
# from the flag array, and directly from the table. The last printed number is
# how many positions agree between the two.
trust_unique_users = trust_users_flag.nonzero()[0]
trust_unique_users_check = np.unique(trust_data)
print(
    'trust_unique_users',
    trust_unique_users,
    trust_unique_users_check,
    (trust_unique_users_check == trust_unique_users).sum(),
)

trust_unique_users [    1     2     4 ... 22164 22165 22166] [    1     2     4 ... 22164 22165 22166] 18069


In [33]:
# rating_users_flag[u] == 1.0 iff user id u still has ratings in rating_data.
# The user column is cast to an integer dtype so it can be used as an index.
rating_users_flag = np.zeros(max_user + 1)
rating_users_flag[rating_data[:, 0].astype(int)] = 1.0
print('rating_users_flag', rating_users_flag, rating_users_flag.sum())


rating_users_flag [0. 1. 1. ... 1. 0. 0.] 12168.0


In [34]:
# Ids of users that have ratings, derived two independent ways: directly from
# the rating table, and from the flag array. The last printed number is how
# many positions agree between the two.
rating_unique_users = np.unique(rating_data[:, 0])
# np.nonzero returns a tuple with one index array per dimension; take element 0
# so this is a plain 1-D array, matching how trust_unique_users is built,
# instead of relying on broadcasting against a 1-tuple in the comparison.
rating_unique_users_check = np.nonzero(rating_users_flag)[0]
print('rating_unique_users', rating_unique_users, rating_unique_users_check, (rating_unique_users==rating_unique_users_check).sum())

rating_unique_users [    1     2     3 ... 22156 22162 22164] (array([    1,     2,     3, ..., 22156, 22162, 22164], dtype=int64),) 12168


In [35]:
# Keep only trust rows whose BOTH endpoints still have ratings.
# rating_users_flag is indexed by raw user id, so one lookup per column gives a
# per-row mask. This removes the same rows, in the same order, as deleting the
# rows of every unrated user one user at a time, but in a single pass and
# without depending on a user list computed in an earlier cell.
has_ratings = rating_users_flag != 0
keep_row = has_ratings[trust_data[:, 0]] & has_ratings[trust_data[:, 1]]
trust_data = trust_data[keep_row]

print('trust_data', trust_data, trust_data.shape)


trust_data [[ 4644  5369]
 [12037  5369]
 [ 5493  5369]
 ...
 [ 6715  7812]
 [ 6525  7812]
 [13181 15645]] (123187, 2)


In [36]:
# Rebuild both membership flags after the trust table was filtered.
# trust_users_flag2[u] == 1 iff user id u appears in either trust column.
trust_users_flag2 = np.zeros(max_user + 1)
trust_users_flag2[np.unique(trust_data[:, [1, 0]])] = 1
print('trust_users_flag2', trust_users_flag2.sum())

# rating_users_flag2[u] == 1 iff user id u has ratings.
rating_users_flag2 = np.zeros(max_user + 1)
rating_users_flag2[np.unique(rating_data[:, 0]).astype(int)] = 1
print('rating_users_flag2', rating_users_flag2, rating_users_flag2.sum())

# Product sum = number of users present in both tables.
# Difference sum = (#trust users) - (#rating users).
print((trust_users_flag2 * rating_users_flag2).sum())
print((trust_users_flag2 - rating_users_flag2).sum())



trust_users_flag2 9185.0
rating_users_flag2 [0. 1. 1. ... 1. 0. 0.] 12168.0
9185.0
-2983.0


In [37]:
# Re-index users to contiguous ids 0..n_users-1, where a user's new id is the
# rank of its original id among the users that have ratings.
rating_unique_users = np.unique(rating_data[:, 0])

# np.unique returns the ids sorted, so np.searchsorted yields each id's rank.
# The whole column is mapped at once instead of scanning every array once per
# user, and an already re-indexed value can never be mistaken for an original
# id that has not been processed yet.
rating_data[:, 0] = np.searchsorted(rating_unique_users, rating_data[:, 0])

# Apply the same mapping to both trust columns. Ids that are not rating users
# are left untouched, exactly as a per-id equality match would leave them.
for trust_col in (0, 1):
    is_rating_user = np.isin(trust_data[:, trust_col], rating_unique_users)
    trust_data[is_rating_user, trust_col] = np.searchsorted(
        rating_unique_users, trust_data[is_rating_user, trust_col]
    )

In [38]:
# Re-index items to contiguous ids 0..n_items-1, where an item's new id is the
# rank of its original id among all items present in rating_data.
rating_unique_items = np.unique(rating_data[:, 1])

# np.unique returns the ids sorted and every item id in the column is in that
# list, so np.searchsorted returns exactly each id's rank. One vectorised
# lookup replaces a full scan of the column per item.
rating_data[:, 1] = np.searchsorted(rating_unique_items, rating_data[:, 1])

In [39]:
# Order the rows of both tables by their first column (the user id).
# The default argsort is not stable, so the relative order of rows that share
# a user id is not guaranteed to be preserved.
rating_data = rating_data[rating_data[:, 0].argsort()]
trust_data = trust_data[trust_data[:, 0].argsort()]
print('rating_data', rating_data, rating_data.shape)
print('trust_data', trust_data, trust_data.shape)

rating_data [[    0     0     2]
 [    0     1     2]
 [    0     2     2]
 ...
 [12167  5241     5]
 [12167   852     5]
 [12167  5228     4]] (352598, 3)
trust_data [[    1  2275]
 [    1 12034]
 [    1  1922]
 ...
 [12165  4215]
 [12166  4816]
 [12167  3224]] (123187, 2)


In [40]:
# Range and number of distinct ids in each column of the re-indexed trust table.
print('trust max0', trust_data[:, 0].max())
print('trust min0', trust_data[:, 0].min())
print('trust max1', trust_data[:, 1].max())
print('trust min1', trust_data[:, 1].min())
print('unique trust0', np.unique(trust_data[:, 0]).size)
print('unique trust1', np.unique(trust_data[:, 1]).size)

trust max0 12167
trust min0 1
trust max1 12165
trust min1 0
unique trust0 7868
unique trust1 7768


In [41]:
# Range and number of distinct ids in the user (0) and item (1) columns of the
# re-indexed rating table.
print('rating max0', rating_data[:, 0].max())
print('rating min0', rating_data[:, 0].min())
print('rating max1', rating_data[:, 1].max())
print('rating min1', rating_data[:, 1].min())
print('unique rating0', np.unique(rating_data[:, 0]).size)
print('unique rating1', np.unique(rating_data[:, 1]).size)

rating max0 12167
rating min0 0
rating max1 11282
rating min1 0
unique rating0 12168
unique rating1 11283


In [42]:
# Fraction of the user x item matrix that is filled:
# (#ratings) / (#users) / (#items). Despite the variable name, this value is
# the density of observed entries, not the fraction of missing ones.
sparsity = (
    len(rating_data)
    / np.unique(rating_data[:, 0]).size
    / np.unique(rating_data[:, 1]).size
)
print('sparsity', sparsity)

sparsity 0.002568242658848676


In [43]:
# Fraction of the user x user matrix that is filled:
# (#trust rows) / (#rating users) / (#rating users). The user count comes from
# rating_data, not from the users that actually appear in trust_data.
trust_sparsity = (
    len(trust_data)
    / np.unique(rating_data[:, 0]).size
    / np.unique(rating_data[:, 0]).size
)
print('trust_sparsity', trust_sparsity)

trust_sparsity 0.000832006035598055


In [44]:
# Persist the processed tables next to the raw dataset. The `core` threshold is
# embedded in the file names, so runs with different thresholds do not
# overwrite each other. np.savetxt is called without `fmt`, so its default
# number format is used for every value, including the integer ids.
np.savetxt(f'../../datasets/{data_name}/processed_rating{core}.txt', rating_data)
np.savetxt(f'../../datasets/{data_name}/processed_trust{core}.txt', trust_data)