In [1]:
#!/usr/bin/env python
# coding: utf-8
"""
This is a example of MAG generation and data processing on the MovieLens dataset.
"""

import numpy as np
import csv
import pandas as pd
import random
import pickle
import copy
from scipy.sparse import csr_matrix

np.random.seed(2023)
random.seed(2023)

path = './ratings.dat'
reviews_df  = pd.read_csv(path,sep='::',header=None)
reviews_df.columns = ['user_id','item_id','rating','timestamp']
reviews_df.loc[:,'rating'] = reviews_df['rating'].map(lambda x: 1 if x >= 4 else 0)

# reindex of the IDs
def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(1,len(key)+1)))
    df.loc[:,col_name] = df[col_name].map(lambda x: m[x])
    return m, key


uid_map, uid_key = build_map(reviews_df, 'user_id')


path = './movies.dat'
meta_df  = pd.read_csv(path,sep='::',header=None)
meta_df.columns = ['item_id','title','genres']
meta_df = meta_df[['item_id', 'genres']]
meta_df.loc[:,'genres'] = meta_df['genres'].map(lambda x: x.split('|')[0])

vid_map, vid_key = build_map(meta_df, 'item_id')
cat_map, cat_key = build_map(meta_df, 'genres')

user_count, item_count, cate_count, example_count =    len(uid_map), len(vid_map), len(cat_map), reviews_df.shape[0]
print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' %
      (user_count, item_count, cate_count, example_count))

meta_df = meta_df.sort_values('item_id')
meta_df = meta_df.reset_index(drop=True)

reviews_df['item_id'] = reviews_df['item_id'].map(lambda x: vid_map[x])
reviews_df = reviews_df.sort_values(['user_id', 'timestamp'])
reviews_df = reviews_df.reset_index(drop=True)

cate_list = [meta_df['genres'][i] for i in range(len(vid_map))]
cate_list = np.array(cate_list, dtype=np.int32)

cate_list = np.insert(cate_list, 0, 0)

with open('remap.pkl', 'wb') as f:
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid, time(sorted)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line
    pickle.dump((user_count, item_count, cate_count, example_count),
              f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((vid_key, cat_key, uid_key), f, pickle.HIGHEST_PROTOCOL)


  reviews_df  = pd.read_csv(path,sep='::',header=None)
  meta_df  = pd.read_csv(path,sep='::',header=None)


user_count: 69878	item_count: 10681	cate_count: 20	example_count: 10000054


In [2]:
meta_df

Unnamed: 0,item_id,genres
0,1,3
1,2,3
2,3,6
3,4,6
4,5,6
...,...,...
10676,10677,3
10677,10678,7
10678,10679,6
10679,10680,9


In [3]:
reviews_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,583,1.0,838983339
1,1,229,1.0,838983392
2,1,314,1.0,838983392
3,1,326,1.0,838983392
4,1,290,1.0,838983421
...,...,...,...,...
10000049,69878,1753,0.0,912649171
10000050,69878,258,1.0,912649271
10000051,69878,1662,1.0,912649271
10000052,69878,882,1.0,912649403


In [4]:

pos_cnt, neg_cnt = 0, 0
for userId, hist in reviews_df.groupby('user_id'):
    movie_list = hist['item_id'].tolist()
    label_list = hist['rating'].tolist()

    pos_cnt += sum(label_list)
    neg_cnt += len(label_list) - sum(label_list)
    
train_hist_time, test_time = list(np.quantile(reviews_df.timestamp, [0.80, 0.90]))
train_df = reviews_df[reviews_df.timestamp <= test_time]
test_df = reviews_df[reviews_df.timestamp > test_time]

if train_df.shape[0]+test_df.shape[0] == reviews_df.shape[0]:
    print("Split Correct!")
else:
    print("Split Error!")

Split Correct!


In [5]:
train_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,583,1.0,838983339
1,1,229,1.0,838983392
2,1,314,1.0,838983392
3,1,326,1.0,838983392
4,1,290,1.0,838983421
...,...,...,...,...
10000049,69878,1753,0.0,912649171
10000050,69878,258,1.0,912649271
10000051,69878,1662,1.0,912649271
10000052,69878,882,1.0,912649403


In [6]:
# train_matrix = train_df.pivot(index='user_id', columns='item_id', values='rating')
train_matrix = np.zeros((user_count+1, item_count+1))

In [7]:
for row in train_df.to_numpy():
    train_matrix[int(row[0]),int(row[1])]=row[2]

In [8]:
train_matrix.shape

(69879, 10682)

In [10]:
train_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
import torch

In [12]:
import gc
gc.collect()
gc.collect(1)

0

In [13]:
torch.cuda.is_available()

True

In [14]:
train_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,583,1.0,838983339
1,1,229,1.0,838983392
2,1,314,1.0,838983392
3,1,326,1.0,838983392
4,1,290,1.0,838983421
...,...,...,...,...
10000049,69878,1753,0.0,912649171
10000050,69878,258,1.0,912649271
10000051,69878,1662,1.0,912649271
10000052,69878,882,1.0,912649403


In [15]:
import torch

In [16]:
train_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
U, S, Vh = np.linalg.svd(train_matrix, full_matrices=False)

In [18]:
U.shape

(69879, 10682)

In [19]:
S.shape

(10682,)

In [20]:
U.shape

(69879, 10682)

In [22]:
U,S,Vh

(array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [-9.05928990e-04,  4.22786238e-03,  6.57099872e-04, ...,
          1.42589714e-03, -4.58015249e-03,  1.27587293e-03],
        [-4.57438246e-04,  9.17833084e-04, -1.09604442e-04, ...,
          9.46840373e-04,  1.53807180e-03, -2.14491005e-03],
        ...,
        [-4.85495026e-03, -4.01276691e-03,  5.02843166e-03, ...,
          1.65278770e-03,  1.74995931e-04,  2.26325716e-03],
        [-1.74490998e-03,  4.87903342e-03,  3.76844723e-03, ...,
         -1.77750783e-04, -3.18936826e-04,  2.02789445e-04],
        [-1.57972013e-03,  1.63055804e-03, -1.46243965e-03, ...,
          5.40636413e-05,  2.12251862e-04,  6.38077149e-04]]),
 array([8.29463186e+02, 3.62466615e+02, 3.12374503e+02, ...,
        3.01144465e-15, 2.28341271e-15, 1.01450634e-15]),
 array([[ 0.00000000e+00, -8.82386812e-02, -1.93650281e-02, ...,
          0.00000000e+00,  0.00000000e+00,  0

In [23]:


user_train_df = train_df
user_train_df = user_train_df.reset_index(drop=True)

item_train_df = train_df.sort_values(['item_id', 'timestamp'])
item_train_df = item_train_df.reset_index(drop=True)

train_hist_df = reviews_df[reviews_df.timestamp <= train_hist_time]

pos_train_hist_df = train_hist_df.drop(train_hist_df[train_hist_df['rating']==0].index)
pos_train_df = train_df.drop(train_df[train_df['rating']==0].index)

recent_len = 20

pos_user_train_hist_dict = {}
recent_user_train_hist_dict = {}
for user_id, hist in pos_train_hist_df.groupby('user_id'):
    item_list = hist['item_id'].tolist()
    pos_user_train_hist_dict[user_id] = item_list
    recent_user_train_hist_dict[user_id] = item_list[-recent_len:]


pos_user_train_dict = {}
recent_user_train_dict = {}
for user_id, hist in pos_train_df.groupby('user_id'):
    item_list = hist['item_id'].tolist()
    pos_user_train_dict[user_id] = item_list
    recent_user_train_dict[user_id] = item_list[-recent_len:]


pos_item_train_hist_dict = {}
recent_item_train_hist_dict = {}
for item_id, hist in pos_train_hist_df.groupby('item_id'):
    user_list = hist['user_id'].tolist()
    pos_item_train_hist_dict[item_id] = user_list
    recent_item_train_hist_dict[item_id] = user_list[-recent_len:]


pos_item_train_dict = {}
recent_item_train_dict = {}
for item_id, hist in pos_train_df.groupby('item_id'):
    user_list = hist['user_id'].tolist()
    pos_item_train_dict[item_id] = user_list
    recent_item_train_dict[item_id] = user_list[-recent_len:]


train_eval_df = reviews_df[(reviews_df.timestamp > train_hist_time) & (reviews_df.timestamp <= test_time)]

train_hist_row = []
train_hist_col = []
for user in list(pos_user_train_hist_dict.keys()):
    for item in pos_user_train_hist_dict[user]:
        train_hist_row.append(user)
        train_hist_col.append(item)

train_hist_edge = np.ones(len(train_hist_row))
train_hist_row = np.array(train_hist_row)
train_hist_col = np.array(train_hist_col)
train_hist_mat = csr_matrix((train_hist_edge, (train_hist_row, train_hist_col)), shape=(user_count+1, item_count+1))

i_cluster_list = cate_list

train_hist_ic_row = []
train_hist_ic_col = []

for item in range(len(i_cluster_list)):
    train_hist_ic_row.append(item)
    train_hist_ic_col.append(i_cluster_list[item])

train_hist_ic_edge = np.ones(len(train_hist_ic_row))
train_hist_ic_row = np.array(train_hist_ic_row)
train_hist_ic_col = np.array(train_hist_ic_col)
train_hist_ic_mat = csr_matrix((train_hist_ic_edge, (train_hist_ic_row, train_hist_ic_col)), shape=(item_count+1, len(cat_map)+1))

train_hist_u_1ord_mat = train_hist_mat*train_hist_ic_mat
train_hist_u_1ord_mat_dense = train_hist_u_1ord_mat.todense()

In [24]:
dataframe = pd.read_csv('top10')

In [25]:
dataframe = dataframe.iloc[:,1:]

In [26]:
dataframe

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,2211,3030,3666,3925,8578,9501,9585,9896,10613,2009
1,56,60,125,991,1884,1960,2010,2057,2078,2079
2,7,39,64,68,92,117,121,128,185,235
3,11,17,52,58,93,194,222,230,279,348
4,19,38,54,65,69,87,88,101,103,108
...,...,...,...,...,...,...,...,...,...,...
10676,936,993,1059,1330,1337,1731,1804,1954,1967,1968
10677,1034,1167,1379,1383,1790,1991,2047,3677,4673,4821
10678,75,82,84,95,105,132,155,170,177,193
10679,28,35,46,49,74,85,104,130,139,178


In [27]:
dataframe.iloc[0].to_numpy()

array([ 2211,  3030,  3666,  3925,  8578,  9501,  9585,  9896, 10613,
        2009])

In [28]:



from sklearn import preprocessing
from sklearn.cluster import KMeans
train_hist_u_1ord_mat_normalized = preprocessing.normalize(np.asarray(train_hist_u_1ord_mat_dense), norm='l2')
cluster_fit = KMeans(n_clusters=20, random_state=0).fit(train_hist_u_1ord_mat_normalized)

with open('uc_cluster_kmeans.pkl', 'wb') as f:
    pickle.dump(cluster_fit.labels_, f, pickle.HIGHEST_PROTOCOL) # uid, iid, time(sorted)

with open('./uc_cluster_kmeans.pkl', 'rb') as f:
    u_cluster_list = pickle.load(f, encoding='latin1')


train_hist_uc_row = []
train_hist_uc_col = []

for user in range(len(u_cluster_list)):
    train_hist_uc_row.append(user)
    train_hist_uc_col.append(u_cluster_list[user])

train_hist_uc_edge = np.ones(len(train_hist_uc_row))
train_hist_uc_row = np.array(train_hist_uc_row)
train_hist_uc_col = np.array(train_hist_uc_col)
train_hist_uc_mat = csr_matrix((train_hist_uc_edge, (train_hist_uc_row, train_hist_uc_col)), shape=(user_count+1, len(set(u_cluster_list))))

train_hist_u_2ord_mat = train_hist_mat*(train_hist_mat.T*train_hist_uc_mat)
train_hist_i_2ord_mat = train_hist_mat.T*(train_hist_mat*train_hist_ic_mat)

train_hist_u_2ord_mat_dense = train_hist_u_2ord_mat.todense()
train_hist_i_2ord_mat_dense = train_hist_i_2ord_mat.todense()

train_hist_u_1ord_mat = train_hist_mat*train_hist_ic_mat
train_hist_i_1ord_mat = train_hist_mat.T*train_hist_uc_mat

train_hist_u_1ord_mat_dense = train_hist_u_1ord_mat.todense()
train_hist_i_1ord_mat_dense = train_hist_i_1ord_mat.todense()

train_hist_u_1ord_mat_dense_arr = train_hist_u_1ord_mat_dense.A
train_hist_u_2ord_mat_dense_arr = train_hist_u_2ord_mat_dense.A
train_hist_i_1ord_mat_dense_arr = train_hist_i_1ord_mat_dense.A
train_hist_i_2ord_mat_dense_arr = train_hist_i_2ord_mat_dense.A

train_eval_df = train_eval_df.reset_index(drop=True)

train_data = []
for idx, row in train_eval_df.iterrows():
    if idx % 100000 == 0:
        print("now have processed %d"%idx)
    now_user = row[0]
    now_item = row[1]
    now_user = int(now_user)
    now_item = int(now_item)
    if (now_user not in pos_user_train_hist_dict.keys()) or (now_item not in pos_item_train_hist_dict.keys()):
        continue
    now_label = row[2]
    now_user_1hop = train_hist_u_1ord_mat_dense_arr[now_user]
    now_user_2hop = train_hist_u_2ord_mat_dense_arr[now_user]
    now_item_1hop = train_hist_i_1ord_mat_dense_arr[now_item]
    now_item_2hop = train_hist_i_2ord_mat_dense_arr[now_item]
    user_recent = []
    user_recent.extend(recent_user_train_hist_dict[now_user])
    if len(user_recent) < recent_len:
        pad = [0 for i in range(recent_len-len(user_recent))]
        user_recent.extend(pad)
    item_recent = []
    item_recent.extend(recent_item_train_hist_dict[now_item])
    if len(item_recent) < recent_len:
        pad = [0 for i in range(recent_len-len(item_recent))]
        item_recent.extend(pad)
    now_train_seq = np.concatenate([np.array([now_user]), now_user_1hop, now_user_2hop, user_recent, np.array([now_item]), now_item_1hop, now_item_2hop, item_recent, np.array([now_label]), dataframe.iloc[now_item-1].to_numpy()], axis=0)
    train_data.append(now_train_seq)


train_data = np.array(train_data)
train_row = []
train_col = []
for user in list(pos_user_train_dict.keys()):
    for item in pos_user_train_dict[user]:
        train_row.append(user)
        train_col.append(item)

train_edge = np.ones(len(train_row))
train_row = np.array(train_row)
train_col = np.array(train_col)
train_mat = csr_matrix((train_edge, (train_row, train_col)), shape=(user_count+1, item_count+1))


train_uc_row = []
train_uc_col = []

train_ic_row = []
train_ic_col = []

for user in range(len(u_cluster_list)):
    train_uc_row.append(user)
    train_uc_col.append(u_cluster_list[user])

train_uc_edge = np.ones(len(train_uc_row))
train_uc_row = np.array(train_uc_row)
train_uc_col = np.array(train_uc_col)

for item in range(len(i_cluster_list)):
    train_ic_row.append(item)
    train_ic_col.append(i_cluster_list[item])

train_ic_edge = np.ones(len(train_ic_row))
train_ic_row = np.array(train_ic_row)
train_ic_col = np.array(train_ic_col)

train_uc_mat = csr_matrix((train_uc_edge, (train_uc_row, train_uc_col)), shape=(user_count+1, len(set(u_cluster_list))))
train_ic_mat = csr_matrix((train_ic_edge, (train_ic_row, train_ic_col)), shape=(item_count+1, len(cat_map)+1))

train_u_2ord_mat = train_mat*(train_mat.T*train_uc_mat)
train_i_2ord_mat = train_mat.T*(train_mat*train_ic_mat)

train_u_2ord_mat_dense = train_u_2ord_mat.todense()
train_i_2ord_mat_dense = train_i_2ord_mat.todense()

train_u_1ord_mat = train_mat*train_ic_mat
train_i_1ord_mat = train_mat.T*train_uc_mat

train_u_1ord_mat_dense = train_u_1ord_mat.todense()
train_i_1ord_mat_dense = train_i_1ord_mat.todense()

train_u_1ord_mat_dense_arr = train_u_1ord_mat_dense.A
train_u_2ord_mat_dense_arr = train_u_2ord_mat_dense.A
train_i_1ord_mat_dense_arr = train_i_1ord_mat_dense.A
train_i_2ord_mat_dense_arr = train_i_2ord_mat_dense.A


test_df = test_df.reset_index(drop=True)

test_data = []
for idx, row in test_df.iterrows():
    if idx % 100000 == 0:
        print("now have processed %d"%idx)
    now_user = row[0]
    now_item = row[1]
    now_user = int(now_user)
    now_item = int(now_item)
    if (now_user not in pos_user_train_dict.keys()) or (now_item not in pos_item_train_dict.keys()):
        continue
    now_label = row[2]
    now_user_1hop = train_u_1ord_mat_dense_arr[now_user]
    now_user_2hop = train_u_2ord_mat_dense_arr[now_user]
    now_item_1hop = train_i_1ord_mat_dense_arr[now_item]
    now_item_2hop = train_i_2ord_mat_dense_arr[now_item]
    user_recent = []
    user_recent.extend(recent_user_train_dict[now_user])
    if len(user_recent) < recent_len:
        pad = [0 for i in range(recent_len-len(user_recent))]
        user_recent.extend(pad)
    item_recent = []
    item_recent.extend(recent_item_train_dict[now_item])
    if len(item_recent) < recent_len:
        pad = [0 for i in range(recent_len-len(item_recent))]
        item_recent.extend(pad)
    now_test_seq = np.concatenate([np.array([now_user]), now_user_1hop, now_user_2hop, user_recent, np.array([now_item]), now_item_1hop, now_item_2hop, item_recent, np.array([now_label]), dataframe.iloc[now_item-1]], axis=0)
    test_data.append(now_test_seq)


test_data = np.array(test_data)
test_data.shape

u_cluster_num = len(set(u_cluster_list))
i_cluster_num = len(cat_map)




now have processed 0


  now_user = row[0]
  now_item = row[1]
  now_label = row[2]


now have processed 100000
now have processed 200000
now have processed 300000
now have processed 400000
now have processed 500000
now have processed 600000
now have processed 700000
now have processed 800000
now have processed 900000
now have processed 1000000
now have processed 0


  now_user = row[0]
  now_item = row[1]
  now_label = row[2]


now have processed 100000
now have processed 200000
now have processed 300000
now have processed 400000
now have processed 500000
now have processed 600000
now have processed 700000
now have processed 800000
now have processed 900000
now have processed 1000000


In [29]:
with open('../ml-10m.pkl', 'wb') as f:
    pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(U, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(Vh, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(u_cluster_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(i_cluster_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((user_count, item_count, cate_count, u_cluster_num, i_cluster_num), f, pickle.HIGHEST_PROTOCOL)