In [1]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import torch

from torch_sparse import SparseTensor, matmul

# Open hun data

In [2]:
df = pd.read_csv('./data/Beauty.csv').rename({"user_id":"u", "item_id":"i", "time":"t"}, axis=1)
df.head()
display(df)

Unnamed: 0,u,i,t
0,0,0,476496000
1,1,0,486432000
2,2,1,482803200
3,3,1,474422400
4,4,1,475372800
...,...,...,...
394903,15569,57287,496454400
394904,6783,57287,497232000
394905,35430,57288,496800000
394906,3542,57288,496886400


In [3]:
# Prep
def refine_time(data):
    """
    assures items bought by a user don't have the exact same time
    5, 1, 2, 2, 8 -> 1, 2, 3, 5, 8
    """
    
    data = data.sort_values(['t'], kind='mergesort')
    time_seq = data['t'].values
    time_gap = 1
    
    for i, da in enumerate(time_seq[0:-1]):
        if time_seq[i] == time_seq[i+1] or time_seq[i] > time_seq[i+1]:
            time_seq[i+1] = time_seq[i+1] + time_gap
            time_gap += 1
            
    data['t'] = time_seq
    
    return  data

def remove_less_than_n_transactions_users(dataf, n):
    transactions_per_customer = dataf['u'].value_counts()
    
    valid_customers = transactions_per_customer[transactions_per_customer>=n].index
    
    return dataf[dataf['u'].isin(valid_customers)]

print("Re-ordering and fixing time sequences...")
df = df.groupby('u').apply(refine_time).reset_index(drop=True)
df['t'] = df['t'].astype('int64')


# This does not work yet since the u's and i's need to be remapped to a continuous range again
# min_n = 5
# print(f"Removing users with less than {min_n} transactions")
# df = remove_less_than_n_transactions_users(df, min_n)


df

Re-ordering and fixing time sequences...


Unnamed: 0,u,i,t
0,0,12887,473731200
1,0,49582,475372800
2,0,0,476496000
3,0,4732,476496001
4,0,5760,476496002
...,...,...,...
394903,52201,57191,493689601
394904,52202,57190,493603200
394905,52202,57191,493603201
394906,52203,57277,490924800


In [4]:
min(df['u']), max(df['u'])

(0, 52203)

In [5]:
min(df['i']), max(df['i'])

(0, 57288)

In [6]:
# create inverted lists

# n is max number of recent transactions per node sampled
n = 10

# [[0, 0, 0, 0, 0, 0, 0, 0], # 0 is dummy
#  [i, i, i, i, i, i, 0, 0]]
u_connection_list = [np.zeros(n)]
u_transaction_list = [np.zeros(n)]
for u, us_transactions in df.groupby('u'):
    bought = us_transactions['i'].values[-n:]
    
    zero_padded = np.zeros(n)
    zero_padded[:len(bought)] = bought + 1 # offset by 1 for dummy
    
    u_connection_list.append(zero_padded)
    
    transaction_idx = us_transactions['i'].index.values[-n:]
    
    zero_padded_t = np.zeros(n)
    zero_padded_t[:len(transaction_idx)] = transaction_idx
    
    u_transaction_list.append(zero_padded_t)

print("Created user dictionaries")

i_connection_list = [np.zeros(n)]
i_transaction_list = [np.zeros(n)]
for i, is_transactions in df.groupby('i'):
    bought = is_transactions['u'].values[-n:]
    
    zero_padded = np.zeros(n)
    zero_padded[:len(bought)] = bought + 1 # offset by 1 for dummy
    
    i_connection_list.append(zero_padded)
    
    transaction_idx = is_transactions['u'].index.values[-n:]
    
    zero_padded_t = np.zeros(n)
    zero_padded_t[:len(transaction_idx)] = transaction_idx
    
    i_transaction_list.append(zero_padded_t)

print("Created item dictionaries")

u_connections = np.stack(u_connection_list).astype(np.int32)
u_transactions = np.stack(u_transaction_list).astype(np.int32)

i_connections = np.stack(i_connection_list).astype(np.int32)
i_transactions = np.stack(i_transaction_list).astype(np.int32)

print(f"Created database of {len(u_connections)} users and {len(i_connections)} items")

Created user dictionaries
Created item dictionaries
Created database of 52205 users and 57290 items


In [7]:
for u, u_transa in df.groupby('u'):
    if u < 41:
        continue
        
        
    bought = u_transa['i'].values[-n:]
    
    print(u_transactions)
    print(bought)
    
    break

[[     0      0      0 ...      0      0      0]
 [    32     33     34 ...     39     40     41]
 [    42     43     44 ...      0      0      0]
 ...
 [394902 394903      0 ...      0      0      0]
 [394904 394905      0 ...      0      0      0]
 [394906 394907      0 ...      0      0      0]]
[ 8065 21024 21341  1479 54757    17]


In [8]:
df

Unnamed: 0,u,i,t
0,0,12887,473731200
1,0,49582,475372800
2,0,0,476496000
3,0,4732,476496001
4,0,5760,476496002
...,...,...,...
394903,52201,57191,493689601
394904,52202,57190,493603200
394905,52202,57191,493603201
394906,52203,57277,490924800


In [9]:
u_transactions

array([[     0,      0,      0, ...,      0,      0,      0],
       [    32,     33,     34, ...,     39,     40,     41],
       [    42,     43,     44, ...,      0,      0,      0],
       ...,
       [394902, 394903,      0, ...,      0,      0,      0],
       [394904, 394905,      0, ...,      0,      0,      0],
       [394906, 394907,      0, ...,      0,      0,      0]])

In [10]:
df.loc[u_transactions[42]]

Unnamed: 0,u,i,t
377,41,8065,449798400
378,41,21024,463968000
379,41,21341,463968001
380,41,1479,472953600
381,41,54757,485308800
382,41,17,494121600
0,0,12887,473731200
0,0,12887,473731200
0,0,12887,473731200
0,0,12887,473731200


# Make Data

In [11]:
all_users = df['u'].unique()
f"There are {len(all_users)} users"

'There are 52204 users'

In [12]:
def get_user_network(index, m=2):
    # our final subgraph
    u_m = np.array([0]) # 0 is dummy
    i_m = np.array([0])
    
    # our final transactions
    transactions_m = np.array([0])
    
    # currently sampling
    u_temp = np.array([index+1])
    
    i_temp = u_connections[u_temp]
    
    new_transactions = u_transactions[u_temp].flatten()
    transactions_m = np.union1d(transactions_m, new_transactions)
        
    for j in range(m):
        new_users = np.unique(i_connections[i_temp])
        u_temp = np.union1d(u_temp, new_users)
        
        new_transactions = i_transactions[i_temp].flatten()
        transactions_m = np.union1d(transactions_m, new_transactions)
        
        u_temp = np.setdiff1d(u_temp, u_m, assume_unique=True)
        u_m = np.union1d(u_m, u_temp)
        
        if len(u_temp)==0:
            break
            
        new_items = np.unique(u_connections[u_temp])
        i_temp = np.union1d(i_temp, new_items)
        
        new_transactions = u_transactions[u_temp].flatten()
        transactions_m = np.union1d(transactions_m, new_transactions)
        
        i_temp = np.setdiff1d(i_temp, i_m, assume_unique=True)
        i_m = np.union1d(i_temp, i_m)
        
        if len(i_temp)==0:
            break
    
    # [1:] to ignore first element since its dummy 0
    # -1 to offset back (it was offset to allow for dummy 0)
    return u_m[1:]-1, i_m[1:]-1, transactions_m[1:]

user_ids, item_ids, transaction_ids = get_user_network(41)

display(user_ids, user_ids.shape, item_ids, item_ids.shape, transaction_ids, transaction_ids.shape)

array([   39,    40,    41, ..., 52093, 52094, 52095])

(2077,)

array([   17,    43,    84, ..., 57264, 57267, 57277])

(7941,)

array([   339,    340,    341, ..., 394238, 394239, 394240])

(14131,)

In [13]:
# # run on all users
# st = time.time()
# selected_nodes = {}
# for u in tqdm(all_users[::-1]):
#     users, items, trans = get_user_network(u)
    
#     selected_nodes[u] = trans
    
# print(f"{time.time()-st} seconds")

In [14]:
import matplotlib.pyplot as plt
from torch_geometric.data import HeteroData

In [96]:
def make_graph_object(user_index, transaction_ids):
    """
    Makes PyTorch Heterograph not temporal for now
    """
    data = HeteroData()
    
    sub_df = df.loc[transaction_ids]
    print(sub_df)

    # get important transactions
    user_sequence_df = sub_df[sub_df['u']==user_index]
    final_item = user_sequence_df.iloc[-1]

    # remove transactions from the future
    sub_df = sub_df[sub_df['t'] < final_item['t']]
    
    if len(sub_df) < 1:
        return data

    # make graph
    
    # remap
    mapping_u = {u_id : i for i, u_id in enumerate(sub_df['u'].unique())}
    mapping_i = {i_id : i for i, i_id in enumerate(sub_df['i'].unique())}

    sub_df['u'] = sub_df['u'].map(mapping_u)
    sub_df['i'] = sub_df['i'].map(mapping_i)

    # make edge index
    users = torch.tensor(sub_df['u'].values)
    items = torch.tensor(sub_df['i'].values)

    # make edge weights
    relative_time = final_item['t'] - sub_df['t'].values
    weights = torch.tensor(1 - relative_time / max(relative_time))**3 # SHOULD BE EXPERIMENTED WITH
    
    # build object
    
    data['u'].x = torch.tensor(list(mapping_u.keys()))
    data['i'].x = torch.tensor(list(mapping_i.keys()))

    data['u', 'bought', 'i'].edge_index = torch.sparse_coo_tensor(
        torch.stack((users, items)),
        weights,
        size=(len(mapping_u), len(mapping_i))
    ).coalesce()
    
    data.y = final_item['i']   
    
    return data

user_ids, item_ids, transaction_ids = get_user_network(41)
make_graph_object(41, transaction_ids)

            u      i          t
339        39  34948  486604802
340        39  47011  486604803
341        39     17  487382400
342        39  48178  487382404
343        39  11203  487468800
...       ...    ...        ...
394236  52095  55660  473385602
394237  52095  55668  473385603
394238  52095  55669  473385604
394239  52095  56262  476928000
394240  52095  56511  480729600

[14131 rows x 3 columns]


HeteroData(
  y=17,
  [1mu[0m={ x=[2030] },
  [1mi[0m={ x=[7444] },
  [1m(u, bought, i)[0m={ edge_index=[2030, 7444] }
)

# O_iu experiment

In [98]:
df.head()

Unnamed: 0,u,i,t
0,0,12887,473731200
1,0,49582,475372800
2,0,0,476496000
3,0,4732,476496001
4,0,5760,476496002


In [115]:
def add_oui_column(dataf):
    """
    oui = o^u_i = order of u−i interaction
    = the position of item i in all items that the u has interacted with
    """
    dataf.loc[:,"oui"] = dataf.groupby("i")["t"].rank("first")
    return dataf

def add_oiu_column(dataf):
    """
    oiu = o^i_u = order of i−u interaction
    = the position of user u in all users that the i has interacted with
    """
    dataf.loc[:,"oiu"] = dataf.groupby("u")["t"].rank("first")
    return dataf

df = add_oui_column(df)
df = add_oiu_column(df)
df.head()

Unnamed: 0,u,i,t,oui,oiu
0,0,12887,473731200,2.0,1.0
1,0,49582,475372800,96.0,2.0
2,0,0,476496000,1.0,3.0
3,0,4732,476496001,46.0,4.0
4,0,5760,476496002,5.0,5.0


In [119]:
df[df['u']==0].sort_values('t')

Unnamed: 0,u,i,t,oui,oiu
0,0,12887,473731200,2.0,1.0
1,0,49582,475372800,96.0,2.0
2,0,0,476496000,1.0,3.0
3,0,4732,476496001,46.0,4.0
4,0,5760,476496002,5.0,5.0
5,0,10844,476496003,6.0,6.0
6,0,11209,476496004,15.0,7.0
7,0,26874,476496005,35.0,8.0
8,0,37881,476496006,16.0,9.0
9,0,39166,476496007,5.0,10.0


In [112]:
# oui = o^u_i
# = order of u−i interaction
# = the position of item i in all items that the u has interacted

In [113]:
df.sort_values("t")[0:50]

Unnamed: 0,u,i,t,oui
5805,421,102,68428800,1.0
6009,422,101,73267200,1.0
8605,585,432,89164800,1.0
5806,421,347,115084800,1.0
5807,421,344,115430400,1.0
5808,421,345,115430401,1.0
3264,225,489,117676800,1.0
3265,225,417,121564800,1.0
42292,3467,432,127008000,2.0
3447,226,90,127353600,1.0


In [16]:
np.random.shuffle(all_users)
min_graph_size = 100


st = time.time()

graphs = []

failed = 0
for u in tqdm(all_users[:10]):
    user_ids, item_ids, transaction_ids = get_user_network(u)
    
    if len(transaction_ids) < min_graph_size:
        failed += 1
        continue
        
    graph = make_graph_object(u, transaction_ids)
    
    del(user_ids)
    del(item_ids)
    del(transaction_ids)
    
    if len(graph) > 0:
        graphs.append(graph)
    
print(f"{time.time()-st} seconds")

  0%|          | 0/10 [00:00<?, ?it/s]

0.10800027847290039 seconds


# DGSR

In [18]:
import torch.nn as nn

In [86]:
def sparse_dense_mul(s, d):
    i = s._indices()
    v = s._values()
    dv = d[i[0,:], i[1,:]]  # get values from relevant entries of dense matrix
    return torch.sparse.FloatTensor(i, v * dv, s.size())

def add_messages(messages, adjacency):
    output = torch.zeros((adjacency.shape[0], messages.shape[1]), dtype=float)
        
    rows, cols = adjacency._indices()
    output.index_add_(0, rows, item_messages[cols] * adjacency._values().unsqueeze(-1))
    
    return output

In [93]:
class DGSRConv(nn.Module):
    def __init__(self):
        pass
        
    def forward(self, graph):
        pass
        

class DGSRNetwork(nn.Module):
    def __init__(self,
                 user_num, item_num,
                 hidden_size,
                 user_max, item_max
                ):
        super().__init__()
        """ init """
        self.user_vocab_num = user_num
        self.item_vocab_num = item_num
        
        self.user_max = user_max
        self.item_max = item_max
        
        self.hidden_size = hidden_size
        self.sqrt_d = np.sqrt(self.hidden_size)
        
        """ layers """
        self.user_embedding = nn.Embedding(self.user_vocab_num, self.hidden_size)
        self.item_embedding = nn.Embedding(self.item_vocab_num, self.hidden_size)
        
        self.w1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # Long Term User
        self.w2 = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # Long Term Item
        
        self.w3 = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # Short Term User
        self.w4 = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # Short Term Item
        
        self.wp = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # Recommendations
        
        self.pV = nn.Embedding(self.user_max, self.hidden_size) # user positional embedding
        self.pK = nn.Embedding(self.item_max, self.hidden_size) # item positional embedding
        
        
    def forward(self, graph):
        # DEBUG
        users_in_graph = graph['u'].x.shape[0]
        items_in_graph = graph['i'].x.shape[0]
        print(f"Working with {users_in_graph} users, {items_in_graph} items")
        
        # turn node ids into the learned features
        u_embedded = self.user_embedding(graph['u'].x) # (u, h)
        i_embedded = self.item_embedding(graph['i'].x) # (i, h)
        
        # --- long term ---
        user_messages = self.w1(u_embedded) # (u, h)
        item_messages = self.w2(i_embedded) # (i, h)
        
        # - users to items -
        
        # calculate attention
        # TODO +p
        e_ui = (user_messages) @ (item_messages).T / self.sqrt_d # (u, i)
        
        bought_e_ui = sparse_dense_mul(graph['u', 'bought', 'i'].edge_index, e_ui) # (u, i)
        alphas = torch.sparse.softmax(bought_e_ui, 1) # (u, i)
        
        # TODO +p
        longterm_hu = add_messages(item_messages, alphas)
        
        # - items to users -
        
        # calculate attention
        # TODO +p
        e_iu = (item_messages) @ (user_messages).T / self.sqrt_d # (i, u)
        
        bought_e_iu = sparse_dense_mul(torch.transpose(graph['u', 'bought', 'i'].edge_index, 0, 1), e_iu)
        betas = torch.sparse.softmax(bought_e_iu, 1) # (u, i)
        
        longterm_hi = add_messages(user_messages, betas)
        
        
        
"""
Make network
"""
user_num = len(df['u'].unique())
item_num = len(df['i'].unique())

hidden_size = 64
        
network = DGSRNetwork(user_num, item_num, hidden_size, user_max=n, item_max=n)

"""
Forward that shit
"""

graph = graphs[0]
out = network(graph)

# Debug
# users_in_graph, hidden_size, alphas, item_messages = out

Working with 164 users, 463 items


In [95]:
n

10

In [84]:
add_messages(item_messages, alphas)

tensor([[-4.3832e-04, -2.3621e-02, -3.4476e-01,  ..., -3.5867e-01,
         -2.6082e-01,  3.0540e-01],
        [ 1.4006e-01, -2.4459e-01, -1.0979e-01,  ..., -5.2434e-01,
         -3.1957e-01,  9.8245e-02],
        [ 7.5556e-01,  9.6443e-01, -1.7022e-01,  ..., -5.6743e-01,
          2.9971e-01,  2.3845e-01],
        ...,
        [ 2.4882e-01,  1.6368e-01, -3.3535e-01,  ...,  1.3057e-01,
          3.2077e-01, -1.0604e-01],
        [-8.3952e-01,  6.8585e-01, -2.2096e-01,  ..., -7.4191e-01,
         -3.8607e-01,  9.3357e-01],
        [-1.2401e+00,  1.4072e-02,  1.6794e-01,  ...,  6.0907e-01,
         -2.2924e-02, -5.7335e-01]], dtype=torch.float64,
       grad_fn=<IndexAddBackward0>)

In [53]:
longterm_hu = torch.zeros((users_in_graph, hidden_size))  # (u, h)
rows, cols = alphas._indices()
longterm_hu.index_add_(0, rows, item_messages[cols])

tensor([[-0.0227,  0.0445, -3.0830,  ..., -3.2838, -2.3934,  2.4307],
        [ 0.1032, -0.8667, -0.4711,  ..., -1.5320, -1.1399,  0.3332],
        [ 0.7556,  0.9644, -0.1702,  ..., -0.5674,  0.2997,  0.2384],
        ...,
        [ 2.1481,  0.6528, -1.6534,  ...,  0.5289,  1.0922, -0.3999],
        [-0.8395,  0.6859, -0.2210,  ..., -0.7419, -0.3861,  0.9336],
        [-1.2401,  0.0141,  0.1679,  ...,  0.6091, -0.0229, -0.5733]],
       grad_fn=<IndexAddBackward0>)

In [62]:
rows

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   2,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   4,
          4,   4,   5,   6,   6,   6,   7,   7,   7,   7,   8,   8,   8,   8,
          8,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,  10,  10,
         10,  10,  10,  10,  10,  10,  10,  11,  12,  12,  13,  13,  13,  13,
         13,  13,  14,  15,  15,  15,  15,  16,  17,  18,  18,  18,  18,  19,
         19,  20,  21,  22,  22,  22,  22,  22,  23,  23,  23,  23,  23,  23,
         24,  24,  24,  24,  24,  24,  25,  25,  25,  25,  25,  26,  26,  26,
         26,  26,  26,  26,  26,  26,  27,  27,  27,  27,  27,  27,  28,  28,
         29,  30,  30,  30,  30,  31,  31,  31,  31,  32,  32,  32,  32,  32,
         32,  32,  32,  32,  32,  32,  33,  34,  34,  34,  34,  35,  35,  35,
         35,  35,  35,  36,  36,  37,  37,  38,  39,  39,  40,  40,  40,  40,
         40,  40,  41,  41,  41,  42,  42,  42,  43,  43,  44,  

In [63]:
cols

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,   0,   0,
         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,   0,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  22,  30,  62,  63,
         64,  65,  62,   1,  62,  66,  67,  68,  69,  70,  71,  72,  73,   0,
         74,  75,  76,  33,  63,  77,  78,  79,  80,  81,  82,  83,  84,  85,
         86,  87,  88,  89,  90,  91,  30,  92,  93,  94,  95,  96,  97,  98,
         99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113,  83, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132,  83, 133, 134, 135,   0, 136, 137,
        138, 139, 140, 141, 142, 110, 143, 144,  64, 145,  83, 146, 147, 148,
        149, 150,  83, 151, 152, 153, 154, 155,  65, 156, 157, 1

In [66]:
item_messages[cols].shape

torch.Size([547, 64])

In [70]:
alphas._values().unsqueeze(-1) * item_messages[cols]

tensor([[ 0.0829,  0.1058, -0.0187,  ..., -0.0622,  0.0329,  0.0262],
        [ 0.1037, -0.0168, -0.0627,  ...,  0.0135,  0.0084, -0.0035],
        [ 0.0413,  0.0027, -0.0408,  ..., -0.0449, -0.0937,  0.0287],
        ...,
        [ 0.0103, -0.1945,  0.1866,  ...,  0.2367,  0.2369,  0.1352],
        [-0.8395,  0.6859, -0.2210,  ..., -0.7419, -0.3861,  0.9336],
        [-1.2401,  0.0141,  0.1679,  ...,  0.6091, -0.0229, -0.5733]],
       dtype=torch.float64, grad_fn=<MulBackward0>)