In [2]:
import numpy as np
import pandas as pd
import os
import json
import dgl
import torch
import math
from collections import defaultdict

class Graph(object):
    """Load one dataset from disk and assemble its user/POI/region graph.

    All files are read from ``./dataset/<dataset>/gowalla_travel/``.  The
    constructor loads the edge lists, the id maps and the train/test
    check-ins, derives per-node edge counts, builds a DGL multigraph with
    ``num_users + num_pois + num_regions`` nodes and finally loads the
    initial node embeddings.

    NOTE(review): user, POI and region ids are used directly as node ids of
    the same graph, so they presumably live in one shared index space —
    confirm against the preprocessing that wrote the files.
    """

    def __init__(self, dataset):
        super(Graph, self).__init__()
        self.dataset = dataset
        self.pid2pid, self.rid2rid, self.pid2rid = self.load_datas()
        self.user2id, self.poi2id, self.region2id = self.load_id_map()
        self.users = set(self.user2id.keys())
        self.pois = set(self.poi2id.keys())
        self.regions = set(self.region2id.keys())
        self.num_users, self.num_pois, self.num_regions = map(len, [self.users, self.pois, self.regions])
        print(self.num_users, self.num_pois)
        self.train, self.test = self.load_train_test()
        self.time_train, self.time_test = self.time_convert()
        self.pid_pid_norm, self.user_pid_norm, self.region_region_norm = self.norm()
        self.g = self.build_graph()
        self.neighbors = self.get_neighbors()
        print('build graph done')
        self.embeddings = self.get_init_embeddings()
        print('load embeddings done')
        # self.time_user_records = self.build_records()

    def _data_path(self, name):
        """Return the path of file *name* inside this dataset's folder."""
        return './dataset/' + self.dataset + '/gowalla_travel/' + name

    @staticmethod
    def _load_json(path):
        """Read one JSON file, closing the handle afterwards."""
        with open(path) as f:
            return json.load(f)

    @staticmethod
    def _interval_start(interval):
        """Return the integer between the first character and the first comma."""
        return int(interval.split(',')[0][1:])

    @staticmethod
    def _endpoint_counts(frame, first, second):
        """Count how many rows of *frame* mention each id in either column."""
        counts = defaultdict(int)
        for column in (first, second):
            # ``tolist`` walks plain Python scalars, which is far cheaper
            # than materialising one Series per row with ``iterrows``.
            for node in frame[column].tolist():
                counts[node] += 1
        return counts

    def load_datas(self):
        """Read the three tab-separated, header-less edge lists.

        Returns:
            ``(poi2poi, region2region, poi2region)`` DataFrames with columns
            ``p1/p2/w``, ``r1/r2/w`` and ``p/r/w`` respectively.
        """
        poi2poi = pd.read_csv(self._data_path('poi2poi_train.txt'),
                              sep='\t', header=None, names=['p1', 'p2', 'w'])
        # poi2poi = poi2poi[poi2poi['w'] > 1].reset_index()
        region2region = pd.read_csv(self._data_path('region2region.txt'),
                                    sep='\t', header=None, names=['r1', 'r2', 'w'])
        poi2region = pd.read_csv(self._data_path('poi_region.txt'),
                                 sep='\t', header=None, names=['p', 'r', 'w'])
        return poi2poi, region2region, poi2region

    def load_id_map(self):
        """Load the user, POI and region id maps from their JSON files."""
        user2id = self._load_json(self._data_path('user2id.json'))
        poi2id = self._load_json(self._data_path('poi2id.json'))
        region2id = self._load_json(self._data_path('region2id.json'))
        return user2id, poi2id, region2id

    def load_train_test(self):
        """Read ``train.csv``/``test.csv`` and add an integer ``time`` column.

        ``time`` is parsed from the ``interval`` column: the text between the
        first character and the first comma, converted to ``int``.
        """
        train = pd.read_csv(self._data_path('train.csv'))
        test = pd.read_csv(self._data_path('test.csv'))
        train['time'] = train['interval'].apply(self._interval_start)
        test['time'] = test['interval'].apply(self._interval_start)
        return train, test

    def time_convert(self):
        """Bucket the train and test check-ins by their ``time`` value.

        Returns:
            ``(time_train, time_test)``: dicts mapping a time value to a
            copied integer array of the rows in that bucket.  Columns are
            ``uid, pid, user_region, region`` for the ``'meituan'`` dataset
            and ``uid, pid, region`` otherwise.
        """
        if self.dataset == 'meituan':
            columns = ['uid', 'pid', 'user_region', 'region']
        else:
            columns = ['uid', 'pid', 'region']
        # Group by the scalar key, not ``['time']``: with a one-element list
        # pandas >= 2.0 yields 1-tuples as group keys, which would break the
        # integer lookups (``time_train[t]``) done by the callers.
        time_train = {
            time: group[columns].to_numpy(copy=True)
            for time, group in self.train.groupby('time')
        }
        time_test = {
            time: group[columns].to_numpy(copy=True)
            for time, group in self.test.groupby('time')
        }
        return time_train, time_test

    def norm(self):
        '''calculate the norm of every type of edges

        Each returned ``defaultdict(int)`` maps a node id to the number of
        rows in which it appears as either endpoint: POI-POI rows, training
        check-in rows (users and POIs share one dict) and region-region rows.
        The edge weight column is deliberately not used.
        '''
        pid_pid_norm = self._endpoint_counts(self.pid2pid, 'p1', 'p2')
        user_pid_norm = self._endpoint_counts(self.train, 'uid', 'pid')
        region_region_norm = self._endpoint_counts(self.rid2rid, 'r1', 'r2')
        return pid_pid_norm, user_pid_norm, region_region_norm

    def read_vector(self, file):
        """Parse an embedding text file into ``{key: 1-D float tensor}``.

        The first line holds two whitespace-separated fields (count and
        dimension); every following line is a key followed by the vector
        components.  The count is only printed next to the expected number
        of nodes, it is not enforced.
        """
        vec_dict = {}
        with open(file) as f:
            header = f.readline()
            # Unpacking also checks that the header has exactly two fields.
            count, _dim = header.strip().split()
            # assert count == self.num_users + self.num_pois + self.num_regions
            print('vec', count)
            print('total', self.num_users + self.num_pois + self.num_regions)
            for line in f:
                fields = line.strip().split()
                vec_dict[fields[0]] = torch.tensor([float(x) for x in fields[1:]])
        return vec_dict

    def get_init_embeddings(self):
        """Return the initial ``(num_nodes, 64)`` embedding matrix.

        Rows start from a Xavier-uniform initialisation and are overwritten
        by the vectors found in ``line_embedding.txt``, whose keys are
        interpreted as integer node ids.
        """
        vec_dict = self.read_vector(self._data_path('line_embedding.txt'))
        num_nodes = self.num_users + self.num_pois + self.num_regions
        embeddings = torch.zeros((num_nodes, 64))
        torch.nn.init.xavier_uniform_(embeddings)
        for key, vector in vec_dict.items():
            embeddings[int(key)] = vector
        return embeddings

    def get_neighbors(self):
        """Stack every (node, node) pair of all edge lists into one array.

        Row order is POI-POI, region-region, POI-region, then the training
        user-POI pairs.
        """
        pid2pid = self.pid2pid[['p1', 'p2']].values
        rid2rid = self.rid2rid[['r1', 'r2']].values
        pid2rid = self.pid2rid[['p', 'r']].values
        uid2pid = self.train[['uid', 'pid']].values
        neighbors = np.concatenate((pid2pid, rid2rid, pid2rid, uid2pid), axis=0)
        return neighbors

    def build_graph(self):
        """Build the DGL multigraph holding every relation.

        Each edge carries ``weight``, ``type``, ``time`` and ``norm``:

        * type 0: POI <-> POI (both directions), weight from the file
        * type 1: region <-> region (both directions), weight 1
        * type 2: region -> POI, weight 1, norm 1
        * type ``3 + time // 2``: user -> POI, weight = number of check-ins
        * type ``15 + time // 2``: POI -> user, same weight

        ``time`` is -1 for the first three relations.  Except for type 2,
        ``norm`` is the endpoint count of the edge's destination node.
        """
        g = dgl.DGLGraph(multigraph=True)
        g.add_nodes(self.num_users + self.num_pois + self.num_regions)
        # add poi to poi edges
        g.add_edges(
            self.pid2pid['p1'],
            self.pid2pid['p2'],
            data={'weight': torch.FloatTensor(self.pid2pid['w']),
                  'type': torch.LongTensor([0]*len(self.pid2pid)),
                  'time': torch.IntTensor([-1]*len(self.pid2pid)),
                  'norm': torch.FloatTensor([self.pid_pid_norm[i] for i in self.pid2pid['p2']])
                  }
        )
        g.add_edges(
            self.pid2pid['p2'],
            self.pid2pid['p1'],
            data={'weight': torch.FloatTensor(self.pid2pid['w']),
                  'type': torch.LongTensor([0]*len(self.pid2pid)),
                  'time': torch.IntTensor([-1]*len(self.pid2pid)),
                  'norm': torch.FloatTensor([self.pid_pid_norm[i] for i in self.pid2pid['p1']])
                  }
        )
        # add region to region edges
        g.add_edges(
            self.rid2rid['r1'],
            self.rid2rid['r2'],
            data={'weight': torch.FloatTensor([1] * len(self.rid2rid)),
                  'type': torch.LongTensor([1]*len(self.rid2rid)),
                  'time': torch.IntTensor([-1]*len(self.rid2rid)),
                  'norm': torch.FloatTensor([self.region_region_norm[i] for i in self.rid2rid['r2']])
                  }
        )
        g.add_edges(
            self.rid2rid['r2'],
            self.rid2rid['r1'],
            data={'weight': torch.FloatTensor([1] * len(self.rid2rid)),
                  'type': torch.LongTensor([1]*len(self.rid2rid)),
                  'time': torch.IntTensor([-1]*len(self.rid2rid)),
                  'norm': torch.FloatTensor([self.region_region_norm[i] for i in self.rid2rid['r1']])
                  }
        )
        # add region to poi edges
        g.add_edges(
            self.pid2rid['r'],
            self.pid2rid['p'],
            data={'weight': torch.FloatTensor([1] * len(self.pid2rid)),
                  'type': torch.LongTensor([2] * len(self.pid2rid)),
                  'time': torch.IntTensor([-1]*len(self.pid2rid)),
                  'norm': torch.FloatTensor([1] * len(self.pid2rid))
                  }
        )
        # add region to user edges
        # data1 = self.train[['uid', 'region', 'time']]
        # data1 = data1.groupby(data1.columns.tolist()).size().reset_index().rename(columns={0: 'weight'})
        # data1['type'] = data1['time'].apply(lambda x: 27 + x // 2)
        # g.add_edges(
        #     data1['region'],
        #     data1['uid'],
        #     data={'weight': torch.FloatTensor(data1['weight']),
        #           'type': torch.LongTensor(data1['type']),
        #           'time': torch.IntTensor(data1['time']),
        #           'norm': torch.FloatTensor([self.user_pid_norm[i] for i in data1['uid']])
        #           }
        # )

        # add user to poi edges
        data = self.train[['uid', 'pid', 'time']]
        # Collapse repeated (uid, pid, time) check-ins into one weighted edge.
        data = data.groupby(data.columns.tolist()).size().reset_index().rename(columns={0: 'weight'})
        data['type'] = data['time'].apply(lambda x: 3 + x // 2)
        data['type1'] = data['time'].apply(lambda x: 15 + x // 2)
        g.add_edges(
            data['uid'],
            data['pid'],
            data={'weight': torch.FloatTensor(data['weight']),
                  'type': torch.LongTensor(data['type']),
                  'time': torch.IntTensor(data['time']),
                  'norm': torch.FloatTensor([self.user_pid_norm[i] for i in data['pid']])
                  }
        )
        # add poi to user edges
        g.add_edges(
            data['pid'],
            data['uid'],
            data={'weight': torch.FloatTensor(data['weight']),
                  'type': torch.LongTensor(data['type1']),
                  'time': torch.IntTensor(data['time']),
                  'norm': torch.FloatTensor([self.user_pid_norm[i] for i in data['uid']])
                  }
        )
        return g


if __name__ == '__main__':
    # Build the graph once at module level and print its basic sizes.
    # The training code further down reads this module-level `graph`.
    graph = Graph(dataset='gowalla')
    # print(graph.region_region_norm)
    print(graph.num_users, graph.num_pois, graph.num_regions)
    print(len(graph.train), len(graph.test))
    # rid2rid = graph.rid2rid.values
    # print(graph.region2id.values())
    # neg_rid = np.random.choice(list(graph.region2id.values()), rid2rid.shape[0], replace=True).reshape(-1, 1)
    # rid2rid = np.concatenate((rid2rid, neg_rid), axis=1)
    # print(rid2rid)
    # g = graph.g
    # print(g)

7856 89872




build graph done
vec 153322
total 153322
load embeddings done
7856 89872 55594
508102 127189


In [3]:
import torch
import torch.nn as nn

# Use GPU 0 when CUDA is usable; otherwise fall back to the CPU so that merely
# running this cell does not raise on a machine without a CUDA device.
# Code that calls ``.cuda()`` directly still requires a GPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


class Layer(nn.Module):
    """One relation-aware message-passing layer.

    Holds one ``dim x dim`` matrix per relation type.  A message is the
    source feature scaled by the edge's ``final_weight`` and multiplied by
    the matrix of the edge's ``type``; a node sums its messages, normalises
    the sum, adds its own feature and normalises again.

    Args:
        dim: feature size.
        num_rels: number of relation types (rows of ``weight``).
        activation: optional callable applied before the final
            normalisation; skipped when falsy.
    """

    def __init__(self, dim, num_rels, activation):
        super(Layer, self).__init__()
        self.dim = dim
        self.num_rels = num_rels
        # Allocated uninitialised, then filled by Xavier initialisation.
        self.weight = nn.Parameter(torch.empty(num_rels, dim, dim))
        # self.W = nn.Parameter(torch.Tensor(dim, dim))
        # nn.init.xavier_uniform_(self.W)
        nn.init.xavier_uniform_(self.weight)
        self.activation = activation

    def message_func(self, edges):
        """Return ``{'msg': (num_edges, dim)}`` for a batch of edges."""
        # Out-of-place multiply: the source features handed in by the
        # framework must not be modified as a side effect.
        h = edges.src['h'] * edges.data['final_weight'].unsqueeze(1)
        w = self.weight[edges.data['type']]
        # squeeze(1) removes only the singleton row axis added for bmm; a
        # bare squeeze() would also drop the edge axis for a 1-edge batch.
        msg = torch.bmm(h.unsqueeze(1), w).squeeze(1)
        return {'msg': msg}

    def reduce_func(self, nodes):
        """Aggregate the mailbox and return the new, L2-normalised ``h``."""
        h = nodes.data['h']
        # h = torch.matmul(h, )
        # h = self.W(h)
        m = nodes.mailbox['msg'].sum(dim=1, keepdim=True)
        m = m / m.norm(dim=2, keepdim=True).clamp(min=1e-6)
        h_new = torch.cat((m, h.unsqueeze(1)), 1).sum(dim=1)
        if self.activation:
            h_new = self.activation(h_new)
        # Same clamp as for the messages, so an all-zero row yields zeros
        # instead of NaN.
        return {'h': h_new / h_new.norm(dim=1, keepdim=True).clamp(min=1e-6)}

    def forward(self, nf, i_layer):
        """Run this layer on block ``i_layer`` of *nf* and return *nf*."""
        nf.block_compute(i_layer, self.message_func, self.reduce_func)
        return nf


class STGCN(nn.Module):
    """Stack of ``Layer`` modules on top of a trainable node-embedding table.

    Args:
        num_nodes: number of rows of the embedding table.
        dim: embedding / hidden size.
        num_rels: number of relation types handed to every layer.
        num_layers: number of message-passing layers.
        activation: activation forwarded to every layer (may be ``None``).
        embeddings: tensor installed as the embedding weight (assigned, not
            copied).
    """

    def __init__(self, num_nodes, dim, num_rels, num_layers, activation, embeddings):
        super(STGCN, self).__init__()
        self.dim = dim
        self.num_rels = num_rels
        self.activation = activation
        self.embeddings = nn.Embedding(num_nodes, dim)
        # nn.init.xavier_uniform_(self.embeddings.weight)
        self.embeddings.weight.data = embeddings
        self.num_layers = num_layers
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            self.layers.append(Layer(dim, num_rels, activation))

    def forward(self, nf):
        """Return the features of the last NodeFlow layer.

        Every NodeFlow layer is first seeded with the embeddings of its
        parent-graph node ids, then the message-passing layers run in order.
        """
        # Follow the embedding table instead of hard-coding ``.cuda()`` so
        # the lookup works on whatever device the module was moved to.
        target = self.embeddings.weight.device
        for i in range(nf.num_layers):
            nids = nf.layer_parent_nid(i).to(target)
            nf.layers[i].data['h'] = self.embeddings(nids)
        for i in range(self.num_layers):
            nf = self.layers[i](nf, i)
        result = nf.layers[self.num_layers].data['h']
        return result


class Recommender(nn.Module):
    """
    Recommender
    score = Ut * P + Ut * Lp + P * Lu + Lp * Lu

    Wraps a graph encoder ``gcn`` and trains it with a pairwise
    log-sigmoid ranking loss.  It is a plain model with named children, so
    it derives from ``nn.Module`` rather than the list container
    ``nn.ModuleList``.
    """
    def __init__(self, gcn):
        super(Recommender, self).__init__()
        self.gcn = gcn
        self.logsigmoid = nn.LogSigmoid()

    def _bpr_loss(self, pos_score, neg_score):
        """Return ``-mean(logsigmoid(pos - neg))`` over per-row score sums."""
        margin = pos_score.sum(1) - neg_score.sum(1)
        return -self.logsigmoid(margin).mean()

    def forward(self, nf, data, has_user_region=True):
        """Ranking loss for a batch of check-ins.

        Args:
            nf: NodeFlow passed to the encoder.
            data: index tensor into the encoder output.  Columns are
                ``user, pos_poi, user_region, poi_region, neg_poi,
                neg_poi_region`` when ``has_user_region`` is true, else
                ``user, pos_poi, poi_region, neg_poi, neg_poi_region``.
            has_user_region: selects the column layout and score formula.
        """
        h = self.gcn(nf)
        # already convert to nodeflow id
        user = h[data[:, 0]]
        pos_poi = h[data[:, 1]]
        if has_user_region:
            user_region = h[data[:, 2]]
            poi_region = h[data[:, 3]]
            neg_poi = h[data[:, 4]]
            neg_poi_region = h[data[:, 5]]
            pos_score = user * pos_poi + 0.1 * user * poi_region + pos_poi * user_region + poi_region * user_region
            neg_score = user * neg_poi + 0.1 * user * neg_poi_region + neg_poi * user_region + neg_poi_region * user_region
        else:
            poi_region = h[data[:, 2]]
            neg_poi = h[data[:, 3]]
            neg_poi_region = h[data[:, 4]]
            pos_score = user * pos_poi + user * poi_region
            neg_score = user * neg_poi + user * neg_poi_region
        return self._bpr_loss(pos_score, neg_score)

    def train_region(self, nf, data):
        """Ranking loss on (node, neighbour, negative) index triples."""
        h = self.gcn(nf)
        r1 = h[data[:, 0]]
        r2 = h[data[:, 1]]
        r3 = h[data[:, 2]]
        # The negative is scored against both members of the positive pair.
        return self._bpr_loss(r1 * r2, r1 * r3 + r2 * r3)

    def infer(self, nf):
        """Return the encoder output for *nf* without computing a loss."""
        h = self.gcn(nf)
        return h

NVIDIA GeForce RTX 4060 Laptop GPU with CUDA capability sm_89 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 compute_37.
If you want to use the NVIDIA GeForce RTX 4060 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [4]:
import numpy as np
import torch
import torch.nn.functional as F
import time
import dgl
from functools import partial
from collections import defaultdict
import math

# Use GPU 0 when CUDA is usable; otherwise fall back to the CPU so that merely
# running this cell does not raise on a machine without a CUDA device.
# Code that calls ``.cuda()`` directly still requires a GPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


def edge_func(time0, edges):
    """Compute the time-decayed, normalised weight of every edge.

    For edges whose ``type`` is at least 3 the raw ``weight`` is multiplied
    by ``exp(-|time - time0| / 2)``; edges of type 0-2 get no decay.  The
    result is then scaled by ``1 / norm`` and returned under
    ``'final_weight'``.
    """
    attrs = edges.data
    half_gap = torch.abs(attrs['time'] - time0) / 2
    # Relation types below 3 are exempt from the temporal decay.
    half_gap = half_gap.masked_fill(attrs['type'] < 3, 0)
    decay = torch.exp(-half_gap.float())
    return {'final_weight': attrs['weight'] * decay * (1 / attrs['norm'])}


def edge_func1(edges):
    """Return ``weight / norm`` per edge as ``'final_weight'`` (no decay).

    Only the ``weight`` and ``norm`` edge fields are read, so the function
    also works on edge frames that carry no ``type`` or ``time`` data.
    """
    weight = edges.data['weight']
    norm = edges.data['norm']
    final_weight = weight * (1 / norm)
    return {'final_weight': final_weight}


def recallk(graph, model, dim=64, batch_size=1024, layers=2, samples=5, has_user_region=True):
    """Full-ranking evaluation: print and return hit-rate@k and NDCG@k.

    For every even time slot ``t`` in ``0..22`` the edge weights are
    recomputed for ``t``, embeddings of all nodes are inferred with the
    neighbour sampler, and each test record of that slot is scored against
    every POI listed in ``graph.pid2rid``.

    Args:
        graph: object exposing ``g``, ``time_test``, ``pid2rid`` and ``test``.
        model: module with an ``infer(nf)`` method returning node features.
        dim: feature size returned by ``model.infer``.
        batch_size: seed nodes per sampler batch.
        layers: number of hops sampled.
        samples: neighbours sampled per node.
        has_user_region: whether test rows carry a ``user_region`` column
            (changes the score formula and the position of the appended
            true-POI index).

    Returns:
        ``(accuracy, ndcg)`` dicts keyed by ``k``, both divided by
        ``len(graph.test)``.  The same dicts are printed.
    """
    k_list = [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # full rank recall
    test_record = graph.time_test
    g = graph.g
    g.readonly()
    pid2rid = torch.LongTensor(graph.pid2rid[['p', 'r']].values).to(device)
    # Row position of every POI inside ``pid2rid``; scores share that order.
    pid2indices = {pid: index for index, pid in enumerate(graph.pid2rid['p'].to_list())}
    # Column of the appended true-POI row index inside a test record.
    true_col = 4 if has_user_region else 3
    accuracy = defaultdict(int)
    ndcg = defaultdict(float)
    for t in range(0, 24, 2):
        record_t = test_record.get(t)
        if record_t is None or len(record_t) == 0:
            # No test check-ins in this slot: nothing to score.
            continue
        g.apply_edges(partial(edge_func, t))
        sampler = dgl.contrib.sampling.NeighborSampler(
            g,
            batch_size,
            samples,
            layers,
            seed_nodes=torch.arange(g.number_of_nodes()),
            transition_prob='final_weight',
            num_workers=16,
        )
        emb = torch.empty((g.number_of_nodes(), dim), device=device)
        for nf in sampler:
            nf.copy_from_parent(ctx=device)
            batch_nids = nf.layer_parent_nid(-1).long()
            h = model.infer(nf)
            emb[batch_nids] = h
        pois_indices = np.array([pid2indices[i] for i in record_t[:, 1]]).reshape(-1, 1)
        record_t = np.concatenate((record_t, pois_indices), 1)
        tests = torch.from_numpy(record_t).to(device)
        # Candidate embeddings are identical for every test row of this
        # slot, so gather them once instead of once per row.
        pois = emb[pid2rid[:, 0]]
        pois_region = emb[pid2rid[:, 1]]
        # test_batches = tests.split(batch_size)
        for test in tests:
            user = emb[test[0]]
            true_indices = test[true_col]
            if has_user_region:
                user_region = emb[test[2]]
                scores = user * pois + 0.1 * user * pois_region + user_region * pois + user_region * pois_region
            else:
                scores = user * pois + user * pois_region
            scores = scores.sum(1)
            scores, indices = torch.sort(scores, descending=True)
            # Zero-based rank of the ground-truth POI.
            position = (indices == true_indices).nonzero().item()
            for k in k_list:
                if position < k:
                    accuracy[k] += 1
                    ndcg[k] += 1 / math.log2(position+2)
    total = len(graph.test)
    if total:
        for k in k_list:
            accuracy[k] /= total
            ndcg[k] /= total
    print(accuracy)
    print(ndcg)
    return accuracy, ndcg


def main(dataset, graph_obj=None):
    """Pre-train on graph neighbours, then train and evaluate the recommender.

    Args:
        dataset: dataset name; ``'meituan'`` rows carry a user-region column
            (6 index columns per sample), every other dataset has 5.
        graph_obj: optional pre-built ``Graph``.  When omitted, a
            module-level ``graph`` built for the same dataset is reused if
            present; otherwise ``Graph(dataset)`` is constructed.
    """
    batch_size = 1024
    dim = 64          # embedding / hidden size
    # Edge types used by the graph: 0-2 static, 3-14 user->poi, 15-26 poi->user.
    num_rels = 27
    num_layers = 2
    num_samples = 5   # neighbours sampled per node and hop
    if graph_obj is None:
        graph_obj = globals().get('graph')
        # Never train on a graph loaded for another dataset: the column
        # layout below is derived from ``dataset``.
        if graph_obj is None or getattr(graph_obj, 'dataset', None) != dataset:
            graph_obj = Graph(dataset)
    if dataset == 'meituan':
        data_size = 6
        has_user_region = True
    else:
        data_size = 5
        has_user_region = False
    g = graph_obj.g
    g.readonly()
    embeddings = graph_obj.embeddings
    num_nodes = graph_obj.g.number_of_nodes()
    model = Recommender(STGCN(num_nodes, dim, num_rels, num_layers, None, embeddings))
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0)
    model.train()
    print('train neighbors')
    for i in range(5):
        total_loss = 0
        g.apply_edges(edge_func1)
        neighbors = graph_obj.neighbors
        # One uniformly drawn negative node per neighbour pair.
        neg_neighbors = np.random.choice(range(num_nodes), neighbors.shape[0], replace=True).reshape(-1, 1)
        neighbors_data = np.concatenate((neighbors, neg_neighbors), axis=1)
        data = torch.from_numpy(neighbors_data).to(device)
        seed_nodes = data.reshape(-1)
        batches = data.split(batch_size)
        # Each row contributes 3 seed nodes, so sampler batches of
        # ``batch_size * 3`` seeds line up with ``batches``.
        sampler = dgl.contrib.sampling.NeighborSampler(
            g,
            batch_size * 3,
            num_samples,
            num_layers,
            seed_nodes=seed_nodes,
            num_workers=11)
        count = 0
        for batch, nf in zip(batches, sampler):
            nf.copy_from_parent(ctx=device)
            batch_nid = nf.map_from_parent_nid(-1, batch.reshape(-1), True)
            batch_nid = batch_nid.reshape(-1, 3).to(device)
            loss = model.train_region(nf, batch_nid)
            opt.zero_grad()
            loss.backward()
            total_loss += loss.item()
            opt.step()
            count += 1
        print('loss', total_loss / count)

    for epoch in range(100):
        model.train()
        begin = time.time()
        total_loss = 0
        count = 0
        for t in range(0, 24, 2):
            g.apply_edges(partial(edge_func, t))
            pos = graph_obj.time_train[t]
            # Negative (poi, region) pairs drawn with replacement.
            neg_pois = graph_obj.pid2rid[['p', 'r']].sample(n=pos.shape[0], replace=True).to_numpy(copy=True)
            data = np.concatenate((pos, neg_pois), axis=1)
            # ``astype`` returns a new array, so the result must be kept;
            # ``np.int`` no longer exists in NumPy >= 1.24.
            data = data.astype(np.int64)
            data = torch.from_numpy(data).to(device)
            seed_nodes = data.reshape(-1)
            batches = data.split(batch_size)
            sampler = dgl.contrib.sampling.NeighborSampler(
                g,
                batch_size * data_size,
                num_samples,
                num_layers,
                seed_nodes=seed_nodes,
                transition_prob='final_weight',
                prefetch=False,
                num_workers=11)
            for batch, nf in zip(batches, sampler):
                nf.copy_from_parent(ctx=device)
                batch_nid = nf.map_from_parent_nid(-1, batch.reshape(-1), True)
                batch_nid = batch_nid.reshape(-1, data_size).to(device)
                loss = model(nf, batch_nid, has_user_region=has_user_region)
                opt.zero_grad()
                loss.backward()
                total_loss += loss.item()
                opt.step()
                count += 1
        print('epoch:{}, loss:{}, time:{}'.format(epoch, total_loss / count, time.time() - begin))
        if epoch % 20 == 0 and epoch != 0:
            model.eval()
            with torch.no_grad():
                recallk(graph_obj, model, batch_size=1024, has_user_region=has_user_region)
    model.eval()
    with torch.no_grad():
        recallk(graph_obj, model, batch_size=1024, has_user_region=has_user_region)


if __name__ == "__main__":
    # Train and evaluate on the Gowalla dataset.
    dataset = 'gowalla'
    main(dataset)

train neighbors
loss 0.35686519186333954
loss 0.3543015772352684
loss 0.3533868330788998
loss 0.3529300504484862
loss 0.3522719664421021
epoch:0, loss:0.2677007866579862, time:61.26017880439758
epoch:1, loss:0.2507171932134836, time:36.016666889190674
epoch:2, loss:0.2441574807559687, time:36.08630681037903
epoch:3, loss:0.2395424581472836, time:36.176307678222656
epoch:4, loss:0.2355300947609875, time:36.30205798149109
epoch:5, loss:0.23197882835354125, time:36.3623571395874
epoch:6, loss:0.22922285403760653, time:36.242438316345215
epoch:7, loss:0.22618711408641604, time:36.291765213012695
epoch:8, loss:0.22358614862674758, time:37.51999354362488
epoch:9, loss:0.22148671625034203, time:36.40708661079407
epoch:10, loss:0.21921326137251324, time:36.71625018119812
epoch:11, loss:0.2168890724756888, time:37.45523428916931
epoch:12, loss:0.21524287056591776, time:36.35960412025452
epoch:13, loss:0.2135110819741847, time:36.14732503890991
epoch:14, loss:0.21208287361595365, time:38.1621861

epoch:94, loss:0.17403559948480318, time:103.74465560913086
epoch:95, loss:0.17398931668509568, time:103.60610365867615
epoch:96, loss:0.1739817327331929, time:103.67983222007751
epoch:97, loss:0.17372075907353843, time:103.51485252380371
epoch:98, loss:0.17349587607064418, time:108.28437662124634
epoch:99, loss:0.17345893521985364, time:115.7399435043335
defaultdict(<class 'int'>, {70: 0.3927147787937636, 80: 0.4050979251350353, 90: 0.41685208626532166, 100: 0.42756055948234517, 40: 0.3403124484035569, 50: 0.36080164165140066, 60: 0.37853902460118405, 2: 0.11858729921612718, 5: 0.17584854036119477, 10: 0.22260572848280905, 20: 0.2773353041536611, 30: 0.3128257946835025})
defaultdict(<class 'float'>, {70: 0.18136683643245521, 80: 0.1833467024644226, 90: 0.18517418480319758, 100: 0.18679861622570154, 40: 0.17226878917281746, 50: 0.1759728710946173, 60: 0.17902439595749992, 2: 0.10492007279971134, 5: 0.13052034646373045, 10: 0.1456284835329267, 20: 0.159409202325442, 30: 0.16695764432084