In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
import torch_geometric.nn as gnn

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import random

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


### Edge Level

* What score will a user rank for a movie, even though current he/she haven't seen that yet? 
    - can be viewed as Predict possible new edges and edge values
    - Using MovieLens Dataset

```
wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
unzip ml-1m.zip
```

更清晰的GNN学习代码：
```
https://github.com/zhulei227/GNN_Notes
https://www.less-bug.com/posts/gcn-basis-graphconv-gatconv-sageconv-implementation-pyg-dgl/
```

benchmark 数据集: https://ogb.stanford.edu/

In [2]:
users_df = pd.read_csv('ml-1m/users.dat',
                      sep='::',
                      header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],
                      engine='python')

movies_df = pd.read_csv('ml-1m/movies.dat',
                        sep='::',
                        header=None,
                        names=['movie_id', 'title', 'genres'],
                        engine='python',
                        encoding='ISO-8859-1')

ratings_df = pd.read_csv('ml-1m/ratings.dat',    ## rates as edge label
                         sep='::',
                         header=None,
                         names=['user_id', 'movie_id', 'imdbRank', 'timestamp'],
                         engine='python')

def one_hot(vocab,input_lst):
    vec = np.zeros((len(vocab),))
    for i in input_lst:
        vec[vocab.index(i)] = 1
    return vec


movies_vocab = list(set('|'.join(movies_df['genres'].values).split('|')))
movies_df['vec'] = movies_df['genres'].apply(lambda x: one_hot(movies_vocab,x.split('|')))

users_vocab = users_df['occupation'].unique().tolist()
users_df['vec'] = users_df.apply(lambda x:  np.concatenate((  np.array([1 if x['gender']=='F' else 0]),  one_hot(users_vocab,[x['occupation']])   )), axis=1)

ratings_vocab = ratings_df['imdbRank'].unique().tolist()
ratings_df['vec'] = ratings_df['imdbRank'].apply(lambda x: one_hot(ratings_vocab,[x]))


movie_dict = dict(zip(movies_df['movie_id'].to_list(),[i for i in range(len(movies_df['movie_id']))]))
user_dict = dict(zip(users_df['user_id'].to_list(),[i for i in range(len(users_df['user_id']))]))
from_ = torch.tensor(ratings_df['user_id'].apply(lambda x: user_dict[x]).to_numpy(),dtype=torch.int64)
to_ = torch.tensor(ratings_df['movie_id'].apply(lambda x: movie_dict[x]).to_numpy(),dtype=torch.int64)



def series_to_tensor(Series,dtype=torch.float32):
    return torch.tensor(np.array(Series.apply(lambda x: torch.tensor(x)).to_list()),dtype=dtype)

### Load as HeteroData

* User nodes and Movie nodes are of different types

* Add negative samples: 
```
NodeA ----1---->NodeB   samples
NodeA ----0---->NodeC   negative samples
```

* RandomLinkSplit()向每个分割数据添加两个新属性给 `disjoint_train_ratio` edges
    - edge_label_index 是分割后的 edge_index
    - edge_label 对于相应的edge_label_index都是默认为1 (本例中我们没有在这一步生成0标签edge--即negative samples) 大概起到mask的作用，需要手动设置回自定义的label
 

The distinction between message passing edges and supervision edges happens in train_data.**edge_index (message passing)** and train_data.**edge_label_index (supervision)**.

In [3]:
from torch_geometric.data import HeteroData 
import torch_geometric.transforms as T
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.utils import negative_sampling

data = HeteroData()

data["user"].node_id = torch.arange(users_df.shape[0]) #data['user'].num_nodes = users_df.shape[0]
data['user'].x = series_to_tensor(users_df['vec'])
data["movie"].node_id = torch.arange(movies_df.shape[0]) 
data['movie'].x = series_to_tensor(movies_df['vec'])

data['user','to','movie'].edge_index = torch.stack([from_,to_], dim=0)#torch.tensor(np.array([from_,to_]))
data['user','to','movie'].edge_attr = series_to_tensor(ratings_df['imdbRank'], dtype=torch.int64)  ##vec
data = T.ToUndirected()(data)

data , data.edge_types

(HeteroData(
   user={
     node_id=[6040],
     x=[6040, 22],
   },
   movie={
     node_id=[3883],
     x=[3883, 18],
   },
   (user, to, movie)={
     edge_index=[2, 1000209],
     edge_attr=[1000209],
   },
   (movie, rev_to, user)={
     edge_index=[2, 1000209],
     edge_attr=[1000209],
   }
 ),
 [('user', 'to', 'movie'), ('movie', 'rev_to', 'user')])

In [4]:
'''
随机选取edge后，
disjoint_train_ratio: 设定此数值，则将被选取的edges拆分为两部分，一部分负责message passing，另一部分负责supervision
     disjoint_train_ratio指负责supervision的比例
'''

split = RandomLinkSplit(
    num_val = 0.3,
    num_test = 0.3,
    disjoint_train_ratio = 0.3,    
    add_negative_train_samples = False,    # neg_sampling_ratio = 1,
    edge_types = ('user', 'to', 'movie'),
    rev_edge_types = ('movie', 'rev_to', 'user')
)

train_data, val_data, test_data = split(data) 

train_data, train_data['user','to','movie'].edge_label

(HeteroData(
   user={
     node_id=[6040],
     x=[6040, 22],
   },
   movie={
     node_id=[3883],
     x=[3883, 18],
   },
   (user, to, movie)={
     edge_index=[2, 280060],
     edge_attr=[280060],
     edge_label=[120025],
     edge_label_index=[2, 120025],
   },
   (movie, rev_to, user)={
     edge_index=[2, 280060],
     edge_attr=[280060],
   }
 ),
 tensor([1., 1., 1.,  ..., 1., 1., 1.]))

In [5]:
def idx_to_dict(idx):
    return {'{}_{}'.format(idx[0,i],idx[1,i]):i for i in range(idx.size()[1])}

full_idx = data['user','to','movie'].edge_index
msg_idx = train_data['user','to','movie'].edge_index
sup_idx = train_data['user','to','movie'].edge_label_index

full_dict = idx_to_dict(full_idx)
full_pairs = full_dict.keys()
msg_pairs = idx_to_dict(msg_idx).keys()
sup_pairs = idx_to_dict(sup_idx).keys()                                  

assert set(msg_pairs) &  set(sup_pairs)  == set()                       ## check Intersection: Currently we don't have 0 edges (not in original edges)
assert len(set(full_pairs) &  set(sup_pairs))  == len(set(sup_pairs))   ##  sup, msg edges can all be found in the original edges
assert len(set(full_pairs) &  set(msg_pairs))  == len(set(msg_pairs))   ##  sup, msg edges have no Intersection

In [6]:
## set edge_label as edge_attr (mention: currently no neg samples)

ori_label_pos = [full_dict[p] for p in sup_pairs]

train_data['user','to','movie'].edge_label = data['user','to','movie'].edge_attr[ori_label_pos]
train_data, train_data['user','to','movie'].edge_label

(HeteroData(
   user={
     node_id=[6040],
     x=[6040, 22],
   },
   movie={
     node_id=[3883],
     x=[3883, 18],
   },
   (user, to, movie)={
     edge_index=[2, 280060],
     edge_attr=[280060],
     edge_label=[120025],
     edge_label_index=[2, 120025],
   },
   (movie, rev_to, user)={
     edge_index=[2, 280060],
     edge_attr=[280060],
   }
 ),
 tensor([5, 4, 5,  ..., 4, 3, 4]))

In [7]:
edge_label_index = train_data['user','to','movie'].edge_label_index
edge_label_withattr = train_data['user','to','movie'].edge_label

train_loader = LinkNeighborLoader(
    data = train_data,
    num_neighbors = [20,10],                                         ## 2-hoop
    neg_sampling_ratio = 0.2,                                       ## pos:neg = 1:0.2 (edge)    add 0 to edge_label  最好与1-5各class均等
    edge_label_index=(("user", "to", "movie"), edge_label_index),
    edge_label=edge_label_withattr,
    batch_size = 5000,                                                   ## num of pos edge in one batch
    shuffle = True
)

batch = next(iter(train_loader))

batch, batch['user','to','movie'].edge_label

(HeteroData(
   user={
     node_id=[5576],
     x=[5576, 22],
     n_id=[5576],
   },
   movie={
     node_id=[3378],
     x=[3378, 18],
     n_id=[3378],
   },
   (user, to, movie)={
     edge_index=[2, 48333],
     edge_attr=[48333],
     edge_label=[6000],
     edge_label_index=[2, 6000],
     e_id=[48333],
     input_id=[5000],
   },
   (movie, rev_to, user)={
     edge_index=[2, 78919],
     edge_attr=[78919],
     e_id=[78919],
   }
 ),
 tensor([3, 5, 5,  ..., 0, 0, 0]))

In [8]:
batch.edge_index_dict

{('user',
  'to',
  'movie'): EdgeIndex([[3175, 2564, 2419,  ..., 3163, 3884,  545],
            [   0,    0,    0,  ..., 3327, 3327, 3327]],
           sparse_size=(5576, 3378), nnz=48333, sort_order=col),
 ('movie',
  'rev_to',
  'user'): EdgeIndex([[ 329,    0, 1046,  ..., 1583, 1073, 1065],
            [   0,    0,    0,  ..., 5490, 5490, 5490]],
           sparse_size=(3378, 5576), nnz=78919, sort_order=col)}

In [9]:
batch.edge_index_dict['user','to','movie'].size()

torch.Size([2, 48333])

### Model
1. find node embeddings via gcn
2. predict link labels via aggr(userA,movieB)



In [10]:
user_x_size = data['user'].x.size()
movie_x_size = data['movie'].x.size()
metadata = train_data.metadata()

class GNN(nn.Module):             
    def __init__(self, c_in, c_h, c_out):
        super().__init__()
        self.conv1 = gnn.SAGEConv(c_in, c_h)
        self.dropout = nn.Dropout(0.1)
        self.conv2 = gnn.SAGEConv(c_h, c_out)
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x

class Model(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        # self.user_linear = nn.Linear(user_x_size[1], hidden_channels)   ##   自行one-hot效果不佳
        # self.movie_linear = nn.Linear(movie_x_size[1], hidden_channels) ##   [0]num node     [1]ori_node_embd_size
        self.user_emb = nn.Embedding(user_x_size[0], hidden_channels)  
        self.movie_emb = nn.Embedding(movie_x_size[0], hidden_channels)        

        self.gnn = GNN(hidden_channels,hidden_channels,hidden_channels)
        self.gnn = gnn.to_hetero(self.gnn, metadata=metadata)     ## HeteroGNN_model = to_hetero(GNN_model, data.metadata(), aggr='sum')
        
        self.classifier_linear = nn.Linear(hidden_channels*2, 6)  ## rank:0-6
        self.classifier_bn = nn.BatchNorm1d(6)

    def forward(self, batch):
        # x_dict = {
        #   "user": self.user_linear(batch["user"].x),
        #   "movie": self.movie_linear(batch["movie"].x),
        # } 
        x_dict = {
          "user": self.user_emb(batch["user"].node_id),        ## 两种node输入HeteroGNN时 必须embed size一致
          "movie": self.movie_emb(batch["movie"].node_id),
        } 
        x_dict = self.gnn(x_dict, batch.edge_index_dict)

        [fromU_, toM_] = batch["user", "to", "movie"].edge_label_index
        userFeat = x_dict['user'][fromU_]                    ## [batch,hidden]
        movieFeat = x_dict['movie'][toM_]                    ##
        eFeat = torch.concat([userFeat, movieFeat],dim=-1)   ## [batch,hidden*2]

        x = F.relu(eFeat)
        x = self.classifier_linear(x)
        x = self.classifier_bn(x)
        x = F.softmax(x, dim=1)
        return x


# model = Model(hidden_channels=64)
# model(batch).size()

In [11]:
def train_step(model, lossfn, optimizer, batch):   
    model.train()
    optimizer.zero_grad()
    y = batch["user", "to", "movie"].edge_label
    pred = model(batch)       
    loss = lossfn(pred,y)
    loss.backward()
    optimizer.step()
    acc = (pred.argmax(dim=-1) == y).sum().float() / y.size()[0]
    print('>',end='')
    return loss.item(), acc.item()


model = Model(hidden_channels=64).to(device)
lossfn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)


for epoch in range(10):                                       ## It trapped!!!!!
    print('Epoch:{}'.format(epoch),end='')
    for batch in iter(train_loader):
        tmpLoss, tmpAcc = train_step(model, lossfn, optimizer, batch.to(device))
    print('tmpLoss:{}, tmpAcc:{}'.format(tmpLoss, tmpAcc))

Epoch:0>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.6884582042694092, tmpAcc:0.4000000059604645
Epoch:1>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.7284268140792847, tmpAcc:0.23333333432674408
Epoch:2>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.6086386442184448, tmpAcc:0.46666666865348816
Epoch:3>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.576741099357605, tmpAcc:0.5
Epoch:4>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.5663833618164062, tmpAcc:0.5
Epoch:5>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.5778851509094238, tmpAcc:0.46666666865348816
Epoch:6>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.5878255367279053, tmpAcc:0.46666666865348816
Epoch:7>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.7017165422439575, tmpAcc:0.3333333432674408
Epoch:8>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.6123733520507812, tmpAcc:0.46666666865348816
Epoch:9>>>>>>>>>>>>>>>>>>>>>>>>>tmpLoss:1.4560538530349731, tmpAcc:0.6000000238418579


In [12]:
pred = model(batch)

pred.argmax(dim=-1)

tensor([4, 4, 3, 3, 3, 5, 3, 4, 4, 3, 5, 0, 3, 1, 5, 5, 3, 5, 5, 4, 4, 4, 4, 4,
        4, 0, 4, 0, 0, 3])

In [13]:
batch["user", "to", "movie"].edge_label

tensor([4, 4, 3, 3, 3, 2, 3, 2, 4, 3, 5, 5, 3, 1, 5, 5, 3, 3, 5, 4, 4, 5, 4, 4,
        4, 0, 0, 0, 0, 0])