In [3]:
import numpy as np
from torch_geometric.data import (
    HeteroData,
    InMemoryDataset,
    download_url,
    extract_zip,
)
import torch_geometric.transforms as T
import os
import pandas as pd
import torch
from torch.nn import Linear, Embedding
# from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import random


In [4]:
class AverageMeter():
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch()


In [5]:
movies = pd.read_csv('../data/movies.csv')[['movieId', 'genres']]
ratings = pd.read_csv('../data/ratings.csv')

In [6]:
movies.head()

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# categorical カラムに
movie_feat_df = movies.set_index('movieId')['genres'].str.get_dummies('|')
movie_feat = torch.from_numpy(movie_feat_df.values).to(torch.float)


In [10]:
movie_feat_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings['userId'].unique()
unique_user_id_df = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id_df.head())
print()


# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings['movieId'].unique()
unique_movie_id_df = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id_df.head())


Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4


In [15]:
# ID の生成
ratings_user_id = pd.merge(
    ratings['userId'], unique_user_id_df, on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)

ratings_movie_id = pd.merge(
    ratings['movieId'], unique_movie_id_df, on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)


In [16]:
# edge を設定
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)

In [17]:
edge_index_user_to_movie


tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])

In [21]:
data = HeteroData()

# ノードの割当
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(unique_movie_id))

# 属性の割当
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie

# 無向グラフにする。
data = T.ToUndirected()(data)


In [22]:
data

HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={
    node_id=[9724],
    x=[9742, 20]
  },
  [1m(user, rates, movie)[0m={ edge_index=[2, 100836] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100836] }
)

In [23]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"),
    # メッセージパッシングと監視のためにトレーニングエッジが共有されなくなります．
    # その代わり， disjoint_train_ratio のエッジは，学習中のスーパービジョンでグランドトゥルースラベルとして使用されます．
    disjoint_train_ratio=0.3,  # 監視用train edge の割合
    add_negative_train_samples=True,
    neg_sampling_ratio=2
)

train_data, val_data, test_data = transform(data)


In [24]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_sizes):
        super().__init__()

        self.conv1 = SAGEConv(hidden_sizes, hidden_sizes)
        self.conv2 = SAGEConv(hidden_sizes, hidden_sizes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class Classifier(torch.nn.Module):
    def forward(self, x_user, x_movie, edge_label_index):
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]

        return (edge_feat_user * edge_feat_movie).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_sizes):
        super().__init__()

        self.movie_lin = Linear(20, hidden_sizes)
        self.user_emb = Embedding(data['user'].num_nodes, hidden_sizes)
        self.movie_emb = Embedding(data['movie'].num_nodes, hidden_sizes)

        self.gnn = GNN(hidden_sizes)
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData):
        x_dict = {
            'user': self.user_emb(data['user'].node_id),
            'movie': self.movie_lin(data['movie']['x'][data['movie']['node_id']]) + self.movie_emb(data['movie'].node_id),
        }
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict['user'],
            x_dict['movie'],
            data['user', 'rates', 'movie'].edge_label_index,
        )

        return pred


model = Model(hidden_sizes=64)
print(model)


Model(
  (movie_lin): Linear(in_features=20, out_features=64, bias=True)
  (user_emb): Embedding(610, 64)
  (movie_emb): Embedding(9724, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64)
      (movie__rev_rates__user): SAGEConv(64, 64)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64)
      (movie__rev_rates__user): SAGEConv(64, 64)
    )
  )
  (classifier): Classifier()
)


In [25]:
import tqdm
import torch.nn.functional as F


def train_fn(data, model, criterion, optimizer, epoch, device):
    losses = AverageMeter()
    model.train()

    optimizer.zero_grad()
    data = data.to(device)
    labels = data['user', 'rates', 'movie'].edge_label

    output = model(data)

    loss = criterion(output, labels)
    losses.update(loss.item(), len(labels))

    loss.backward()
    optimizer.step()

    return losses.avg


def valid_fn(data, model, criterion, device):
    losses = AverageMeter()
    model.eval()

    data = data.to(device)
    labels = data['user', 'rates', 'movie'].edge_label

    with torch.no_grad():
        output = model(data)
    loss = criterion(output, labels)
    losses.update(loss.item(), len(labels))

    predictions = (torch.sigmoid(output).cpu().detach().numpy())
    return losses.avg, predictions


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
criterion = torch.nn.BCEWithLogitsLoss()

best_loss = np.inf
for epoch in range(100):
    train_loss = train_fn(train_data, model, criterion,
                          optimizer, epoch, device)
    val_loss, preds = valid_fn(val_data, model, criterion, device)
    print(
        f'Epoch: {epoch} '
        f'Train loss: {train_loss} '
        f'Valid loss: {val_loss} '
    )
    if val_loss < best_loss:
        best_loss = val_loss
print(best_loss)

Epoch: 0 Train loss: 0.7971444129943848 Valid loss: 0.6684021949768066 
Epoch: 1 Train loss: 0.6818880438804626 Valid loss: 0.6743171215057373 
Epoch: 2 Train loss: 0.6858484148979187 Valid loss: 0.6244567036628723 
Epoch: 3 Train loss: 0.6310597658157349 Valid loss: 0.5967593193054199 
Epoch: 4 Train loss: 0.5961464643478394 Valid loss: 0.6053288578987122 
Epoch: 5 Train loss: 0.5969111919403076 Valid loss: 0.6078428626060486 
Epoch: 6 Train loss: 0.5950315594673157 Valid loss: 0.584560215473175 
Epoch: 7 Train loss: 0.5717446208000183 Valid loss: 0.55430668592453 
Epoch: 8 Train loss: 0.543819785118103 Valid loss: 0.5371070504188538 
Epoch: 9 Train loss: 0.5289663076400757 Valid loss: 0.5323325991630554 
Epoch: 10 Train loss: 0.5252511501312256 Valid loss: 0.5243747234344482 
Epoch: 11 Train loss: 0.516690194606781 Valid loss: 0.5075474977493286 
Epoch: 12 Train loss: 0.49771225452423096 Valid loss: 0.4915582835674286 
Epoch: 13 Train loss: 0.4781877100467682 Valid loss: 0.4867113828