In [3]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import tqdm
from sklearn.metrics import roc_auc_score
from torch.nn import Embedding, Linear

import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv
from torch_geometric.utils.convert import to_scipy_sparse_matrix
from torch.nn import Embedding, Linear
from torch_geometric.data import HeteroData
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Модель

In [4]:
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_src, z_dst, edge_label_index):
        row, col = edge_label_index
        src = z_src[row]
        dst = z_dst[col]
        pred = (src * dst).sum(dim = -1)
        return pred.view(-1)

class Model_2(torch.nn.Module):
    def __init__(self, num_users, num_items, hidden_channels, out_channels):
        super().__init__()
        self.user_emb = Embedding(num_users, hidden_channels, device=device)
        self.movie_emb = Embedding(num_items, hidden_channels, device=device)
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)
        self.conv3 = SAGEConv((-1, -1), hidden_channels)
        self.conv4 = SAGEConv((-1, -1), hidden_channels)
        
        self.lin1 = Linear(hidden_channels, out_channels)
        self.lin2 = Linear(hidden_channels, out_channels)
        
        self.decoder = EdgeDecoder(out_channels)
        
    
    def encoder(self, x_dict, edge_index_dict):
        # z_dict = {}
        x_dict['user'] = self.user_emb(x_dict['user']) 
        x_dict['movie'] = self.movie_emb(x_dict['movie'])
        
        
        user_1 = self.conv1(
            (x_dict['movie'], x_dict['user']),
            edge_index_dict[('movie', 'rev_to', 'user')],
        ).relu()
        
        movie_1 = self.conv2(
            (x_dict['user'], x_dict['movie']),
            edge_index_dict[('user', 'to', 'movie')],
        ).relu()
        
        user_2 = self.conv3(
            (movie_1, user_1),
            edge_index_dict[('movie', 'rev_to', 'user')],
        ).relu()
        
        movie_2 = self.conv4(
            (user_1, movie_1),
            edge_index_dict[('user', 'to', 'movie')],
        ).relu()
        
        user_3 = self.lin1(user_2)
        movie_3 = self.lin1(movie_2)
        
        return user_3, movie_3
    
    def forward(self, x_dict, edge_index_dict, edge_label_index):
        
        user_3, movie_3 = self.encoder(x_dict, edge_index_dict)
        
        return self.decoder(user_3, movie_3, edge_label_index)

In [5]:
def train():
    model.train()

    total_loss = total_examples = 0
    for batch in tqdm.tqdm(train_loader):
        batch = batch.to(device)
        optimizer.zero_grad()

        pred = model(
            batch.x_dict,
            batch.edge_index_dict,
            batch['user', 'movie'].edge_label_index,
        )

        loss = F.binary_cross_entropy_with_logits(
            pred, batch['user', 'movie'].edge_label)

        loss.backward()
        optimizer.step()
        total_loss += float(loss)
        total_examples += pred.numel()

    return total_loss / total_examples

@torch.no_grad()
def test(loader):
    model.eval()

    preds, targets = [], []
    for batch in tqdm.tqdm(loader):
        batch = batch.to(device)

        pred = model(
            batch.x_dict,
            batch.edge_index_dict,
            batch['user', 'movie'].edge_label_index,
        ).sigmoid().view(-1).cpu()
        target = batch['user', 'movie'].edge_label.long().cpu()

        preds.append(pred)
        targets.append(target)

    pred = torch.cat(preds, dim=0).numpy()
    target = torch.cat(targets, dim=0).numpy()

    return roc_auc_score(target, pred)


# Данные

In [None]:
graph_data_pd = pd.read_csv('../data/events.csv')
# print(len(graph_data_pd))
# print(graph_data_pd.nunique())


all_users = np.array(graph_data_pd['user_id'].drop_duplicates())
all_movies = np.array(graph_data_pd['movie'].drop_duplicates())

#Переводим в PyG
user_mapping = {node_id: idx for idx, node_id in enumerate(all_users)}
reverse_user_mapping = {idx: node_id for node_id, idx in user_mapping.items()}

movie_mapping = {node_id: idx for idx, node_id in enumerate(all_movies)}
reverse_movie_mapping = {idx: node_id for node_id, idx in movie_mapping.items()}

graph_data_pd['item_id'] = graph_data_pd['item_id'].map(movie_mapping)
graph_data_pd['user_id'] = graph_data_pd['user_id'].map(user_mapping)
# print(len(graph_data_pd))

In [9]:
data = HeteroData()

data['user'].x = torch.arange(0, len(all_users))
data['user'].num_nodes = len(all_users)
data['movie'].x = torch.arange(0, len(all_movies))
data['movie'].num_nodes = len(all_movies)

data['user', 'to', 'movie'].edge_index = torch.tensor(graph_data_pd[['user_id', 'merch']].values).T
# Добавляем обратную связь
data = T.ToUndirected()(data)

In [10]:
# Разбиение на уровне связей
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=False,
    edge_types=[('user', 'to', 'movie')],
    rev_edge_types=[('movie', 'rev_to', 'user')],
)(data)

In [11]:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[8, 4],
    edge_label_index=('user', 'to', 'movie'),
    neg_sampling='binary',
    batch_size=2048,
    shuffle=True,
    num_workers=16,
    drop_last=True,
)

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[8, 4],
    edge_label_index=(
        ('user', 'to', 'movie'),
        val_data[('user', 'to', 'movie')].edge_label_index,
    ),
    edge_label=val_data[('user', 'to', 'movie')].edge_label,
    batch_size=2048,
    shuffle=False,
    num_workers=16,
)

test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=[8, 4],
    edge_label_index=(
        ('user', 'to', 'movie'),
        test_data[('user', 'to', 'movie')].edge_label_index,
    ),
    edge_label=test_data[('user', 'to', 'movie')].edge_label,
    batch_size=2048,
    shuffle=False,
    num_workers=16,
)




In [12]:
model = Model_2(
    num_users=data['user'].num_nodes,
    num_items=data['movie'].num_nodes,
    hidden_channels=64,
    out_channels=64,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
for epoch in range(1, 10):
    loss = train()
    val_auc = test(val_loader)
    test_auc = test(test_loader)

    print(f'Epoch: {epoch:02d}, Loss: {loss:4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')

100%|██████████| 349/349 [02:22<00:00,  2.44it/s]
100%|██████████| 88/88 [00:04<00:00, 20.49it/s]
100%|██████████| 88/88 [00:04<00:00, 20.32it/s]


Epoch: 01, Loss: 0.000130, Val: 0.8771, Test: 0.8773


100%|██████████| 349/349 [02:20<00:00,  2.49it/s]
100%|██████████| 88/88 [00:03<00:00, 23.68it/s]
100%|██████████| 88/88 [00:03<00:00, 23.38it/s]


Epoch: 02, Loss: 0.000108, Val: 0.8947, Test: 0.8950


100%|██████████| 349/349 [02:21<00:00,  2.47it/s]
100%|██████████| 88/88 [00:04<00:00, 21.47it/s]
100%|██████████| 88/88 [00:04<00:00, 21.32it/s]


Epoch: 03, Loss: 0.000104, Val: 0.8991, Test: 0.8993


100%|██████████| 349/349 [02:26<00:00,  2.38it/s]
100%|██████████| 88/88 [00:04<00:00, 20.66it/s]
100%|██████████| 88/88 [00:03<00:00, 25.30it/s]


Epoch: 04, Loss: 0.000103, Val: 0.9009, Test: 0.9020


100%|██████████| 349/349 [02:26<00:00,  2.39it/s]
100%|██████████| 88/88 [00:04<00:00, 20.95it/s]
100%|██████████| 88/88 [00:03<00:00, 22.16it/s]


Epoch: 05, Loss: 0.000102, Val: 0.9022, Test: 0.9031


100%|██████████| 349/349 [02:32<00:00,  2.29it/s]
100%|██████████| 88/88 [00:04<00:00, 21.57it/s]
100%|██████████| 88/88 [00:04<00:00, 20.65it/s]


Epoch: 06, Loss: 0.000101, Val: 0.9033, Test: 0.9038


100%|██████████| 349/349 [02:30<00:00,  2.32it/s]
100%|██████████| 88/88 [00:04<00:00, 19.09it/s]
100%|██████████| 88/88 [00:04<00:00, 21.78it/s]


Epoch: 07, Loss: 0.000101, Val: 0.9047, Test: 0.9053


100%|██████████| 349/349 [02:28<00:00,  2.36it/s]
100%|██████████| 88/88 [00:03<00:00, 22.04it/s]
100%|██████████| 88/88 [00:03<00:00, 25.42it/s]


Epoch: 08, Loss: 0.000100, Val: 0.9059, Test: 0.9066


100%|██████████| 349/349 [02:36<00:00,  2.23it/s]
100%|██████████| 88/88 [00:04<00:00, 18.73it/s]
100%|██████████| 88/88 [00:04<00:00, 19.14it/s]

Epoch: 09, Loss: 0.000099, Val: 0.9078, Test: 0.9083





In [14]:
user_embeddings, movie_embeddings = model.encoder({k:v.to(device) for k,v in data.x_dict.items()}\
                                                    , {k:v.to(device) for k,v in data.edge_index_dict.items()})

In [108]:
preds = (user_embeddings @ movie_embeddings.T).cpu().detach().numpy()

In [109]:
preds = np.argsort(preds)

In [111]:
vectorized_replace = np.vectorize(reverse_merch_mapping.get)
preds = vectorized_replace(preds)

In [112]:
preds

array([[ 137, 2513, 2631, ..., 2603, 1831, 2757],
       [ 137, 2513, 2631, ..., 1583, 3529,  472],
       [ 137, 2513, 2631, ..., 2603,  584, 1039],
       ...,
       [ 137, 2513, 1266, ..., 2297, 1583,  472],
       [ 137, 2513, 2631, ..., 1039, 1583, 2980],
       [ 137, 2513, 2631, ...,  472, 3529, 2297]])

In [113]:
#уже были
delete = graph_data_pd.groupby('user_id')['merch'].apply(list)

In [116]:
answer = []
for i in tqdm.tqdm(range(len(preds))):
    cur_answer = []
    match = delete.iloc[i]
    j = len(preds[0]) - 1
    while len(cur_answer) < 10:
        if preds[i, j] not in match:
            cur_answer.append(preds[i, j])
        j-=1
    answer.append(cur_answer)

100%|██████████| 6040/6040 [00:02<00:00, 2739.56it/s]


In [156]:
answer[0], answer[1]

([2602, 2732, 1304, 2630, 398, 1001, 2331, 331, 2688, 1811],
 [3529, 36, 1039, 463, 1811, 169, 3677, 1315, 2281, 3153])

In [157]:
correct_answer = []
for line in answer:
    correct_answer.append(' '.join(list(map(str, line))))

In [158]:
answer_pd =  pd.DataFrame(correct_answer, columns = ['item_id'])
answer_pd['user_id'] = answer_pd.index

In [159]:
answer_pd[['user_id', 'item_id']].to_csv('submission_gg.csv', index = False)

In [161]:
answer_pd

Unnamed: 0,item_id,user_id
0,2602 2732 1304 2630 398 1001 2331 331 2688 1811,0
1,3529 36 1039 463 1811 169 3677 1315 2281 3153,1
2,1039 2603 640 1831 3409 1560 1956 463 2732 1223,2
3,3327 1546 2768 2603 1304 3022 3529 3046 1001 2688,3
4,1039 2297 2603 3022 1583 3409 1560 2732 2646 2210,4
...,...,...
6035,584 3409 1956 3529 463 2646 2054 36 1615 3013,6035
6036,1560 2297 36 584 2402 3013 3529 2862 1337 1039,6036
6037,472 1583 2297 3409 584 36 1223 1560 3022 2402,6037
6038,1583 640 476 3473 1746 785 1543 3046 944 1640,6038
