In [1]:
from tqdm import tqdm
import random

from utils.data_utils import data_loader , corpora, corpora_tools
from utils.evaluation_utils import evaluation
from utils.functions_utils.distribution_function import CategoricalDistributionSampler
from utils.manifold.poincare_ball import PoincareBallExact
from utils.optim_tools import rsgd
from utils.embedding_utils.losses import graph_embedding_criterion, graph_community_criterion
from utils.clustering_utils.poincare_em import PoincareEM
from utils.clustering_utils.poincare_xmedoid import XMedoid
from utils.clustering_utils.poincare_kmeans import PoincareKMeans
from utils.clustering_utils.poincare_kmedoid import RiemannianKMedoids

from torch import nn


import torch
import numpy as np

from torch.utils.data import DataLoader

  _C._set_default_tensor_type(t)
INFO: Using numpy backend


#### Fixing Random Seeds

In [2]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [3]:
dataset = 'football'

X, Y = data_loader.load_corpus(dataset, directed=False)
print(f"Dataset {dataset} loaded")
print(f"Number of nodes: {len(X)}")

Dataset football loaded
Number of nodes: 115


In [4]:
evaluation.nmi([y[0] for y in Y.values()],Y)

1.0

### Hyperparamethers

In [5]:

# Random Wak
PATH_LEN = 10
PRECOMPUTE = 6
CONTEXT_SIZE = 5

# Embedding
EMBEDDING_DIM = 8
LEARNING_RATE = .01
EPOCHS = 10


## Data Loading

In [6]:
X, Y = data_loader.load_corpus(dataset, directed=False)


dataset_l1 = corpora.NeigbhorFlatCorpus(X, Y)

dataset_l2 = corpora.RandomContextSizeFlat(X, Y, precompute=PRECOMPUTE,path_len=PATH_LEN, context_size=CONTEXT_SIZE)

dataset_l3 = corpora_tools.from_indexable(torch.arange(0, len(X), 1).unsqueeze(-1))


dataloader_l1 = DataLoader(dataset_l1,
                           batch_size=20,
                           shuffle=True,
                           drop_last=False,
                           )

dataloader_l2 = DataLoader(dataset_l2, batch_size=20,
                           shuffle=True, collate_fn=lambda tensor_list: (torch.cat(tensor_list, 0)[:,0] , torch.cat(tensor_list, 0)[:,1]))

dataloader_l3 = DataLoader(dataset_l3,
                           batch_size=20,
                           shuffle=True,
                           drop_last=False)

100%|██████████| 115/115 [00:00<00:00, 6576.65it/s]


### Negative Sampling

In [7]:
frequency = dataset_l2.getFrequency()[:,1]
# frequency = frequency**(3/4)
frequency = frequency**(3/4)

frequency /= frequency.sum()


distribution = CategoricalDistributionSampler(frequency)
n_community = len(set([communities[0] for key, communities in Y.items()]))
print("Number of communities : ", n_community)

Number of communities :  12


### Embedding

#### First Phase

In [11]:

node_embedding = nn.Embedding(len(X), EMBEDDING_DIM, max_norm=0.999)
node_embedding.weight.data[:] = node_embedding.weight.data * 1e-2
context_embedding = nn.Embedding(len(X), EMBEDDING_DIM, max_norm=0.999)
context_embedding.weight.data[:] = context_embedding.weight.data * 1e-2

In [12]:
manifold = PoincareBallExact
optimizer_init = rsgd.RSGD(list(node_embedding.parameters()) + list(context_embedding.parameters()), LEARNING_RATE, manifold=manifold)

In [13]:
def memory_transfer(x): return x


for i in tqdm(range(EPOCHS)):
    l2 = 0
    for x, y in dataloader_l2:
        optimizer_init.zero_grad()
        pe_x = node_embedding(memory_transfer(x.long()))
        pe_y = context_embedding(memory_transfer(y.long()))
        ne = context_embedding(memory_transfer(
            distribution.sample(sample_shape=(len(x), EMBEDDING_DIM)))).detach()
        loss = graph_embedding_criterion(
            pe_x, pe_y, z=ne, manifold=manifold).sum()
        l2 += loss.item()
        loss.backward()
        optimizer_init.step()

    l1 = 0.
    
    for x, y in dataloader_l1:
        optimizer_init.zero_grad()
        pe_x = memory_transfer(node_embedding(x.long()))
        pe_y = memory_transfer(node_embedding(y.long()))
        loss = graph_embedding_criterion(pe_x, pe_y, manifold=manifold).sum()
        l1 += loss.item()
        loss.backward()
        optimizer_init.step()

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:27<00:00,  2.77s/it]


#### Second Phase

In [14]:
optimizer = rsgd.RSGD(list(node_embedding.parameters()) +
                      list(context_embedding.parameters()), LEARNING_RATE, manifold=manifold)

In [19]:
def memory_transfer(x): return x


for i in tqdm(range(EPOCHS)):

    l1 = 0.
    for x, y in dataloader_l1:

        optimizer.zero_grad()
        pe_x = node_embedding(memory_transfer(x.long()))
        pe_y = node_embedding(memory_transfer(y.long()))

        loss = graph_embedding_criterion(pe_x, pe_y, manifold=manifold).sum()
        l1 += loss.item()
        loss.backward()
        optimizer.step()

    l2 = 0.
    for x, y in dataloader_l2:
        optimizer.zero_grad()
        pe_x = memory_transfer(node_embedding(x.long()))
        pe_y = memory_transfer(context_embedding(y.long()))
        ne = context_embedding(memory_transfer(
            distribution.sample(sample_shape=(len(x), 10)))).detach()
        loss = graph_embedding_criterion(
            pe_x, pe_y, z=ne, manifold=manifold).sum()
        l2 += loss.item()
        loss.backward()
        optimizer.step()

    em_alg = PoincareEM(n_community)
    em_alg.fit(memory_transfer(node_embedding.weight.data))

    NF = em_alg.get_normalisation_coef()
    pi, mu, sigma = em_alg.get_parameters()

    pik = em_alg.get_pik(node_embedding.weight.data)

    l3 = 0.

    for x in dataloader_l3:
        optimizer.zero_grad()

        pe_x = node_embedding(memory_transfer(x[0].long()))
        wik = pik[memory_transfer(x[0].long())]
        loss = 1e-1 * graph_community_criterion(pe_x.squeeze(), wik.detach(
        ), mu.detach(), sigma.detach(), NF.detach(), manifold=manifold).sum()
        l3 += loss.item()
        loss.backward()
        optimizer.step()

output_np = np.array(node_embedding.weight.data)
output= node_embedding.weight.data

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:48<00:00,  4.90s/it]


## Clustering

### KMeans

In [23]:
kmeans = PoincareKMeans(n_clusters=n_community, random_seed=0)

centroids = kmeans.fit(output)
predicted_clusters = kmeans.predict(output)

nmi_score_kmedoids = evaluation.nmi(predicted_clusters, Y)

print("Number of communities (XMedoid):", len(set(predicted_clusters)))
print("Real Number of communities:", n_community)
print("NMI Score (XMedoid):", nmi_score_kmedoids)

Number of communities (XMedoid): 115
Real Number of communities: 12
NMI Score (XMedoid): 0.8974118393814873
