# Link Prediction

## Preparation

In [2]:
# Set the NX_CUGRAPH_AUTOCONFIG environment variable for this kernel session.
# NOTE(review): presumably this switches NetworkX to the nx-cugraph GPU backend and
# must run before `import networkx` to take effect — confirm against the nx-cugraph docs.
%env NX_CUGRAPH_AUTOCONFIG=True

env: NX_CUGRAPH_AUTOCONFIG=True


In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install igraph networkit pandas matplotlib seaborn networkx numpy scikit-learn tqdm ipywidgets

Collecting igraph
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.11.8 texttable-1.7.0


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import networkx as nx
import pickle
import random
import igraph as ig
import networkit as nk

from itertools import combinations
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Dataset Preparation

In [6]:
# Location of the pickled co-purchase graph, relative to the notebook's working directory.
pickle_file_path = 'dataset/amazon_copurchase_graph.pickle'

# SECURITY: pickle.load executes arbitrary code embedded in the file — only load
# pickles that come from a trusted source.
with open(pickle_file_path, mode='rb') as graph_file:
    G = pickle.load(graph_file)

# Print the graph summary line as a quick sanity check.
print(G)

DiGraph with 259102 nodes and 1207337 edges


### Split Dataset

In [9]:
# NetworKit copy of the graph, used for edge-existence checks while sampling negatives.
nkG = nk.nxadapter.nx2nk(G)

# Materialise the edge list once; the set form gives constant-time membership tests.
edges = list(G.edges)
existing_edges = set(edges)

# Sampling with the NetworKit graph (faster)
def sample_non_edges_nk(nkG, num_samples, nodes=None):
    """Sample distinct ordered node pairs that are not edges of ``nkG``.

    ``nx2nk`` renumbers nodes to consecutive ids ``0..n-1`` following the
    iteration order of the NetworkX graph, so edge lookups must be done with
    those positional ids rather than the original node labels. Pairs are
    therefore drawn as positions, checked against ``nkG``, and translated back
    to the original labels before being returned.

    Args:
        nkG: Graph object exposing ``hasEdge(u, v)`` on ids ``0..n-1``.
        num_samples: Number of distinct non-edge pairs to return.
        nodes: Node labels in the same order that was used to build ``nkG``.
            Defaults to the nodes of the module-level NetworkX graph ``G``.

    Returns:
        A list of ``num_samples`` distinct ``(u, v)`` label tuples with
        ``u != v`` for which ``nkG`` reports no edge.

    Raises:
        ValueError: If more pairs are requested than there are ordered pairs
            of distinct nodes (the request could never be satisfied).
    """
    if nodes is None:
        nodes = G.nodes()
    labels = list(nodes)
    n = len(labels)

    # Loose upper bound: guards against the most obvious never-terminating requests.
    if num_samples > n * (n - 1):
        raise ValueError(
            f"Cannot sample {num_samples} distinct non-edges from {n} nodes"
        )

    non_edges = set()
    while len(non_edges) < num_samples:
        i, j = random.sample(range(n), 2)
        # Look up the edge in NetworKit id space, store the pair in label space.
        if not nkG.hasEdge(i, j):
            non_edges.add((labels[i], labels[j]))

    return list(non_edges)

# Seed Python's RNG so the sampled negatives are reproducible across runs.
random.seed(42)

# One negative (non-edge) per positive edge keeps the two classes balanced.
num_samples = len(edges)
non_edges = sample_non_edges_nk(nkG, num_samples)

# Hold out 20% of the true edges for testing.
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# Partition the negatives into DISJOINT train/test sets of matching sizes.
# Two independent random.sample() draws from the same pool would let the same
# non-edge land in both sets and leak test examples into training.
train_non_edges, test_non_edges = train_test_split(
    non_edges, test_size=len(test_edges), random_state=42
)

# Training graph: every node, but only the training edges, stored undirected.
# NOTE(review): the loaded graph prints as a DiGraph while G_train is undirected, so a
# sampled non-edge (u, v) whose reverse (v, u) is a real edge may still be connected
# in G_train — confirm whether negatives should be checked in both directions.
G_train = nx.Graph()
G_train.add_nodes_from(G.nodes())
G_train.add_edges_from(train_edges)

print(f"Train Edges: {len(train_edges)}, Test Edges: {len(test_edges)}")
print(f"Train Non-Edges: {len(train_non_edges)}, Test Non-Edges: {len(test_non_edges)}")

Train Edges: 965869, Test Edges: 241468
Train Non-Edges: 965869, Test Non-Edges: 241468


In [11]:
# Evaluation metrics for the ranking formulation of the problem
def precision_at_k(y_true, y_scores, k):
    """Fraction of the ``k`` highest-scored items that are positive.

    Args:
        y_true: Array-like of labels (1 = positive, 0 = negative); lists are accepted.
        y_scores: Array-like of scores aligned with ``y_true``; higher ranks first.
        k: Number of top-ranked items to inspect (expected to be positive).

    Returns:
        Mean of the labels among the top ``k`` items, or ``0.0`` when that
        selection is empty (e.g. ``k == 0`` or empty input).
    """
    # Coerce so that fancy indexing below also works for plain Python lists.
    y_true = np.asarray(y_true)
    top_k = np.argsort(y_scores)[::-1][:k]
    if top_k.size == 0:
        # Avoid the nan (and RuntimeWarning) of taking the mean of nothing.
        return 0.0
    return np.mean(y_true[top_k])

def recall_at_k(y_true, y_scores, k):
    """Fraction of all positives that appear among the ``k`` highest-scored items.

    Args:
        y_true: Array-like of labels (1 = positive, 0 = negative); lists are accepted.
        y_scores: Array-like of scores aligned with ``y_true``; higher ranks first.
        k: Number of top-ranked items to inspect.

    Returns:
        Positives found in the top ``k`` divided by the total number of
        positives, or ``0.0`` when ``y_true`` contains no positives.
    """
    # Coerce so that fancy indexing below also works for plain Python lists.
    y_true = np.asarray(y_true)
    total_positives = np.sum(y_true)
    if total_positives == 0:
        # Nothing to recall; avoid a 0/0 division producing nan.
        return 0.0
    top_k = np.argsort(y_scores)[::-1][:k]
    return np.sum(y_true[top_k]) / total_positives

def mean_average_precision(y_true, y_scores):
    """Average precision of a single ranking induced by ``y_scores``.

    Items are ordered by descending score; the precision at each rank that
    holds a positive is summed and divided by the total number of positives.

    Args:
        y_true: Array-like of labels (1 = positive, 0 = negative); lists are accepted.
        y_scores: Array-like of scores aligned with ``y_true``; higher ranks first.

    Returns:
        The average precision, or ``0.0`` when ``y_true`` contains no positives.
    """
    # Coerce so that fancy indexing below also works for plain Python lists.
    y_true = np.asarray(y_true)
    total_positives = np.sum(y_true)
    if total_positives == 0:
        # No relevant items; avoid a 0/0 division producing nan.
        return 0.0

    ranked = y_true[np.argsort(y_scores)[::-1]]
    hits_so_far = np.cumsum(ranked)
    precision_at_i = hits_so_far / np.arange(1, len(ranked) + 1)
    # Multiplying by the labels keeps only the ranks that hold a positive.
    return np.sum(precision_at_i * ranked) / total_positives

def f1_beta_at_k(y_true, y_scores, k, beta=1):
    """F-beta score built from precision@k and recall@k.

    Args:
        y_true: Labels, forwarded to ``precision_at_k`` and ``recall_at_k``.
        y_scores: Scores, forwarded to ``precision_at_k`` and ``recall_at_k``.
        k: Cut-off rank forwarded to both metrics.
        beta: Weight of recall relative to precision (default 1, i.e. F1).

    Returns:
        ``(1 + beta^2) * P * R / (beta^2 * P + R)``, or ``0.0`` when the sum
        of precision and recall is zero.
    """
    p = precision_at_k(y_true, y_scores, k)
    r = recall_at_k(y_true, y_scores, k)

    # Guard clause: with both metrics at zero the formula would divide by zero.
    if p + r == 0:
        return 0.0

    b2 = beta ** 2
    numerator = (1 + b2) * (p * r)
    denominator = (b2 * p) + r
    return numerator / denominator



## Graph Embedding Link Prediction

In [None]:
# %pip uninstall torch pykeen
# %pip install torch --index-url https://download.pytorch.org/whl/cu126
!pip install pykeen

Collecting networkit
  Downloading networkit-11.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading networkit-11.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: networkit
Successfully installed networkit-11.1.post1


Setelah beberapa kali *tuning* parameter, model yang dibangun adalah sebagai berikut.

- Model TransE dengan embedding berdimensi 200
- Optimizer Adam dengan learning rate 0.01
- Loss function: MarginRankingLoss untuk pelatihan
- Batch size: 256 (pelatihan), 64 (evaluasi)
- Negative sampling: basic
- Regularisasi LP dengan bobot 0.01
- Dilatih selama 30 epoch di GPU

Hanya 40% dari data pelatihan yang digunakan untuk mempersingkat waktu pelatihan. Pembagian dataset pelatihan, validasi, dan pengujian adalah 70%, 15%, dan 15%.

In [None]:
import pandas as pd
import numpy as np
from pykeen.triples import TriplesFactory

# Convert edges to a suitable format for PyKEEN: (head, relation, tail) string triples.
triples = np.array(train_edges)

# Every triple gets the same placeholder relation label.
relation_placeholder = np.full((triples.shape[0], 1), "bought_with", dtype=object)
triples = np.column_stack((triples[:, 0], relation_placeholder, triples[:, 1]))
triples = triples.astype(str)

# Keep only the first 40% of the training triples.
num_samples = int(len(triples) * 0.4)

tf = TriplesFactory.from_labeled_triples(triples[:num_samples], create_inverse_triples=True)

# Fixed random_state so the 70/15/15 train/validation/test split is reproducible.
tf_train, tf_validation, tf_test = tf.split([0.7, 0.15, 0.15], random_state=42)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [48724, 57952, 57953]


In [30]:
from pykeen.pipeline import pipeline

# Define and train the model: TransE (200-dim embeddings), Adam, margin ranking loss.
result = pipeline(
    training=tf_train,
    testing=tf_test,
    validation=tf_validation,
    model='TransE',
    epochs=30,
    model_kwargs={'embedding_dim': 200},
    optimizer='Adam',
    optimizer_kwargs={'lr': 0.01},
    loss='MarginRankingLoss',
    training_kwargs={'batch_size': 256},
    negative_sampler='basic',
    regularizer='LP',
    regularizer_kwargs={'weight': 0.01},
    evaluator_kwargs={
        'filtered': True,
        'batch_size': 64
    },
    # Fixed seed so initialisation and negative sampling are reproducible across runs.
    random_seed=42,
)

# Evaluate the model: show the metrics computed by the pipeline as a DataFrame.
result.metric_results.to_df()

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2113 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/58.0k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 300.62s seconds


Unnamed: 0,Side,Rank_type,Metric,Value
0,head,optimistic,adjusted_arithmetic_mean_rank_index,0.015806
1,tail,optimistic,adjusted_arithmetic_mean_rank_index,0.022362
2,both,optimistic,adjusted_arithmetic_mean_rank_index,0.019084
3,head,realistic,adjusted_arithmetic_mean_rank_index,0.015805
4,tail,realistic,adjusted_arithmetic_mean_rank_index,0.022361
...,...,...,...,...
220,tail,realistic,adjusted_hits_at_k,-0.000007
221,both,realistic,adjusted_hits_at_k,0.000002
222,head,pessimistic,adjusted_hits_at_k,0.000010
223,tail,pessimistic,adjusted_hits_at_k,-0.000007


In [31]:
from pykeen.evaluation import RankBasedEvaluator

# Re-score the trained model on the test triples, passing the training and
# validation triples as additional filter triples.
evaluator = RankBasedEvaluator()
results = evaluator.evaluate(
    model=result.model,
    mapped_triples=tf_test.mapped_triples,
    batch_size=64,  # Adjust if necessary
    additional_filter_triples=[tf_train.mapped_triples, tf_validation.mapped_triples],
)

# Headline ranking metrics, printed as "<label>: <value>" in a fixed order.
for metric_label, metric_key in (
    ("Hits@1", "hits@1"),
    ("Hits@3", "hits@3"),
    ("Hits@5", "hits@5"),
    ("Hits@10", "hits@10"),
    ("Mean Reciprocal Rank", "mean_reciprocal_rank"),
):
    print(f"{metric_label}: {results.get_metric(metric_key)}")

Evaluating on cuda:0:   0%|          | 0.00/58.0k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 299.18s seconds


<pykeen.evaluation.rank_based_evaluator.RankBasedMetricResults at 0x7e915d0aba10>