In [None]:
import pandas as pd
import networkx as nx
import joblib
import random
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("C:\\Users\\arodi\\Downloads\\Master_CSV(in).csv", low_memory=False)

original_graph = nx.Graph()
total_rows = df.shape[0]

for idx, row in df.iterrows():
    company_id = f"company_{row['Global_Id']}"
    original_graph.add_node(company_id, bipartite='company', label=row.get("Name", ""))

    for i in range(1, 51):
        col_name = f"Process_Capability_{i}"
        capability = row.get(col_name)

        if pd.notna(capability) and str(capability).strip():
            capability_node = f"pc_{capability}"
            original_graph.add_node(capability_node, bipartite='capability')
            original_graph.add_edge(company_id, capability_node)

    if idx % 1000 == 0 or idx == total_rows - 1:
        print(f"[INFO] Processed {idx + 1}/{total_rows} rows")

joblib.dump(original_graph, "original_graph.pkl")
print("Graph saved as original_graph.pkl")


In [None]:
import pandas as pd
import networkx as nx
import numpy as np
from karateclub import DeepWalk
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import time

print("[INFO] Loading CSV and constructing graph...")
df = pd.read_csv("C:\\Users\\arodi\\Downloads\\Master_CSV(in).csv",low_memory=False)
original_graph = nx.Graph()
total_rows = df.shape[0]

for idx, row in df.iterrows():
    company_id = f"company_{row['Global_Id']}"
    original_graph.add_node(company_id, bipartite='company', label=row.get("Name", ""))

    for i in range(1, 51):
        col_name = f"Process_Capability_{i}"
        capability = row.get(col_name)

        if pd.notna(capability) and str(capability).strip():
            capability_node = f"pc_{capability}"
            original_graph.add_node(capability_node, bipartite='capability')
            original_graph.add_edge(company_id, capability_node)

    if idx % 1000 == 0 or idx == total_rows - 1:
        print(f"[INFO] Processed {idx + 1}/{total_rows} rows")

print("reindex nodes")
node_list = list(original_graph.nodes())
name_to_id = {name: i for i, name in enumerate(node_list)}
id_to_name = {i: name for name, i in name_to_id.items()}

G = nx.relabel_nodes(original_graph, name_to_id)

print("deepwalk train")
start_time = time.time()

model = DeepWalk(walk_number=10, walk_length=80, dimensions=64, window_size=5, workers=4)
model.fit(G)

end_time = time.time()
print(f"training completed in {end_time - start_time:.2f} seconds.")

joblib.dump(model, "deepwalk_model.pkl")

embeddings = model.get_embedding()
embedding_dict = {id_to_name[i]: embeddings[i] for i in range(len(embeddings))}
joblib.dump(embedding_dict, "deepwalk_embeddings_dict.pkl")


In [1]:
import random
import numpy as np
import joblib
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx


embedding_dict = joblib.load("deepwalk_embeddings_dict.pkl")
all_nodes = list(embedding_dict.keys())

original_graph = joblib.load("original_graph.pkl")

all_edges = list(original_graph.edges())
random.shuffle(all_edges)
split_idx = int(0.8 * len(all_edges))
train_edges = all_edges[:split_idx]
test_edges = all_edges[split_idx:]

train_graph = nx.Graph()
train_graph.add_nodes_from(original_graph.nodes(data=True))
train_graph.add_edges_from(train_edges)

def generate_negative_edges(G, num_samples):
    neg_edges = set()
    while len(neg_edges) < num_samples:
        u, v = random.sample(all_nodes, 2)
        if not G.has_edge(u, v) and u != v:
            neg_edges.add((u, v))
    return list(neg_edges)

negative_test_edges = generate_negative_edges(original_graph, len(test_edges))

def edge_similarity(u, v, embeddings):
    return cosine_similarity([embeddings[u]], [embeddings[v]])[0][0]

y_true = []
y_score = []
#here we are doing link validation
for u, v in test_edges:
    if u in embedding_dict and v in embedding_dict:
        sim = edge_similarity(u, v, embedding_dict)
        y_true.append(1)
        y_score.append(sim)

for u, v in negative_test_edges:
    if u in embedding_dict and v in embedding_dict:
        sim = edge_similarity(u, v, embedding_dict)
        y_true.append(0)
        y_score.append(sim)

roc_auc = roc_auc_score(y_true, y_score)
ap_score = average_precision_score(y_true, y_score)

print(f"link Prediction ROC-AUC: {roc_auc:.4f}")
print(f"link Prediction Average Precision: {ap_score:.4f}")


link Prediction ROC-AUC: 0.2653
link Prediction Average Precision: 0.4389


Above was a naive attempt at using deepwalk to learn embeddings and to see if it can distinguish positive/negative edges. This was poor performance but was not expanded much off of.

As for below we are creating a graph autoencoder using the graph attention network to learn embeddings through attention. This has superior performance and is a good direction for link prediction.

In [6]:
import pandas as pd
import networkx as nx
import torch
from torch_geometric.utils import from_networkx
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.nn import GAE, GATConv
from sklearn.metrics import roc_auc_score, average_precision_score

print("build graph")
df = pd.read_csv("C:\\Users\\arodi\\Downloads\\Master_CSV(in).csv",
                 low_memory=False)
G = nx.Graph()
for _, row in df.iterrows():
    comp = f"company_{row['Global_Id']}"
    G.add_node(comp, bipartite=0)
    for i in range(1, 51):
        cap = row.get(f"Process_Capability_{i}")
        if pd.notna(cap) and cap.strip():
            pc = f"pc_{cap}"
            G.add_node(pc, bipartite=1)
            G.add_edge(comp, pc)
print(f"nodes {G.number_of_nodes()}, edges {G.number_of_edges()}")

#convert to pyg with one-hot features
print("converting data")
feat = {
    n: [1, 0] if d['bipartite'] == 0 else [0, 1]
    for n, d in G.nodes(data=True)
}
nx.set_node_attributes(G, feat, 'x')
data = from_networkx(G)
data.x = data.x.float()
print(f"x shape {data.x.shape}")
print(f"edge ind shape {data.edge_index.shape}")

print("randomlinksplit")
split = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True, 
    split_labels=True, 
    add_negative_train_samples=True,
    neg_sampling_ratio=1.0
)
train_data, val_data, test_data = split(data)

print(f"edge index shape {train_data.edge_index.shape}")
print(f"positive label {train_data.pos_edge_label_index.shape[1]}")
print(f"negative labels {train_data.neg_edge_label_index.shape[1]}")
print(f"val pos {val_data.pos_edge_label_index.shape[1]}, neg {val_data.neg_edge_label_index.shape[1]}")
print(f"test pos {test_data.pos_edge_label_index.shape[1]}, neg {test_data.neg_edge_label_index.shape[1]}")

print("GAT‑GAE model definition")
class GATEncoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAE(GATEncoder(
    in_channels=data.x.size(1),
    hidden_channels=32,
    out_channels=16,
    heads=4
)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
train_data = train_data.to(device)

print("training")
def train_one_epoch():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    loss = model.recon_loss(
        z,
        train_data.pos_edge_label_index,
        train_data.neg_edge_label_index
    )
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(1, 201):
    loss = train_one_epoch()
    if epoch == 1 or epoch % 10 == 0:
        print(f"epoch {epoch:02d} loss {loss:.4f}")

print("validation on 10% of data")
model.eval()
with torch.no_grad():
    z = model.encode(data.x.to(device), data.edge_index.to(device))

    pos_scores = model.decoder(z,test_data.pos_edge_label_index.to(device)).view(-1).cpu()
    neg_scores = model.decoder(z,test_data.neg_edge_label_index.to(device)).view(-1).cpu()

    y_true  = torch.cat([torch.ones_like(pos_scores),torch.zeros_like(neg_scores)]).numpy()
    y_score = torch.cat([pos_scores, neg_scores]).numpy()

    roc_auc = roc_auc_score(y_true, y_score)
    ap      = average_precision_score(y_true, y_score)
    print(f"\nLink‑prediction ROC‑AUC:       {roc_auc:.4f}")
    print(f"Link‑prediction Avg Precision:  {ap:.4f}")


build graph
nodes 21212, edges 65285
converting data
x shape torch.Size([21212, 2])
edge ind shape torch.Size([2, 130570])
randomlinksplit
edge index shape torch.Size([2, 104458])
positive label 52229
negative labels 52229
val pos 6528, neg 6528
test pos 6528, neg 6528
GAT‑GAE model definition
training
epoch 01 loss 1.4469
epoch 10 loss 1.3372
epoch 20 loss 1.1706
epoch 30 loss 1.1282
epoch 40 loss 1.1104
epoch 50 loss 1.1025
epoch 60 loss 1.0976
epoch 70 loss 1.0927
epoch 80 loss 1.0815
epoch 90 loss 1.0224
epoch 100 loss 0.9999
epoch 110 loss 0.9901
epoch 120 loss 0.9806
epoch 130 loss 0.9722
epoch 140 loss 0.9638
epoch 150 loss 0.9531
epoch 160 loss 0.9401
epoch 170 loss 0.9264
epoch 180 loss 0.9062
epoch 190 loss 0.8865
epoch 200 loss 0.8755
validation on 10% of data

Link‑prediction ROC‑AUC:       0.9867
Link‑prediction Avg Precision:  0.9866
